http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9060
Revision: 9060
Author: xqt
Date: 2011-03-13 14:10:11 +0000 (Sun, 13 Mar 2011)
Log Message:
-----------
eol-style
Modified Paths:
--------------
branches/rewrite/scripts/redirect.py
Property Changed:
----------------
branches/rewrite/scripts/redirect.py
Modified: branches/rewrite/scripts/redirect.py
===================================================================
--- branches/rewrite/scripts/redirect.py 2011-03-13 14:06:08 UTC (rev 9059)
+++ branches/rewrite/scripts/redirect.py 2011-03-13 14:10:11 UTC (rev 9060)
@@ -1,708 +1,708 @@
-# -*- coding: utf-8 -*-
-"""
-Script to resolve double redirects, and to delete broken redirects. Requires
-access to MediaWiki's maintenance pages or to an XML dump file. Delete
-function requires adminship.
-
-Syntax:
-
- python redirect.py action [-arguments ...]
-
-where action can be one of these:
-
-double Fix redirects which point to other redirects
-broken Delete redirects where targets don\'t exist. Requires adminship.
-both Both of the above. Permitted only with -api. Implies -api.
-
-and arguments can be:
-
--moves Use the page move log to find double-redirect candidates. Only
- works with action "double".
-
--namespace:n Namespace to process. Can be given multiple times, for several
- namespaces. If omitted, only the main (article) namespace is
- treated.
-
--offset:n With -moves, the number of hours ago to start scanning moved
- pages. Otherwise, ignored.
-
--start:title The starting page title in each namespace. Page need not exist.
-
--until:title The possible last page title in each namespace. Page need not
- exist.
-
--total:n The maximum count of redirects to work upon. If omitted, there
- is no limit.
-
--always Don't prompt you for each replacement.
-
-"""
-
-# XML not yet implemented: deleted help text follows
-##-xml Retrieve information from a local XML dump
-## (http://download.wikimedia.org). Argument can also be given as
-## "-xml:filename.xml". Cannot be used with -api or -moves.
-## If neither of -xml -api -moves is given, info will be loaded
-## from a special page of the live wiki.
-
-#
-# (C) Daniel Herding, 2004.
-# (C) Purodha Blissenbach, 2009.
-# (C) xqt, 2009-2010
-# (C) Pywikipedia bot team, 2004-2010
-#
-# Distributed under the terms of the MIT license.
-#
-__version__='$Id: redirect.py 7789 2009-12-17 19:20:12Z xqt $'
-#
-import re, sys, datetime
-import pywikibot
-from pywikibot import config, i18n
-# import xmlreader
-
-
-class RedirectGenerator:
- def __init__(self, xmlFilename=None, namespaces=[], offset=-1,
- use_move_log=False, use_api=False, start=None, until=None,
- number=None):
- self.site = pywikibot.getSite()
-## self.xmlFilename = xmlFilename
- self.namespaces = namespaces
- if use_api and self.namespaces == []:
- self.namespaces = [ 0 ]
- self.offset = offset
- self.use_move_log = use_move_log
- self.use_api = use_api
- self.api_start = start
- self.api_until = until
- self.api_number = number
-
-# note: rewrite branch does not yet support XML dumps, so this is commented out
-# until that support is added
-## def get_redirects_from_dump(self, alsoGetPageTitles=False):
-## '''
-## Load a local XML dump file, look at all pages which have the
-## redirect flag set, and find out where they're pointing at. Return
-## a dictionary where the redirect names are the keys and the redirect
-## targets are the values.
-## '''
-## xmlFilename = self.xmlFilename
-## redict = {}
-## # open xml dump and read page titles out of it
-## dump = xmlreader.XmlDump(xmlFilename)
-## redirR = self.site.redirectRegex()
-## readPagesCount = 0
-## if alsoGetPageTitles:
-## pageTitles = set()
-## for entry in dump.parse():
-## readPagesCount += 1
-## # always print status message after 10000 pages
-## if readPagesCount % 10000 == 0:
-## pywikibot.output(u'%i pages read...' % readPagesCount)
-## if len(self.namespaces) > 0:
-## if pywikibot.Page(self.site, entry.title).namespace() \
-## not in self.namespaces:
-## continue
-## if alsoGetPageTitles:
-## pageTitles.add(entry.title.replace(' ', '_'))
-##
-## m = redirR.match(entry.text)
-## if m:
-## target = m.group(1)
-## # There might be redirects to another wiki. Ignore these.
-## for code in self.site.family.langs.keys():
-## if target.startswith('%s:' % code) \
-## or target.startswith(':%s:' % code):
-## if code == self.site.language():
-## # link to our wiki, but with the lang prefix
-## target = target[(len(code)+1):]
-## if target.startswith(':'):
-## target = target[1:]
-## else:
-## pywikibot.output(
-## u'NOTE: Ignoring %s which is a redirect to %s:'
-## % (entry.title, code))
-## target = None
-## break
-## # if the redirect does not link to another wiki
-## if target:
-## source = entry.title.replace(' ', '_')
-## target = target.replace(' ', '_')
-## # remove leading and trailing whitespace
-## target = target.strip('_')
-## # capitalize the first letter
-## if not pywikibot.getSite().nocapitalize:
-## source = source[:1].upper() + source[1:]
-## target = target[:1].upper() + target[1:]
-## if '#' in target:
-## target = target[:target.index('#')].rstrip("_")
-## if '|' in target:
-## pywikibot.output(
-## u'HINT: %s is a redirect with a pipelink.'
-## % entry.title)
-## target = target[:target.index('|')].rstrip("_")
-## if target: # in case preceding steps left nothing
-## redict[source] = target
-## if alsoGetPageTitles:
-## return redict, pageTitles
-## else:
-## return redict
-##
- def get_redirect_pages_via_api(self):
- """Return generator that yields
- Pages that are redirects.
-
- """
- for ns in self.namespaces:
- done = False
- gen = self.site.allpages(start=self.api_start,
- namespace=ns,
- filterredir=True)
- if self.api_number:
- gen.set_maximum_items(self.api_number)
- for p in gen:
- done = self.api_until \
- and p.title(withNamespace=False) >= self.api_until
- if done:
- return
- yield p
-
- def _next_redirect_group(self):
- """
- Return a generator that retrieves pageids from the API 500 at a time
- and yields them as a list
- """
- apiQ = []
- for page in self.get_redirect_pages_via_api():
- apiQ.append(str(page._pageid))
- if len(apiQ) >= 500:
- yield apiQ
- apiQ = []
- if apiQ:
- yield apiQ
-
-    def get_redirects_via_api(self, maxlen=8):
-        """
-        Return a generator that yields tuples of data about redirect Pages:
-            0 - page title of a redirect page
-            1 - type of redirect:
-                0 - broken redirect, target page title missing
-                1 - normal redirect, target page exists and is not a
-                    redirect
-                2..maxlen - start of a redirect chain of that many redirects
-                            (currently, the API seems not to return sufficient
-                            data to make these return values possible, but
-                            that may change)
-                maxlen+1 - start of an even longer chain, or a loop
-                           (currently, the API seems not to return sufficient
-                           data to allow these return values, but that may
-                           change)
-                None - start of a redirect chain of unknown length, or loop
-            2 - target page title of the redirect, or chain (may not exist)
-            3 - target page of the redirect, or end of chain, or page title
-                where chain or loop detection was halted, or None if unknown
-        """
- for apiQ in self._next_redirect_group():
- gen = pywikibot.data.api.Request(action="query", redirects="",
- pageids=apiQ)
- data = gen.submit()
- if 'error' in data:
- raise RuntimeError("API query error: %s" % data)
- if data == [] or 'query' not in data:
- raise RuntimeError("No results given.")
- redirects = {}
- pages = {}
- redirects = dict((x['from'], x['to'])
- for x in data['query']['redirects'])
-
- for pagetitle in data['query']['pages'].values():
- if 'missing' in pagetitle and 'pageid' not in pagetitle:
- pages[pagetitle['title']] = False
- else:
- pages[pagetitle['title']] = True
- for redirect in redirects:
- target = redirects[redirect]
- result = 0
- final = None
- try:
- if pages[target]:
- final = target
- try:
- while result <= maxlen:
- result += 1
- final = redirects[final]
- # result = None
- except KeyError:
- pass
- except KeyError:
- result = None
- pass
- yield (redirect, result, target, final)
-
- def retrieve_broken_redirects(self):
- if self.use_api:
- count = 0
- for (pagetitle, type, target, final) \
- in self.get_redirects_via_api(maxlen=2):
- if type == 0:
- yield pagetitle
- if self.api_number:
- count += 1
- if count >= self.api_number:
- break
-# TODO: add XML dump support
-## elif self.xmlFilename == None:
-## # retrieve information from the live wiki's maintenance page
-## # broken redirect maintenance page's URL
-## path = self.site.broken_redirects_address(default_limit=False)
-## pywikibot.output(u'Retrieving special page...')
-## maintenance_txt = self.site.getUrl(path)
-##
-## # regular expression which finds redirects which point to a
-## # non-existing page inside the HTML
-## Rredir = re.compile('\<li\>\<a href=".+?" title="(.*?)"')
-##
-## redir_names = Rredir.findall(maintenance_txt)
-## pywikibot.output(u'Retrieved %d redirects from special page.\n'
-## % len(redir_names))
-## for redir_name in redir_names:
-## yield redir_name
-## else:
-## # retrieve information from XML dump
-## pywikibot.output(
-## u'Getting a list of all redirects and of all page titles...')
-## redirs, pageTitles = self.get_redirects_from_dump(
-## alsoGetPageTitles=True)
-## for (key, value) in redirs.iteritems():
-## if value not in pageTitles:
-## yield key
-
- def retrieve_double_redirects(self):
- if self.use_move_log:
- for redir_page in self.get_moved_pages_redirects():
- yield redir_page.title()
- return
- else:
- count = 0
- for (pagetitle, type, target, final) \
- in self.get_redirects_via_api(maxlen=2):
- if type != 0 and type != 1:
- yield pagetitle
- if self.api_number:
- count += 1
- if count >= self.api_number:
- break
-
-# TODO: API cannot yet deliver contents of "special" pages
-## elif self.xmlFilename == None:
-## # retrieve information from the live wiki's maintenance page
-## # double redirect maintenance page's URL
-### pywikibot.config.special_page_limit = 1000
-## path = self.site.double_redirects_address(default_limit = False)
-## pywikibot.output(u'Retrieving special page...')
-## maintenance_txt = self.site.getUrl(path)
-##
-## # regular expression which finds redirects which point to
-## # another redirect inside the HTML
-## Rredir = re.compile('\<li\>\<a href=".+?" title="(.*?)">')
-## redir_names = Rredir.findall(maintenance_txt)
-## pywikibot.output(u'Retrieved %i redirects from special page.\n'
-## % len(redir_names))
-## for redir_name in redir_names:
-## yield redir_name
-## else:
-## redict = self.get_redirects_from_dump()
-## num = 0
-## for (key, value) in redict.iteritems():
-## num += 1
-## # check if the value - that is, the redirect target - is a
-## # redirect as well
-## if num > self.offset and value in redict:
-## yield key
-## pywikibot.output(u'\nChecking redirect %i of %i...'
-## % (num + 1, len(redict)))
-
- def get_moved_pages_redirects(self):
- '''generate redirects to recently-moved pages'''
- # this will run forever, until user interrupts it
-
- if self.offset <= 0:
- self.offset = 1
- start = datetime.datetime.utcnow() \
- - datetime.timedelta(0, self.offset*3600)
- # self.offset hours ago
- offset_time = start.strftime("%Y%m%d%H%M%S")
-
- move_gen = self.site.logevents(logtype="move", start=offset_time)
- if self.api_number:
- move_gen.set_maximum_items(self.api_number)
- for logentry in move_gen:
- moved_page = logentry.title()
- try:
- if not moved_page.isRedirectPage():
- continue
- except pywikibot.BadTitle:
- continue
- except pywikibot.ServerError:
- continue
- # moved_page is now a redirect, so any redirects pointing
- # to it need to be changed
- try:
- for page in moved_page.getReferences(follow_redirects=True,
- redirectsOnly=True):
- yield page
- except pywikibot.NoPage:
- # original title must have been deleted after move
- continue
-
-
-class RedirectRobot:
- def __init__(self, action, generator, always=False, number=None):
- self.site = pywikibot.getSite()
- self.action = action
- self.generator = generator
- self.always = always
- self.number = number
- self.exiting = False
-
- def prompt(self, question):
- if not self.always:
- choice = pywikibot.inputChoice(question,
- ['Yes', 'No', 'All', 'Quit'],
- ['y', 'N', 'a', 'q'], 'N')
- if choice == 'n':
- return False
- elif choice == 'q':
- self.exiting = True
- return False
- elif choice == 'a':
- self.always = True
- return True
-
- def delete_broken_redirects(self):
- # get reason for deletion text
- reason = i18n.twtranslate(self.site, 'redirect-remove-broken')
- for redir_name in self.generator.retrieve_broken_redirects():
- self.delete_1_broken_redirect(redir_name, reason)
- if self.exiting:
- break
-
- def delete_1_broken_redirect(self, redir_name, reason):
- redir_page = pywikibot.Page(self.site, redir_name)
- # Show the title of the page we're working on.
- # Highlight the title in purple.
- pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
- % redir_page.title())
- try:
- targetPage = redir_page.getRedirectTarget()
- except pywikibot.IsNotRedirectPage:
- pywikibot.output(u'%s is not a redirect.' % redir_page.title())
- except pywikibot.NoPage:
- pywikibot.output(u'%s doesn\'t exist.' % redir_page.title())
- else:
- try:
- targetPage.get()
- except pywikibot.NoPage:
- if self.prompt(
- u'Redirect target %s does not exist. Do you want to delete %s?'
- % (targetPage.title(asLink=True),
- redir_page.title(asLink=True))):
- try:
- redir_page.delete(reason, prompt = False)
- except pywikibot.NoUsername:
- if i18n.twhas_key(
- targetPage.site.lang,
- 'redirect-broken-redirect-template') and \
- i18n.twhas_key(targetPage.site.lang,
- 'redirect-remove-broken'):
- pywikibot.output(
- u"No sysop in user-config.py, put page to speedy deletion.")
- content = redir_page.get(get_redirect=True)
- ### TODO: Add bot's signature if needed
- ### Not supported via TW yet
- content = i18n.twtranslate(
- targetPage.site.lang,
- 'redirect-broken-redirect-template'
- ) + "\n" + content
- redir_page.put(content, reason)
- except pywikibot.IsRedirectPage:
- pywikibot.output(
- u'Redirect target %s is also a redirect! Won\'t delete anything.'
- % targetPage.title(asLink=True))
- else:
-                # we successfully got the target page, meaning that
-                # it exists and is not a redirect: no reason to touch it.
- pywikibot.output(
- u'Redirect target %s does exist! Won\'t delete anything.'
- % targetPage.title(asLink=True))
- pywikibot.output(u'')
-
- def fix_double_redirects(self):
- for redir_name in self.generator.retrieve_double_redirects():
- self.fix_1_double_redirect(redir_name)
- if self.exiting:
- break
-
- def fix_1_double_redirect(self, redir_name):
- redir = pywikibot.Page(self.site, redir_name)
- # Show the title of the page we're working on.
- # Highlight the title in purple.
- pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
- % redir.title())
- newRedir = redir
- redirList = [] # bookkeeping to detect loops
- while True:
- redirList.append(u'%s:%s' % (newRedir.site.lang,
- newRedir.title(withSection=False)))
- try:
- targetPage = newRedir.getRedirectTarget()
- except pywikibot.IsNotRedirectPage:
- if len(redirList) == 1:
- pywikibot.output(u'Skipping: Page %s is not a redirect.'
- % redir.title(asLink=True))
- break #do nothing
- elif len(redirList) == 2:
- pywikibot.output(
- u'Skipping: Redirect target %s is not a redirect.'
- % newRedir.title(asLink=True))
- break # do nothing
- else:
- pass # target found
- except pywikibot.SectionError:
- pywikibot.output(
- u'Warning: Redirect target section %s doesn\'t exist.'
- % newRedir.title(asLink=True))
- except pywikibot.CircularRedirect, e:
- pywikibot.warning(u"Skipping circular redirect: [[%s]]"
- % str(e))
- break
- except pywikibot.BadTitle, e:
- # str(e) is in the format 'BadTitle: [[Foo]]'
- pywikibot.output(
- u'Warning: Redirect target %s is not a valid page title.'
- % str(e)[10:])
- break
- except pywikibot.NoPage:
- if len(redirList) == 1:
- pywikibot.output(u'Skipping: Page %s does not exist.'
- % redir.title(asLink=True))
- break
- else:
- if self.always:
- pywikibot.output(
- u"Skipping: Redirect target %s doesn't exist."
- % newRedir.title(asLink=True))
- break # skip if automatic
- else:
- pywikibot.output(
- u"Warning: Redirect target %s doesn't exist."
- % newRedir.title(asLink=True))
- except pywikibot.ServerError:
- pywikibot.output(u'Skipping: Server Error')
- break
- else:
- pywikibot.output(
- u' Links to: %s.'
- % targetPage.title(asLink=True))
- if targetPage.site.sitename() == 'wikipedia:en':
- mw_msg = targetPage.site.mediawiki_message(
- 'wikieditor-toolbar-tool-redirect-example')
- if targetPage.title() == mw_msg:
- pywikibot.output(
- u"Skipping toolbar example: Redirect source is potentially vandalized.")
- break
- if targetPage.site != self.site:
- pywikibot.output(
- u'Warning: redirect target (%s) is on a different site.'
- % targetPage.title(asLink=True))
- if self.always:
- break # skip if automatic
- # watch out for redirect loops
- if redirList.count(u'%s:%s'
- % (targetPage.site.lang,
- targetPage.title(withSection=False))
- ) > 0:
- pywikibot.output(
- u'Warning: Redirect target %s forms a redirect loop.'
- % targetPage.title(asLink=True))
- break ### doesn't work. edits twice!
-## try:
-## content = targetPage.get(get_redirect=True)
-## except pywikibot.SectionError:
-## content = pywikibot.Page(
-## targetPage.site,
-## targetPage.title(withSection=False)
-## ).get(get_redirect=True)
-## if i18n.twhas_key(
-## targetPage.site.lang,
-## 'redirect-broken-redirect-template') and \
-## i18n.twhas_key(targetPage.site.lang,
-## 'redirect-remove-loop'):
-## pywikibot.output(u"Tagging redirect for deletion")
-## # Delete the two redirects
-## content = i18n.twtranslate(
-## targetPage.site.lang,
-## 'redirect-remove-loop',
-## ) + "\n" + content
-## summ = i18n.twtranslate(
-## targetPage.site.lang,
-## 'redirect-broken-redirect-template')
-## targetPage.put(content, summ)
-## redir.put(content, summ)
-## break # TODO Better implement loop redirect
- else: # redirect target found
- if targetPage.isStaticRedirect():
- pywikibot.output(
- u" Redirect target is STATICREDIRECT.")
- pass
- else:
- newRedir = targetPage
- continue
- try:
- oldText = redir.get(get_redirect=True)
- except pywikibot.BadTitle:
- pywikibot.output(u"Bad Title Error")
- break
- text = self.site.redirectRegex().sub(
- '#%s %s' % (self.site.redirect(True),
- targetPage.title(asLink=True)), oldText)
- if text == oldText:
- pywikibot.output(u"Note: Nothing left to do on %s"
- % redir.title(asLink=True))
- break
- summary = i18n.twtranslate(self.site, 'redirect-fix-double',
- {'to': targetPage.title(asLink=True)}
- )
- pywikibot.showDiff(oldText, text)
- if self.prompt(u'Do you want to accept the changes?'):
- try:
- redir.put(text, summary)
- except pywikibot.LockedPage:
- pywikibot.output(u'%s is locked.' % redir.title())
- except pywikibot.SpamfilterError, error:
- pywikibot.output(
- u"Saving page [[%s]] prevented by spam filter: %s"
- % (redir.title(), error.url))
- except pywikibot.PageNotSaved, error:
- pywikibot.output(u"Saving page [[%s]] failed: %s"
- % (redir.title(), error))
- except pywikibot.NoUsername:
- pywikibot.output(
- u"Page [[%s]] not saved; sysop privileges required."
- % redir.title())
- except pywikibot.Error, error:
- pywikibot.output(
- u"Unexpected error occurred trying to save [[%s]]: %s"
- % (redir.title(), error))
- break
-
- def fix_double_or_delete_broken_redirects(self):
- # TODO: part of this should be moved to generator, the rest merged into self.run()
- # get reason for deletion text
- delete_reason = i18n.twtranslate(self.site, 'redirect-remove-broken')
- count = 0
- for (redir_name, code, target, final)\
- in self.generator.get_redirects_via_api(maxlen=2):
- if code == 1:
- continue
- elif code == 0:
- self.delete_1_broken_redirect(redir_name, delete_reason)
- count += 1
- else:
- self.fix_1_double_redirect(redir_name)
- count += 1
- if self.exiting or (self.number and count >= self.number):
- break
-
- def run(self):
- # TODO: make all generators return a redirect type indicator,
- # thus make them usable with 'both'
- if self.action == 'double':
- self.fix_double_redirects()
- elif self.action == 'broken':
- self.delete_broken_redirects()
- elif self.action == 'both':
- self.fix_double_or_delete_broken_redirects()
-
-def main(*args):
- # read command line parameters
- # what the bot should do (either resolve double redirs, or delete broken
- # redirs)
- action = None
-    # where the bot should get its information from (either None to load the
- # maintenance special page from the live wiki, or the filename of a
- # local XML dump file)
- xmlFilename = None
-    # Which namespace should be processed when using an XML dump
- # default to -1 which means all namespaces will be processed
- namespaces = []
- # at which redirect shall we start searching double redirects again
- # (only with dump); default to -1 which means all redirects are checked
- offset = -1
- moved_pages = False
- api = True # rewrite always uses api, probably should get rid of this
- start = ''
- until = ''
- number = None
- always = False
- for arg in pywikibot.handleArgs(*args):
- if arg == 'double' or arg == 'do':
- action = 'double'
- elif arg == 'broken' or arg == 'br':
- action = 'broken'
- elif arg == 'both':
- action = 'both'
- elif arg.startswith('-xml'):
- if len(arg) == 4:
- xmlFilename = pywikibot.input(
- u'Please enter the XML dump\'s filename: ')
- else:
- xmlFilename = arg[5:]
- elif arg.startswith('-moves'):
- moved_pages = True
- elif arg.startswith('-namespace:'):
- ns = arg[11:]
- if ns == '':
- ## "-namespace:" does NOT yield -namespace:0 further down the road!
- ns = pywikibot.input(
- u'Please enter a namespace by its number: ')
-# u'Please enter a namespace by its name or number: ')
-# TODO! at least for some generators.
- if ns == '':
- ns = '0'
- try:
- ns = int(ns)
- except ValueError:
-#-namespace:all Process all namespaces. Works only with the API read interface.
- pass
-            if ns not in namespaces:
- namespaces.append(ns)
- elif arg.startswith('-offset:'):
- offset = int(arg[8:])
- elif arg.startswith('-start:'):
- start = arg[7:]
- elif arg.startswith('-until:'):
- until = arg[7:]
- elif arg.startswith('-total:'):
-            number = int(arg[7:])  # '-total:' is 7 characters long
- elif arg == '-always':
- always = True
- else:
- pywikibot.output(u'Unknown argument: %s' % arg)
-
- if xmlFilename:
- pywikibot.error(u"Sorry, xmlreader is not yet implemented in rewrite")
- elif not action: # or (xmlFilename and moved_pages)
- # or (api and xmlFilename):
- pywikibot.showHelp('redirect')
- else:
- gen = RedirectGenerator(xmlFilename, namespaces, offset, moved_pages,
- api, start, until, number)
- bot = RedirectRobot(action, gen, always, number)
- bot.run()
-
-if __name__ == '__main__':
- try:
- main()
- finally:
- pywikibot.stopme()
+# -*- coding: utf-8 -*-
+"""
+Script to resolve double redirects, and to delete broken redirects. Requires
+access to MediaWiki's maintenance pages or to an XML dump file. Delete
+function requires adminship.
+
+Syntax:
+
+ python redirect.py action [-arguments ...]
+
+where action can be one of these:
+
+double Fix redirects which point to other redirects
+broken Delete redirects where targets don\'t exist. Requires adminship.
+both Both of the above. Permitted only with -api. Implies -api.
+
+and arguments can be:
+
+-moves Use the page move log to find double-redirect candidates. Only
+ works with action "double".
+
+-namespace:n Namespace to process. Can be given multiple times, for several
+ namespaces. If omitted, only the main (article) namespace is
+ treated.
+
+-offset:n With -moves, the number of hours ago to start scanning moved
+ pages. Otherwise, ignored.
+
+-start:title The starting page title in each namespace. Page need not exist.
+
+-until:title The possible last page title in each namespace. Page need not
+ exist.
+
+-total:n The maximum count of redirects to work upon. If omitted, there
+ is no limit.
+
+-always Don't prompt you for each replacement.
+
+"""
+
+# XML not yet implemented: deleted help text follows
+##-xml Retrieve information from a local XML dump
+## (http://download.wikimedia.org). Argument can also be given as
+## "-xml:filename.xml". Cannot be used with -api or -moves.
+## If neither of -xml -api -moves is given, info will be loaded
+## from a special page of the live wiki.
+
+#
+# (C) Daniel Herding, 2004.
+# (C) Purodha Blissenbach, 2009.
+# (C) xqt, 2009-2010
+# (C) Pywikipedia bot team, 2004-2010
+#
+# Distributed under the terms of the MIT license.
+#
+__version__='$Id: redirect.py 7789 2009-12-17 19:20:12Z xqt $'
+#
+import re, sys, datetime
+import pywikibot
+from pywikibot import config, i18n
+# import xmlreader
+
+
+class RedirectGenerator:
+ def __init__(self, xmlFilename=None, namespaces=[], offset=-1,
+ use_move_log=False, use_api=False, start=None, until=None,
+ number=None):
+ self.site = pywikibot.getSite()
+## self.xmlFilename = xmlFilename
+ self.namespaces = namespaces
+ if use_api and self.namespaces == []:
+ self.namespaces = [ 0 ]
+ self.offset = offset
+ self.use_move_log = use_move_log
+ self.use_api = use_api
+ self.api_start = start
+ self.api_until = until
+ self.api_number = number
+
+# note: rewrite branch does not yet support XML dumps, so this is commented out
+# until that support is added
+## def get_redirects_from_dump(self, alsoGetPageTitles=False):
+## '''
+## Load a local XML dump file, look at all pages which have the
+## redirect flag set, and find out where they're pointing at. Return
+## a dictionary where the redirect names are the keys and the redirect
+## targets are the values.
+## '''
+## xmlFilename = self.xmlFilename
+## redict = {}
+## # open xml dump and read page titles out of it
+## dump = xmlreader.XmlDump(xmlFilename)
+## redirR = self.site.redirectRegex()
+## readPagesCount = 0
+## if alsoGetPageTitles:
+## pageTitles = set()
+## for entry in dump.parse():
+## readPagesCount += 1
+## # always print status message after 10000 pages
+## if readPagesCount % 10000 == 0:
+## pywikibot.output(u'%i pages read...' % readPagesCount)
+## if len(self.namespaces) > 0:
+## if pywikibot.Page(self.site, entry.title).namespace() \
+## not in self.namespaces:
+## continue
+## if alsoGetPageTitles:
+## pageTitles.add(entry.title.replace(' ', '_'))
+##
+## m = redirR.match(entry.text)
+## if m:
+## target = m.group(1)
+## # There might be redirects to another wiki. Ignore these.
+## for code in self.site.family.langs.keys():
+## if target.startswith('%s:' % code) \
+## or target.startswith(':%s:' % code):
+## if code == self.site.language():
+## # link to our wiki, but with the lang prefix
+## target = target[(len(code)+1):]
+## if target.startswith(':'):
+## target = target[1:]
+## else:
+## pywikibot.output(
+## u'NOTE: Ignoring %s which is a redirect to %s:'
+## % (entry.title, code))
+## target = None
+## break
+## # if the redirect does not link to another wiki
+## if target:
+## source = entry.title.replace(' ', '_')
+## target = target.replace(' ', '_')
+## # remove leading and trailing whitespace
+## target = target.strip('_')
+## # capitalize the first letter
+## if not pywikibot.getSite().nocapitalize:
+## source = source[:1].upper() + source[1:]
+## target = target[:1].upper() + target[1:]
+## if '#' in target:
+## target = target[:target.index('#')].rstrip("_")
+## if '|' in target:
+## pywikibot.output(
+## u'HINT: %s is a redirect with a pipelink.'
+## % entry.title)
+## target = target[:target.index('|')].rstrip("_")
+## if target: # in case preceding steps left nothing
+## redict[source] = target
+## if alsoGetPageTitles:
+## return redict, pageTitles
+## else:
+## return redict
+##
+ def get_redirect_pages_via_api(self):
+ """Return generator that yields
+ Pages that are redirects.
+
+ """
+ for ns in self.namespaces:
+ done = False
+ gen = self.site.allpages(start=self.api_start,
+ namespace=ns,
+ filterredir=True)
+ if self.api_number:
+ gen.set_maximum_items(self.api_number)
+ for p in gen:
+ done = self.api_until \
+ and p.title(withNamespace=False) >= self.api_until
+ if done:
+ return
+ yield p
+
+ def _next_redirect_group(self):
+ """
+ Return a generator that retrieves pageids from the API 500 at a time
+ and yields them as a list
+ """
+ apiQ = []
+ for page in self.get_redirect_pages_via_api():
+ apiQ.append(str(page._pageid))
+ if len(apiQ) >= 500:
+ yield apiQ
+ apiQ = []
+ if apiQ:
+ yield apiQ
+
+    def get_redirects_via_api(self, maxlen=8):
+        """
+        Return a generator that yields tuples of data about redirect Pages:
+            0 - page title of a redirect page
+            1 - type of redirect:
+                0 - broken redirect, target page title missing
+                1 - normal redirect, target page exists and is not a
+                    redirect
+                2..maxlen - start of a redirect chain of that many redirects
+                            (currently, the API seems not to return sufficient
+                            data to make these return values possible, but
+                            that may change)
+                maxlen+1 - start of an even longer chain, or a loop
+                           (currently, the API seems not to return sufficient
+                           data to allow these return values, but that may
+                           change)
+                None - start of a redirect chain of unknown length, or loop
+            2 - target page title of the redirect, or chain (may not exist)
+            3 - target page of the redirect, or end of chain, or page title
+                where chain or loop detection was halted, or None if unknown
+        """
+ for apiQ in self._next_redirect_group():
+ gen = pywikibot.data.api.Request(action="query", redirects="",
+ pageids=apiQ)
+ data = gen.submit()
+ if 'error' in data:
+ raise RuntimeError("API query error: %s" % data)
+ if data == [] or 'query' not in data:
+ raise RuntimeError("No results given.")
+ redirects = {}
+ pages = {}
+ redirects = dict((x['from'], x['to'])
+ for x in data['query']['redirects'])
+
+ for pagetitle in data['query']['pages'].values():
+ if 'missing' in pagetitle and 'pageid' not in pagetitle:
+ pages[pagetitle['title']] = False
+ else:
+ pages[pagetitle['title']] = True
+ for redirect in redirects:
+ target = redirects[redirect]
+ result = 0
+ final = None
+ try:
+ if pages[target]:
+ final = target
+ try:
+ while result <= maxlen:
+ result += 1
+ final = redirects[final]
+ # result = None
+ except KeyError:
+ pass
+ except KeyError:
+ result = None
+ pass
+ yield (redirect, result, target, final)
+
+ def retrieve_broken_redirects(self):
+ if self.use_api:
+ count = 0
+ for (pagetitle, type, target, final) \
+ in self.get_redirects_via_api(maxlen=2):
+ if type == 0:
+ yield pagetitle
+ if self.api_number:
+ count += 1
+ if count >= self.api_number:
+ break
+# TODO: add XML dump support
+## elif self.xmlFilename == None:
+## # retrieve information from the live wiki's maintenance page
+## # broken redirect maintenance page's URL
+## path = self.site.broken_redirects_address(default_limit=False)
+## pywikibot.output(u'Retrieving special page...')
+## maintenance_txt = self.site.getUrl(path)
+##
+## # regular expression which finds redirects which point to a
+## # non-existing page inside the HTML
+## Rredir = re.compile('\<li\>\<a href=".+?" title="(.*?)"')
+##
+## redir_names = Rredir.findall(maintenance_txt)
+## pywikibot.output(u'Retrieved %d redirects from special page.\n'
+## % len(redir_names))
+## for redir_name in redir_names:
+## yield redir_name
+## else:
+## # retrieve information from XML dump
+## pywikibot.output(
+## u'Getting a list of all redirects and of all page titles...')
+## redirs, pageTitles = self.get_redirects_from_dump(
+## alsoGetPageTitles=True)
+## for (key, value) in redirs.iteritems():
+## if value not in pageTitles:
+## yield key
+
+ def retrieve_double_redirects(self):
+ if self.use_move_log:
+ for redir_page in self.get_moved_pages_redirects():
+ yield redir_page.title()
+ return
+ else:
+ count = 0
+ for (pagetitle, type, target, final) \
+ in self.get_redirects_via_api(maxlen=2):
+ if type != 0 and type != 1:
+ yield pagetitle
+ if self.api_number:
+ count += 1
+ if count >= self.api_number:
+ break
+
+# TODO: API cannot yet deliver contents of "special" pages
+## elif self.xmlFilename == None:
+## # retrieve information from the live wiki's maintenance page
+## # double redirect maintenance page's URL
+### pywikibot.config.special_page_limit = 1000
+## path = self.site.double_redirects_address(default_limit = False)
+## pywikibot.output(u'Retrieving special page...')
+## maintenance_txt = self.site.getUrl(path)
+##
+## # regular expression which finds redirects which point to
+## # another redirect inside the HTML
+## Rredir = re.compile('\<li\>\<a href=".+?" title="(.*?)">')
+## redir_names = Rredir.findall(maintenance_txt)
+## pywikibot.output(u'Retrieved %i redirects from special page.\n'
+## % len(redir_names))
+## for redir_name in redir_names:
+## yield redir_name
+## else:
+## redict = self.get_redirects_from_dump()
+## num = 0
+## for (key, value) in redict.iteritems():
+## num += 1
+## # check if the value - that is, the redirect target - is a
+## # redirect as well
+## if num > self.offset and value in redict:
+## yield key
+## pywikibot.output(u'\nChecking redirect %i of %i...'
+## % (num + 1, len(redict)))
+
+ def get_moved_pages_redirects(self):
+ '''generate redirects to recently-moved pages'''
+ # this will run forever, until user interrupts it
+
+ if self.offset <= 0:
+ self.offset = 1
+ start = datetime.datetime.utcnow() \
+ - datetime.timedelta(0, self.offset*3600)
+ # self.offset hours ago
+ offset_time = start.strftime("%Y%m%d%H%M%S")
+
+ move_gen = self.site.logevents(logtype="move", start=offset_time)
+ if self.api_number:
+ move_gen.set_maximum_items(self.api_number)
+ for logentry in move_gen:
+ moved_page = logentry.title()
+ try:
+ if not moved_page.isRedirectPage():
+ continue
+ except pywikibot.BadTitle:
+ continue
+ except pywikibot.ServerError:
+ continue
+ # moved_page is now a redirect, so any redirects pointing
+ # to it need to be changed
+ try:
+ for page in moved_page.getReferences(follow_redirects=True,
+ redirectsOnly=True):
+ yield page
+ except pywikibot.NoPage:
+ # original title must have been deleted after move
+ continue
+
+
+class RedirectRobot:
+ def __init__(self, action, generator, always=False, number=None):
+ self.site = pywikibot.getSite()
+ self.action = action
+ self.generator = generator
+ self.always = always
+ self.number = number
+ self.exiting = False
+
+ def prompt(self, question):
+ if not self.always:
+ choice = pywikibot.inputChoice(question,
+ ['Yes', 'No', 'All', 'Quit'],
+ ['y', 'N', 'a', 'q'], 'N')
+ if choice == 'n':
+ return False
+ elif choice == 'q':
+ self.exiting = True
+ return False
+ elif choice == 'a':
+ self.always = True
+ return True
+
+ def delete_broken_redirects(self):
+ # get reason for deletion text
+ reason = i18n.twtranslate(self.site, 'redirect-remove-broken')
+ for redir_name in self.generator.retrieve_broken_redirects():
+ self.delete_1_broken_redirect(redir_name, reason)
+ if self.exiting:
+ break
+
+ def delete_1_broken_redirect(self, redir_name, reason):
+ redir_page = pywikibot.Page(self.site, redir_name)
+ # Show the title of the page we're working on.
+ # Highlight the title in purple.
+ pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
+ % redir_page.title())
+ try:
+ targetPage = redir_page.getRedirectTarget()
+ except pywikibot.IsNotRedirectPage:
+ pywikibot.output(u'%s is not a redirect.' % redir_page.title())
+ except pywikibot.NoPage:
+ pywikibot.output(u'%s doesn\'t exist.' % redir_page.title())
+ else:
+ try:
+ targetPage.get()
+ except pywikibot.NoPage:
+ if self.prompt(
+ u'Redirect target %s does not exist. Do you want to delete %s?'
+ % (targetPage.title(asLink=True),
+ redir_page.title(asLink=True))):
+ try:
+ redir_page.delete(reason, prompt = False)
+ except pywikibot.NoUsername:
+ if i18n.twhas_key(
+ targetPage.site.lang,
+ 'redirect-broken-redirect-template') and \
+ i18n.twhas_key(targetPage.site.lang,
+ 'redirect-remove-broken'):
+ pywikibot.output(
+ u"No sysop in user-config.py, put page to speedy deletion.")
+ content = redir_page.get(get_redirect=True)
+ ### TODO: Add bot's signature if needed
+ ### Not supported via TW yet
+ content = i18n.twtranslate(
+ targetPage.site.lang,
+ 'redirect-broken-redirect-template'
+ ) + "\n" + content
+ redir_page.put(content, reason)
+ except pywikibot.IsRedirectPage:
+ pywikibot.output(
+ u'Redirect target %s is also a redirect! Won\'t delete anything.'
+ % targetPage.title(asLink=True))
+ else:
+                # we successfully got the target page, meaning that
+                # it exists and is not a redirect: no reason to touch it.
+ pywikibot.output(
+ u'Redirect target %s does exist! Won\'t delete anything.'
+ % targetPage.title(asLink=True))
+ pywikibot.output(u'')
+
+ def fix_double_redirects(self):
+ for redir_name in self.generator.retrieve_double_redirects():
+ self.fix_1_double_redirect(redir_name)
+ if self.exiting:
+ break
+
+ def fix_1_double_redirect(self, redir_name):
+ redir = pywikibot.Page(self.site, redir_name)
+ # Show the title of the page we're working on.
+ # Highlight the title in purple.
+ pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
+ % redir.title())
+ newRedir = redir
+ redirList = [] # bookkeeping to detect loops
+ while True:
+ redirList.append(u'%s:%s' % (newRedir.site.lang,
+ newRedir.title(withSection=False)))
+ try:
+ targetPage = newRedir.getRedirectTarget()
+ except pywikibot.IsNotRedirectPage:
+ if len(redirList) == 1:
+ pywikibot.output(u'Skipping: Page %s is not a redirect.'
+ % redir.title(asLink=True))
+ break #do nothing
+ elif len(redirList) == 2:
+ pywikibot.output(
+ u'Skipping: Redirect target %s is not a redirect.'
+ % newRedir.title(asLink=True))
+ break # do nothing
+ else:
+ pass # target found
+ except pywikibot.SectionError:
+ pywikibot.output(
+ u'Warning: Redirect target section %s doesn\'t exist.'
+ % newRedir.title(asLink=True))
+ except pywikibot.CircularRedirect, e:
+ pywikibot.warning(u"Skipping circular redirect: [[%s]]"
+ % str(e))
+ break
+ except pywikibot.BadTitle, e:
+ # str(e) is in the format 'BadTitle: [[Foo]]'
+ pywikibot.output(
+ u'Warning: Redirect target %s is not a valid page title.'
+ % str(e)[10:])
+ break
+ except pywikibot.NoPage:
+ if len(redirList) == 1:
+ pywikibot.output(u'Skipping: Page %s does not exist.'
+ % redir.title(asLink=True))
+ break
+ else:
+ if self.always:
+ pywikibot.output(
+ u"Skipping: Redirect target %s doesn't exist."
+ % newRedir.title(asLink=True))
+ break # skip if automatic
+ else:
+ pywikibot.output(
+ u"Warning: Redirect target %s doesn't exist."
+ % newRedir.title(asLink=True))
+ except pywikibot.ServerError:
+ pywikibot.output(u'Skipping: Server Error')
+ break
+ else:
+ pywikibot.output(
+ u' Links to: %s.'
+ % targetPage.title(asLink=True))
+ if targetPage.site.sitename() == 'wikipedia:en':
+ mw_msg = targetPage.site.mediawiki_message(
+ 'wikieditor-toolbar-tool-redirect-example')
+ if targetPage.title() == mw_msg:
+ pywikibot.output(
+ u"Skipping toolbar example: Redirect source is potentially vandalized.")
+ break
+ if targetPage.site != self.site:
+ pywikibot.output(
+ u'Warning: redirect target (%s) is on a different site.'
+ % targetPage.title(asLink=True))
+ if self.always:
+ break # skip if automatic
+ # watch out for redirect loops
+ if redirList.count(u'%s:%s'
+ % (targetPage.site.lang,
+ targetPage.title(withSection=False))
+ ) > 0:
+ pywikibot.output(
+ u'Warning: Redirect target %s forms a redirect loop.'
+ % targetPage.title(asLink=True))
+ break ### doesn't work. edits twice!
+## try:
+## content = targetPage.get(get_redirect=True)
+## except pywikibot.SectionError:
+## content = pywikibot.Page(
+## targetPage.site,
+## targetPage.title(withSection=False)
+## ).get(get_redirect=True)
+## if i18n.twhas_key(
+## targetPage.site.lang,
+## 'redirect-broken-redirect-template') and \
+## i18n.twhas_key(targetPage.site.lang,
+## 'redirect-remove-loop'):
+## pywikibot.output(u"Tagging redirect for deletion")
+## # Delete the two redirects
+## content = i18n.twtranslate(
+## targetPage.site.lang,
+## 'redirect-remove-loop',
+## ) + "\n" + content
+## summ = i18n.twtranslate(
+## targetPage.site.lang,
+## 'redirect-broken-redirect-template')
+## targetPage.put(content, summ)
+## redir.put(content, summ)
+## break # TODO Better implement loop redirect
+ else: # redirect target found
+ if targetPage.isStaticRedirect():
+ pywikibot.output(
+ u" Redirect target is STATICREDIRECT.")
+ pass
+ else:
+ newRedir = targetPage
+ continue
+ try:
+ oldText = redir.get(get_redirect=True)
+ except pywikibot.BadTitle:
+ pywikibot.output(u"Bad Title Error")
+ break
+ text = self.site.redirectRegex().sub(
+ '#%s %s' % (self.site.redirect(True),
+ targetPage.title(asLink=True)), oldText)
+ if text == oldText:
+ pywikibot.output(u"Note: Nothing left to do on %s"
+ % redir.title(asLink=True))
+ break
+ summary = i18n.twtranslate(self.site, 'redirect-fix-double',
+ {'to': targetPage.title(asLink=True)}
+ )
+ pywikibot.showDiff(oldText, text)
+ if self.prompt(u'Do you want to accept the changes?'):
+ try:
+ redir.put(text, summary)
+ except pywikibot.LockedPage:
+ pywikibot.output(u'%s is locked.' % redir.title())
+ except pywikibot.SpamfilterError, error:
+ pywikibot.output(
+ u"Saving page [[%s]] prevented by spam filter: %s"
+ % (redir.title(), error.url))
+ except pywikibot.PageNotSaved, error:
+ pywikibot.output(u"Saving page [[%s]] failed: %s"
+ % (redir.title(), error))
+ except pywikibot.NoUsername:
+ pywikibot.output(
+ u"Page [[%s]] not saved; sysop privileges required."
+ % redir.title())
+ except pywikibot.Error, error:
+ pywikibot.output(
+ u"Unexpected error occurred trying to save [[%s]]: %s"
+ % (redir.title(), error))
+ break
+
+ def fix_double_or_delete_broken_redirects(self):
+ # TODO: part of this should be moved to generator, the rest merged into self.run()
+ # get reason for deletion text
+ delete_reason = i18n.twtranslate(self.site, 'redirect-remove-broken')
+ count = 0
+ for (redir_name, code, target, final)\
+ in self.generator.get_redirects_via_api(maxlen=2):
+ if code == 1:
+ continue
+ elif code == 0:
+ self.delete_1_broken_redirect(redir_name, delete_reason)
+ count += 1
+ else:
+ self.fix_1_double_redirect(redir_name)
+ count += 1
+ if self.exiting or (self.number and count >= self.number):
+ break
+
+ def run(self):
+ # TODO: make all generators return a redirect type indicator,
+ # thus make them usable with 'both'
+ if self.action == 'double':
+ self.fix_double_redirects()
+ elif self.action == 'broken':
+ self.delete_broken_redirects()
+ elif self.action == 'both':
+ self.fix_double_or_delete_broken_redirects()
+
+def main(*args):
+ # read command line parameters
+ # what the bot should do (either resolve double redirs, or delete broken
+ # redirs)
+ action = None
+    # where the bot should get its information from (either None to load the
+ # maintenance special page from the live wiki, or the filename of a
+ # local XML dump file)
+ xmlFilename = None
+    # Which namespace should be processed when using an XML dump
+ # default to -1 which means all namespaces will be processed
+ namespaces = []
+ # at which redirect shall we start searching double redirects again
+ # (only with dump); default to -1 which means all redirects are checked
+ offset = -1
+ moved_pages = False
+ api = True # rewrite always uses api, probably should get rid of this
+ start = ''
+ until = ''
+ number = None
+ always = False
+ for arg in pywikibot.handleArgs(*args):
+ if arg == 'double' or arg == 'do':
+ action = 'double'
+ elif arg == 'broken' or arg == 'br':
+ action = 'broken'
+ elif arg == 'both':
+ action = 'both'
+ elif arg.startswith('-xml'):
+ if len(arg) == 4:
+ xmlFilename = pywikibot.input(
+ u'Please enter the XML dump\'s filename: ')
+ else:
+ xmlFilename = arg[5:]
+ elif arg.startswith('-moves'):
+ moved_pages = True
+ elif arg.startswith('-namespace:'):
+ ns = arg[11:]
+ if ns == '':
+ ## "-namespace:" does NOT yield -namespace:0 further down the road!
+ ns = pywikibot.input(
+ u'Please enter a namespace by its number: ')
+# u'Please enter a namespace by its name or number: ')
+# TODO! at least for some generators.
+ if ns == '':
+ ns = '0'
+ try:
+ ns = int(ns)
+ except ValueError:
+#-namespace:all Process all namespaces. Works only with the API read interface.
+ pass
+            if ns not in namespaces:
+ namespaces.append(ns)
+ elif arg.startswith('-offset:'):
+ offset = int(arg[8:])
+ elif arg.startswith('-start:'):
+ start = arg[7:]
+ elif arg.startswith('-until:'):
+ until = arg[7:]
+ elif arg.startswith('-total:'):
+            number = int(arg[7:])  # '-total:' is 7 characters long
+ elif arg == '-always':
+ always = True
+ else:
+ pywikibot.output(u'Unknown argument: %s' % arg)
+
+ if xmlFilename:
+ pywikibot.error(u"Sorry, xmlreader is not yet implemented in rewrite")
+ elif not action: # or (xmlFilename and moved_pages)
+ # or (api and xmlFilename):
+ pywikibot.showHelp('redirect')
+ else:
+ gen = RedirectGenerator(xmlFilename, namespaces, offset, moved_pages,
+ api, start, until, number)
+ bot = RedirectRobot(action, gen, always, number)
+ bot.run()
+
+if __name__ == '__main__':
+ try:
+ main()
+ finally:
+ pywikibot.stopme()
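
The _next_redirect_group() method in the listing above buffers page ids and
flushes them to the API 500 at a time, the MediaWiki maximum for clients with
the apihighlimits right. A minimal standalone sketch of that chunking pattern
(the batched() name is invented here; the 500 figure and the Request call are
taken from the script):

    def batched(items, size=500):
        # Accumulate items from any iterable and yield them in lists
        # of at most `size`; the final batch may be shorter.
        batch = []
        for item in items:
            batch.append(item)
            if len(batch) >= size:
                yield batch
                batch = []
        if batch:
            yield batch

    # One query per id group, as in get_redirects_via_api():
    # for group in batched(str(p._pageid) for p in redirect_pages):
    #     req = pywikibot.data.api.Request(action="query",
    #                                      redirects="", pageids=group)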
Property changes on: branches/rewrite/scripts/redirect.py
___________________________________________________________________
Added: svn:eol-style
+ native
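
For readers tracing get_redirects_via_api() in the diff above: the method
turns the API's 'redirects' answer into a from/to mapping, then follows each
entry hop by hop until the title drops out of the mapping, and the number of
hops becomes the classification code documented in its docstring (0 broken,
1 normal, 2 and up the start of a chain). A rough self-contained sketch of
that walk over plain dicts (classify() and the sample data are invented for
illustration):

    def classify(redirects, pages, maxlen=8):
        # redirects maps source title -> target title;
        # pages maps title -> True (exists) or False (missing).
        for source, target in redirects.items():
            if target not in pages:
                yield (source, None, target, None)   # nothing known
                continue
            code, final = 0, None                    # 0 = broken redirect
            if pages[target]:
                code, final = 1, target              # at least a normal redirect
                while final in redirects and code <= maxlen:
                    code += 1                        # one more hop in the chain
                    final = redirects[final]
            yield (source, code, target, final)

    # classify({'A': 'B', 'B': 'C'}, {'B': True, 'C': True}) yields
    # ('A', 2, 'B', 'C')  - a double redirect -  and ('B', 1, 'C', 'C').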
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9056
Revision: 9056
Author: xqt
Date: 2011-03-13 12:46:39 +0000 (Sun, 13 Mar 2011)
Log Message:
-----------
eol-style
Modified Paths:
--------------
branches/rewrite/COPYING
branches/rewrite/INSTALL
branches/rewrite/distribute_setup.py
Property Changed:
----------------
branches/rewrite/COPYING
branches/rewrite/INSTALL
branches/rewrite/README
branches/rewrite/distribute_setup.py
Modified: branches/rewrite/COPYING
===================================================================
--- branches/rewrite/COPYING 2011-03-13 12:39:28 UTC (rev 9055)
+++ branches/rewrite/COPYING 2011-03-13 12:46:39 UTC (rev 9056)
@@ -1,23 +1,23 @@
-Copyright (c) 2004-2010 Pywikipedia bot team
-
-Permission is hereby granted, free of charge, to any person
-obtaining a copy of this software and associated documentation
-files (the "Software"), to deal in the Software without
-restriction, including without limitation the rights to use,
-copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the
-Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
-
+Copyright (c) 2004-2010 Pywikipedia bot team
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
Property changes on: branches/rewrite/COPYING
___________________________________________________________________
Added: svn:eol-style
+ native
Modified: branches/rewrite/INSTALL
===================================================================
--- branches/rewrite/INSTALL 2011-03-13 12:39:28 UTC (rev 9055)
+++ branches/rewrite/INSTALL 2011-03-13 12:46:39 UTC (rev 9056)
@@ -1,21 +1,21 @@
-To install the Pywikipediabot framework:
-
-1) Extract/unzip this package to a directory on your computer.
-2) Open a command prompt in that directory, and run the command:
-
- python setup.py install
-
-This will install the package to the "site-packages" directory of your Python
-installation. Linux/Unix users may need to add "sudo" at the beginning of the
-command line to get access to the site-packages directory; or, they can
-install the package to a different location by running:
-
- python setup.py install --home=/path/to/location
-
-where "/path/to/location" is the directory that you want the Pywikipediabot
-files installed to.
-
-Note: You will need to have an active Internet connection when you run the
-setup script, in case the installer needs to download other modules required
-by the framework.
-
+To install the Pywikipediabot framework:
+
+1) Extract/unzip this package to a directory on your computer.
+2) Open a command prompt in that directory, and run the command:
+
+ python setup.py install
+
+This will install the package to the "site-packages" directory of your Python
+installation. Linux/Unix users may need to add "sudo" at the beginning of the
+command line to get access to the site-packages directory; or, they can
+install the package to a different location by running:
+
+ python setup.py install --home=/path/to/location
+
+where "/path/to/location" is the directory that you want the Pywikipediabot
+files installed to.
+
+Note: You will need to have an active Internet connection when you run the
+setup script, in case the installer needs to download other modules required
+by the framework.
+
Property changes on: branches/rewrite/INSTALL
___________________________________________________________________
Added: svn:eol-style
+ native
Property changes on: branches/rewrite/README
___________________________________________________________________
Added: svn:eol-style
+ native
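
Both revisions in this digest change only the svn:eol-style property (hence
the one-word log message), which is presumably why each file appears in the
diff as fully removed and re-added even though its text is unchanged: every
line ending gets renormalized when the property is applied. With a stock
Subversion client the same property can be set and inspected like this
(paths taken from the commit):

    svn propset svn:eol-style native branches/rewrite/COPYING
    svn proplist -v branches/rewrite/COPYING

With svn:eol-style native, the repository stores the file with LF line
endings and each working copy checks it out using the local platform's
convention.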
Modified: branches/rewrite/distribute_setup.py
===================================================================
--- branches/rewrite/distribute_setup.py 2011-03-13 12:39:28 UTC (rev 9055)
+++ branches/rewrite/distribute_setup.py 2011-03-13 12:46:39 UTC (rev 9056)
@@ -1,477 +1,477 @@
-#!python
-"""Bootstrap distribute installation
-
-If you want to use setuptools in your package's setup.py, just include this
-file in the same directory with it, and add this to the top of your setup.py::
-
- from distribute_setup import use_setuptools
- use_setuptools()
-
-If you want to require a specific version of setuptools, set a download
-mirror, or use an alternate download directory, you can do so by supplying
-the appropriate options to ``use_setuptools()``.
-
-This file can also be run as a script to install or upgrade setuptools.
-"""
-import os
-import sys
-import time
-import fnmatch
-import tempfile
-import tarfile
-from distutils import log
-
-try:
- from site import USER_SITE
-except ImportError:
- USER_SITE = None
-
-try:
- import subprocess
-
- def _python_cmd(*args):
- args = (sys.executable,) + args
- return subprocess.call(args) == 0
-
-except ImportError:
- # will be used for python 2.3
- def _python_cmd(*args):
- args = (sys.executable,) + args
- # quoting arguments if windows
- if sys.platform == 'win32':
- def quote(arg):
- if ' ' in arg:
- return '"%s"' % arg
- return arg
- args = [quote(arg) for arg in args]
- return os.spawnl(os.P_WAIT, sys.executable, *args) == 0
-
-DEFAULT_VERSION = "0.6.10"
-DEFAULT_URL = "http://pypi.python.org/packages/source/d/distribute/"
-SETUPTOOLS_FAKED_VERSION = "0.6c11"
-
-SETUPTOOLS_PKG_INFO = """\
-Metadata-Version: 1.0
-Name: setuptools
-Version: %s
-Summary: xxxx
-Home-page: xxx
-Author: xxx
-Author-email: xxx
-License: xxx
-Description: xxx
-""" % SETUPTOOLS_FAKED_VERSION
-
-
-def _install(tarball):
- # extracting the tarball
- tmpdir = tempfile.mkdtemp()
- log.warn('Extracting in %s', tmpdir)
- old_wd = os.getcwd()
- try:
- os.chdir(tmpdir)
- tar = tarfile.open(tarball)
- _extractall(tar)
- tar.close()
-
- # going in the directory
- subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0])
- os.chdir(subdir)
- log.warn('Now working in %s', subdir)
-
- # installing
- log.warn('Installing Distribute')
- if not _python_cmd('setup.py', 'install'):
- log.warn('Something went wrong during the installation.')
- log.warn('See the error message above.')
- finally:
- os.chdir(old_wd)
-
-
-def _build_egg(egg, tarball, to_dir):
- # extracting the tarball
- tmpdir = tempfile.mkdtemp()
- log.warn('Extracting in %s', tmpdir)
- old_wd = os.getcwd()
- try:
- os.chdir(tmpdir)
- tar = tarfile.open(tarball)
- _extractall(tar)
- tar.close()
-
- # going in the directory
- subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0])
- os.chdir(subdir)
- log.warn('Now working in %s', subdir)
-
- # building an egg
- log.warn('Building a Distribute egg in %s', to_dir)
- _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir)
-
- finally:
- os.chdir(old_wd)
- # returning the result
- log.warn(egg)
- if not os.path.exists(egg):
- raise IOError('Could not build the egg.')
-
-
-def _do_download(version, download_base, to_dir, download_delay):
- egg = os.path.join(to_dir, 'distribute-%s-py%d.%d.egg'
- % (version, sys.version_info[0], sys.version_info[1]))
- if not os.path.exists(egg):
- tarball = download_setuptools(version, download_base,
- to_dir, download_delay)
- _build_egg(egg, tarball, to_dir)
- sys.path.insert(0, egg)
- import setuptools
- setuptools.bootstrap_install_from = egg
-
-
-def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,
- to_dir=os.curdir, download_delay=15, no_fake=True):
- # making sure we use the absolute path
- to_dir = os.path.abspath(to_dir)
- was_imported = 'pkg_resources' in sys.modules or \
- 'setuptools' in sys.modules
- try:
- try:
- import pkg_resources
- if not hasattr(pkg_resources, '_distribute'):
- if not no_fake:
- _fake_setuptools()
- raise ImportError
- except ImportError:
- return _do_download(version, download_base, to_dir, download_delay)
- try:
- pkg_resources.require("distribute>="+version)
- return
- except pkg_resources.VersionConflict:
- e = sys.exc_info()[1]
- if was_imported:
- sys.stderr.write(
- "The required version of distribute (>=%s) is not available,\n"
- "and can't be installed while this script is running. Please\n"
- "install a more recent version first, using\n"
- "'easy_install -U distribute'."
- "\n\n(Currently using %r)\n" % (version, e.args[0]))
- sys.exit(2)
- else:
- del pkg_resources, sys.modules['pkg_resources'] # reload ok
- return _do_download(version, download_base, to_dir,
- download_delay)
- except pkg_resources.DistributionNotFound:
- return _do_download(version, download_base, to_dir,
- download_delay)
- finally:
- if not no_fake:
- _create_fake_setuptools_pkg_info(to_dir)
-
-def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,
- to_dir=os.curdir, delay=15):
- """Download distribute from a specified location and return its filename
-
- `version` should be a valid distribute version number that is available
- as an egg for download under the `download_base` URL (which should end
- with a '/'). `to_dir` is the directory where the egg will be downloaded.
- `delay` is the number of seconds to pause before an actual download
- attempt.
- """
- # making sure we use the absolute path
- to_dir = os.path.abspath(to_dir)
- try:
- from urllib.request import urlopen
- except ImportError:
- from urllib2 import urlopen
- tgz_name = "distribute-%s.tar.gz" % version
- url = download_base + tgz_name
- saveto = os.path.join(to_dir, tgz_name)
- src = dst = None
- if not os.path.exists(saveto): # Avoid repeated downloads
- try:
- log.warn("Downloading %s", url)
- src = urlopen(url)
- # Read/write all in one block, so we don't create a corrupt file
- # if the download is interrupted.
- data = src.read()
- dst = open(saveto, "wb")
- dst.write(data)
- finally:
- if src:
- src.close()
- if dst:
- dst.close()
- return os.path.realpath(saveto)
-
-
-def _patch_file(path, content):
- """Will backup the file then patch it"""
- existing_content = open(path).read()
- if existing_content == content:
- # already patched
- log.warn('Already patched.')
- return False
- log.warn('Patching...')
- _rename_path(path)
- f = open(path, 'w')
- try:
- f.write(content)
- finally:
- f.close()
- return True
-
-
-def _same_content(path, content):
- return open(path).read() == content
-
-def _no_sandbox(function):
- def __no_sandbox(*args, **kw):
- try:
- from setuptools.sandbox import DirectorySandbox
- def violation(*args):
- pass
- DirectorySandbox._old = DirectorySandbox._violation
- DirectorySandbox._violation = violation
- patched = True
- except ImportError:
- patched = False
-
- try:
- return function(*args, **kw)
- finally:
- if patched:
- DirectorySandbox._violation = DirectorySandbox._old
- del DirectorySandbox._old
-
- return __no_sandbox
-
-@_no_sandbox
-def _rename_path(path):
- new_name = path + '.OLD.%s' % time.time()
- log.warn('Renaming %s into %s', path, new_name)
- os.rename(path, new_name)
- return new_name
-
-def _remove_flat_installation(placeholder):
- if not os.path.isdir(placeholder):
- log.warn('Unknown installation at %s', placeholder)
- return False
- found = False
- for file in os.listdir(placeholder):
- if fnmatch.fnmatch(file, 'setuptools*.egg-info'):
- found = True
- break
- if not found:
- log.warn('Could not locate setuptools*.egg-info')
- return
-
- log.warn('Removing elements out of the way...')
- pkg_info = os.path.join(placeholder, file)
- if os.path.isdir(pkg_info):
- patched = _patch_egg_dir(pkg_info)
- else:
- patched = _patch_file(pkg_info, SETUPTOOLS_PKG_INFO)
-
- if not patched:
- log.warn('%s already patched.', pkg_info)
- return False
- # now let's move the files out of the way
- for element in ('setuptools', 'pkg_resources.py', 'site.py'):
- element = os.path.join(placeholder, element)
- if os.path.exists(element):
- _rename_path(element)
- else:
- log.warn('Could not find the %s element of the '
- 'Setuptools distribution', element)
- return True
-
-
-def _after_install(dist):
- log.warn('After install bootstrap.')
- placeholder = dist.get_command_obj('install').install_purelib
- _create_fake_setuptools_pkg_info(placeholder)
-
-@_no_sandbox
-def _create_fake_setuptools_pkg_info(placeholder):
- if not placeholder or not os.path.exists(placeholder):
- log.warn('Could not find the install location')
- return
- pyver = '%s.%s' % (sys.version_info[0], sys.version_info[1])
- setuptools_file = 'setuptools-%s-py%s.egg-info' % \
- (SETUPTOOLS_FAKED_VERSION, pyver)
- pkg_info = os.path.join(placeholder, setuptools_file)
- if os.path.exists(pkg_info):
- log.warn('%s already exists', pkg_info)
- return
-
- log.warn('Creating %s', pkg_info)
- f = open(pkg_info, 'w')
- try:
- f.write(SETUPTOOLS_PKG_INFO)
- finally:
- f.close()
-
- pth_file = os.path.join(placeholder, 'setuptools.pth')
- log.warn('Creating %s', pth_file)
- f = open(pth_file, 'w')
- try:
- f.write(os.path.join(os.curdir, setuptools_file))
- finally:
- f.close()
-
-def _patch_egg_dir(path):
- # let's check if it's already patched
- pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO')
- if os.path.exists(pkg_info):
- if _same_content(pkg_info, SETUPTOOLS_PKG_INFO):
- log.warn('%s already patched.', pkg_info)
- return False
- _rename_path(path)
- os.mkdir(path)
- os.mkdir(os.path.join(path, 'EGG-INFO'))
- pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO')
- f = open(pkg_info, 'w')
- try:
- f.write(SETUPTOOLS_PKG_INFO)
- finally:
- f.close()
- return True
-
-
-def _before_install():
- log.warn('Before install bootstrap.')
- _fake_setuptools()
-
-
-def _under_prefix(location):
- if 'install' not in sys.argv:
- return True
- args = sys.argv[sys.argv.index('install')+1:]
- for index, arg in enumerate(args):
- for option in ('--root', '--prefix'):
- if arg.startswith('%s=' % option):
- top_dir = arg.split('%s=' % option)[-1]
- return location.startswith(top_dir)
- elif arg == option:
- if len(args) > index:
- top_dir = args[index+1]
- return location.startswith(top_dir)
- elif option == '--user' and USER_SITE is not None:
- return location.startswith(USER_SITE)
- return True
-
-
-def _fake_setuptools():
- log.warn('Scanning installed packages')
- try:
- import pkg_resources
- except ImportError:
- # we're cool
- log.warn('Setuptools or Distribute does not seem to be installed.')
- return
- ws = pkg_resources.working_set
- try:
- setuptools_dist = ws.find(pkg_resources.Requirement.parse('setuptools',
- replacement=False))
- except TypeError:
- # old distribute API
- setuptools_dist = ws.find(pkg_resources.Requirement.parse('setuptools'))
-
- if setuptools_dist is None:
- log.warn('No setuptools distribution found')
- return
- # detecting if it was already faked
- setuptools_location = setuptools_dist.location
- log.warn('Setuptools installation detected at %s', setuptools_location)
-
- # if --root or --prefix was provided, and if
- # setuptools is not located in them, we don't patch it
- if not _under_prefix(setuptools_location):
- log.warn('Not patching, --root or --prefix is installing Distribute'
- ' in another location')
- return
-
- # let's see if it's an egg
- if not setuptools_location.endswith('.egg'):
- log.warn('Non-egg installation')
- res = _remove_flat_installation(setuptools_location)
- if not res:
- return
- else:
- log.warn('Egg installation')
- pkg_info = os.path.join(setuptools_location, 'EGG-INFO', 'PKG-INFO')
- if (os.path.exists(pkg_info) and
- _same_content(pkg_info, SETUPTOOLS_PKG_INFO)):
- log.warn('Already patched.')
- return
- log.warn('Patching...')
- # let's create a fake egg replacing the setuptools one
- res = _patch_egg_dir(setuptools_location)
- if not res:
- return
- log.warn('Patching done.')
- _relaunch()
-
-
-def _relaunch():
- log.warn('Relaunching...')
- # we have to relaunch the process
- args = [sys.executable] + sys.argv
- sys.exit(subprocess.call(args))
-
-
-def _extractall(self, path=".", members=None):
- """Extract all members from the archive to the current working
- directory and set owner, modification time and permissions on
- directories afterwards. `path' specifies a different directory
- to extract to. `members' is optional and must be a subset of the
- list returned by getmembers().
- """
- import copy
- import operator
- from tarfile import ExtractError
- directories = []
-
- if members is None:
- members = self
-
- for tarinfo in members:
- if tarinfo.isdir():
- # Extract directories with a safe mode.
- directories.append(tarinfo)
- tarinfo = copy.copy(tarinfo)
- tarinfo.mode = 448 # decimal for oct 0700
- self.extract(tarinfo, path)
-
- # Reverse sort directories.
- if sys.version_info < (2, 4):
- def sorter(dir1, dir2):
- return cmp(dir1.name, dir2.name)
- directories.sort(sorter)
- directories.reverse()
- else:
- directories.sort(key=operator.attrgetter('name'), reverse=True)
-
- # Set correct owner, mtime and filemode on directories.
- for tarinfo in directories:
- dirpath = os.path.join(path, tarinfo.name)
- try:
- self.chown(tarinfo, dirpath)
- self.utime(tarinfo, dirpath)
- self.chmod(tarinfo, dirpath)
- except ExtractError:
- e = sys.exc_info()[1]
- if self.errorlevel > 1:
- raise
- else:
- self._dbg(1, "tarfile: %s" % e)
-
-
-def main(argv, version=DEFAULT_VERSION):
- """Install or upgrade setuptools and EasyInstall"""
- tarball = download_setuptools()
- _install(tarball)
-
-
-if __name__ == '__main__':
- main(sys.argv[1:])
+#!python
+"""Bootstrap distribute installation
+
+If you want to use setuptools in your package's setup.py, just include this
+file in the same directory with it, and add this to the top of your setup.py::
+
+ from distribute_setup import use_setuptools
+ use_setuptools()
+
+If you want to require a specific version of setuptools, set a download
+mirror, or use an alternate download directory, you can do so by supplying
+the appropriate options to ``use_setuptools()``.
+
+This file can also be run as a script to install or upgrade setuptools.
+"""
+import os
+import sys
+import time
+import fnmatch
+import tempfile
+import tarfile
+from distutils import log
+
+try:
+ from site import USER_SITE
+except ImportError:
+ USER_SITE = None
+
+try:
+ import subprocess
+
+ def _python_cmd(*args):
+ args = (sys.executable,) + args
+ return subprocess.call(args) == 0
+
+except ImportError:
+ # will be used for python 2.3
+ def _python_cmd(*args):
+ args = (sys.executable,) + args
+ # quoting arguments if windows
+ if sys.platform == 'win32':
+ def quote(arg):
+ if ' ' in arg:
+ return '"%s"' % arg
+ return arg
+ args = [quote(arg) for arg in args]
+ return os.spawnl(os.P_WAIT, sys.executable, *args) == 0
+
+DEFAULT_VERSION = "0.6.10"
+DEFAULT_URL = "http://pypi.python.org/packages/source/d/distribute/"
+SETUPTOOLS_FAKED_VERSION = "0.6c11"
+
+SETUPTOOLS_PKG_INFO = """\
+Metadata-Version: 1.0
+Name: setuptools
+Version: %s
+Summary: xxxx
+Home-page: xxx
+Author: xxx
+Author-email: xxx
+License: xxx
+Description: xxx
+""" % SETUPTOOLS_FAKED_VERSION
+
+
+def _install(tarball):
+ # extracting the tarball
+ tmpdir = tempfile.mkdtemp()
+ log.warn('Extracting in %s', tmpdir)
+ old_wd = os.getcwd()
+ try:
+ os.chdir(tmpdir)
+ tar = tarfile.open(tarball)
+ _extractall(tar)
+ tar.close()
+
+ # going in the directory
+ subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0])
+ os.chdir(subdir)
+ log.warn('Now working in %s', subdir)
+
+ # installing
+ log.warn('Installing Distribute')
+ if not _python_cmd('setup.py', 'install'):
+ log.warn('Something went wrong during the installation.')
+ log.warn('See the error message above.')
+ finally:
+ os.chdir(old_wd)
+
+
+def _build_egg(egg, tarball, to_dir):
+ # extracting the tarball
+ tmpdir = tempfile.mkdtemp()
+ log.warn('Extracting in %s', tmpdir)
+ old_wd = os.getcwd()
+ try:
+ os.chdir(tmpdir)
+ tar = tarfile.open(tarball)
+ _extractall(tar)
+ tar.close()
+
+ # going in the directory
+ subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0])
+ os.chdir(subdir)
+ log.warn('Now working in %s', subdir)
+
+ # building an egg
+ log.warn('Building a Distribute egg in %s', to_dir)
+ _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir)
+
+ finally:
+ os.chdir(old_wd)
+ # checking the result
+ log.warn(egg)
+ if not os.path.exists(egg):
+ raise IOError('Could not build the egg.')
+
+
+def _do_download(version, download_base, to_dir, download_delay):
+ egg = os.path.join(to_dir, 'distribute-%s-py%d.%d.egg'
+ % (version, sys.version_info[0], sys.version_info[1]))
+ if not os.path.exists(egg):
+ tarball = download_setuptools(version, download_base,
+ to_dir, download_delay)
+ _build_egg(egg, tarball, to_dir)
+ sys.path.insert(0, egg)
+ import setuptools
+ setuptools.bootstrap_install_from = egg
+
+
+def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,
+ to_dir=os.curdir, download_delay=15, no_fake=True):
+ # making sure we use the absolute path
+ to_dir = os.path.abspath(to_dir)
+ was_imported = 'pkg_resources' in sys.modules or \
+ 'setuptools' in sys.modules
+ try:
+ try:
+ import pkg_resources
+ if not hasattr(pkg_resources, '_distribute'):
+ if not no_fake:
+ _fake_setuptools()
+ raise ImportError
+ except ImportError:
+ return _do_download(version, download_base, to_dir, download_delay)
+ try:
+ pkg_resources.require("distribute>="+version)
+ return
+ except pkg_resources.VersionConflict:
+ e = sys.exc_info()[1]
+ if was_imported:
+ sys.stderr.write(
+ "The required version of distribute (>=%s) is not available,\n"
+ "and can't be installed while this script is running. Please\n"
+ "install a more recent version first, using\n"
+ "'easy_install -U distribute'."
+ "\n\n(Currently using %r)\n" % (version, e.args[0]))
+ sys.exit(2)
+ else:
+ del pkg_resources, sys.modules['pkg_resources'] # reload ok
+ return _do_download(version, download_base, to_dir,
+ download_delay)
+ except pkg_resources.DistributionNotFound:
+ return _do_download(version, download_base, to_dir,
+ download_delay)
+ finally:
+ if not no_fake:
+ _create_fake_setuptools_pkg_info(to_dir)
+
+def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,
+ to_dir=os.curdir, delay=15):
+ """Download distribute from a specified location and return its filename
+
+ `version` should be a valid distribute version number that is available
+ as a source tarball for download under the `download_base` URL (which
+ should end with a '/'). `to_dir` is the directory where the tarball will
+ be downloaded. `delay` is the number of seconds to pause before an actual
+ download attempt.
+ """
+ # making sure we use the absolute path
+ to_dir = os.path.abspath(to_dir)
+ try:
+ from urllib.request import urlopen
+ except ImportError:
+ from urllib2 import urlopen
+ tgz_name = "distribute-%s.tar.gz" % version
+ url = download_base + tgz_name
+ saveto = os.path.join(to_dir, tgz_name)
+ src = dst = None
+ if not os.path.exists(saveto): # Avoid repeated downloads
+ try:
+ log.warn("Downloading %s", url)
+ src = urlopen(url)
+ # Read/write all in one block, so we don't create a corrupt file
+ # if the download is interrupted.
+ data = src.read()
+ dst = open(saveto, "wb")
+ dst.write(data)
+ finally:
+ if src:
+ src.close()
+ if dst:
+ dst.close()
+ return os.path.realpath(saveto)
+
+
+def _patch_file(path, content):
+ """Will backup the file then patch it"""
+ existing_content = open(path).read()
+ if existing_content == content:
+ # already patched
+ log.warn('Already patched.')
+ return False
+ log.warn('Patching...')
+ _rename_path(path)
+ f = open(path, 'w')
+ try:
+ f.write(content)
+ finally:
+ f.close()
+ return True
+
+
+def _same_content(path, content):
+ return open(path).read() == content
+
+def _no_sandbox(function):
+ def __no_sandbox(*args, **kw):
+ try:
+ from setuptools.sandbox import DirectorySandbox
+ def violation(*args):
+ pass
+ DirectorySandbox._old = DirectorySandbox._violation
+ DirectorySandbox._violation = violation
+ patched = True
+ except ImportError:
+ patched = False
+
+ try:
+ return function(*args, **kw)
+ finally:
+ if patched:
+ DirectorySandbox._violation = DirectorySandbox._old
+ del DirectorySandbox._old
+
+ return __no_sandbox
+
+@_no_sandbox
+def _rename_path(path):
+ new_name = path + '.OLD.%s' % time.time()
+ log.warn('Renaming %s into %s', path, new_name)
+ os.rename(path, new_name)
+ return new_name
+
+def _remove_flat_installation(placeholder):
+ if not os.path.isdir(placeholder):
+ log.warn('Unknown installation at %s', placeholder)
+ return False
+ found = False
+ for file in os.listdir(placeholder):
+ if fnmatch.fnmatch(file, 'setuptools*.egg-info'):
+ found = True
+ break
+ if not found:
+ log.warn('Could not locate setuptools*.egg-info')
+ return
+
+ log.warn('Removing elements out of the way...')
+ pkg_info = os.path.join(placeholder, file)
+ if os.path.isdir(pkg_info):
+ patched = _patch_egg_dir(pkg_info)
+ else:
+ patched = _patch_file(pkg_info, SETUPTOOLS_PKG_INFO)
+
+ if not patched:
+ log.warn('%s already patched.', pkg_info)
+ return False
+ # now let's move the files out of the way
+ for element in ('setuptools', 'pkg_resources.py', 'site.py'):
+ element = os.path.join(placeholder, element)
+ if os.path.exists(element):
+ _rename_path(element)
+ else:
+ log.warn('Could not find the %s element of the '
+ 'Setuptools distribution', element)
+ return True
+
+
+def _after_install(dist):
+ log.warn('After install bootstrap.')
+ placeholder = dist.get_command_obj('install').install_purelib
+ _create_fake_setuptools_pkg_info(placeholder)
+
+@_no_sandbox
+def _create_fake_setuptools_pkg_info(placeholder):
+ if not placeholder or not os.path.exists(placeholder):
+ log.warn('Could not find the install location')
+ return
+ pyver = '%s.%s' % (sys.version_info[0], sys.version_info[1])
+ setuptools_file = 'setuptools-%s-py%s.egg-info' % \
+ (SETUPTOOLS_FAKED_VERSION, pyver)
+ pkg_info = os.path.join(placeholder, setuptools_file)
+ if os.path.exists(pkg_info):
+ log.warn('%s already exists', pkg_info)
+ return
+
+ log.warn('Creating %s', pkg_info)
+ f = open(pkg_info, 'w')
+ try:
+ f.write(SETUPTOOLS_PKG_INFO)
+ finally:
+ f.close()
+
+ pth_file = os.path.join(placeholder, 'setuptools.pth')
+ log.warn('Creating %s', pth_file)
+ f = open(pth_file, 'w')
+ try:
+ f.write(os.path.join(os.curdir, setuptools_file))
+ finally:
+ f.close()
+
+def _patch_egg_dir(path):
+ # let's check if it's already patched
+ pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO')
+ if os.path.exists(pkg_info):
+ if _same_content(pkg_info, SETUPTOOLS_PKG_INFO):
+ log.warn('%s already patched.', pkg_info)
+ return False
+ _rename_path(path)
+ os.mkdir(path)
+ os.mkdir(os.path.join(path, 'EGG-INFO'))
+ pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO')
+ f = open(pkg_info, 'w')
+ try:
+ f.write(SETUPTOOLS_PKG_INFO)
+ finally:
+ f.close()
+ return True
+
+
+def _before_install():
+ log.warn('Before install bootstrap.')
+ _fake_setuptools()
+
+
+def _under_prefix(location):
+ if 'install' not in sys.argv:
+ return True
+ args = sys.argv[sys.argv.index('install')+1:]
+ for index, arg in enumerate(args):
+ for option in ('--root', '--prefix'):
+ if arg.startswith('%s=' % option):
+ top_dir = arg.split('%s=' % option)[-1]
+ return location.startswith(top_dir)
+ elif arg == option:
+ if len(args) > index:
+ top_dir = args[index+1]
+ return location.startswith(top_dir)
+ elif option == '--user' and USER_SITE is not None:
+ return location.startswith(USER_SITE)
+ return True
+
+
+def _fake_setuptools():
+ log.warn('Scanning installed packages')
+ try:
+ import pkg_resources
+ except ImportError:
+ # we're cool
+ log.warn('Setuptools or Distribute does not seem to be installed.')
+ return
+ ws = pkg_resources.working_set
+ try:
+ setuptools_dist = ws.find(pkg_resources.Requirement.parse('setuptools',
+ replacement=False))
+ except TypeError:
+ # old distribute API
+ setuptools_dist = ws.find(pkg_resources.Requirement.parse('setuptools'))
+
+ if setuptools_dist is None:
+ log.warn('No setuptools distribution found')
+ return
+ # detecting if it was already faked
+ setuptools_location = setuptools_dist.location
+ log.warn('Setuptools installation detected at %s', setuptools_location)
+
+ # if --root or --prefix was provided, and if
+ # setuptools is not located in them, we don't patch it
+ if not _under_prefix(setuptools_location):
+ log.warn('Not patching, --root or --prefix is installing Distribute'
+ ' in another location')
+ return
+
+ # let's see if it's an egg
+ if not setuptools_location.endswith('.egg'):
+ log.warn('Non-egg installation')
+ res = _remove_flat_installation(setuptools_location)
+ if not res:
+ return
+ else:
+ log.warn('Egg installation')
+ pkg_info = os.path.join(setuptools_location, 'EGG-INFO', 'PKG-INFO')
+ if (os.path.exists(pkg_info) and
+ _same_content(pkg_info, SETUPTOOLS_PKG_INFO)):
+ log.warn('Already patched.')
+ return
+ log.warn('Patching...')
+ # let's create a fake egg replacing the setuptools one
+ res = _patch_egg_dir(setuptools_location)
+ if not res:
+ return
+ log.warn('Patching done.')
+ _relaunch()
+
+
+def _relaunch():
+ log.warn('Relaunching...')
+ # we have to relaunch the process
+ args = [sys.executable] + sys.argv
+ sys.exit(subprocess.call(args))
+
+
+def _extractall(self, path=".", members=None):
+ """Extract all members from the archive to the current working
+ directory and set owner, modification time and permissions on
+ directories afterwards. `path' specifies a different directory
+ to extract to. `members' is optional and must be a subset of the
+ list returned by getmembers().
+ """
+ import copy
+ import operator
+ from tarfile import ExtractError
+ directories = []
+
+ if members is None:
+ members = self
+
+ for tarinfo in members:
+ if tarinfo.isdir():
+ # Extract directories with a safe mode.
+ directories.append(tarinfo)
+ tarinfo = copy.copy(tarinfo)
+ tarinfo.mode = 448 # decimal for oct 0700
+ self.extract(tarinfo, path)
+
+ # Reverse sort directories.
+ if sys.version_info < (2, 4):
+ def sorter(dir1, dir2):
+ return cmp(dir1.name, dir2.name)
+ directories.sort(sorter)
+ directories.reverse()
+ else:
+ directories.sort(key=operator.attrgetter('name'), reverse=True)
+
+ # Set correct owner, mtime and filemode on directories.
+ for tarinfo in directories:
+ dirpath = os.path.join(path, tarinfo.name)
+ try:
+ self.chown(tarinfo, dirpath)
+ self.utime(tarinfo, dirpath)
+ self.chmod(tarinfo, dirpath)
+ except ExtractError:
+ e = sys.exc_info()[1]
+ if self.errorlevel > 1:
+ raise
+ else:
+ self._dbg(1, "tarfile: %s" % e)
+
+
+def main(argv, version=DEFAULT_VERSION):
+ """Install or upgrade setuptools and EasyInstall"""
+ tarball = download_setuptools()
+ _install(tarball)
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
Property changes on: branches/rewrite/distribute_setup.py
___________________________________________________________________
Added: svn:eol-style
+ native
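The bootstrap file above is driven from a project's own setup.py, as the module docstring describes. A minimal sketch of that usage, assuming distribute_setup.py sits next to setup.py; the project name and module below are made up for illustration:

    # setup.py -- lives in the same directory as distribute_setup.py
    from distribute_setup import use_setuptools

    # Bootstrap distribute before importing setuptools; version and
    # download_delay are parameters of use_setuptools() above.
    use_setuptools(version='0.6.10', download_delay=15)

    from setuptools import setup

    setup(
        name='examplebot',          # hypothetical project name
        version='0.1',
        py_modules=['examplebot'],
    )

If distribute is already importable at the required version, use_setuptools() returns immediately; otherwise it downloads the tarball, builds an egg in the download directory, and inserts that egg at the front of sys.path.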
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9054
Revision: 9054
Author: xqt
Date: 2011-03-13 12:28:06 +0000 (Sun, 13 Mar 2011)
Log Message:
-----------
readme files for subdirs
Added Paths:
-----------
trunk/pywikipedia/cache/README
trunk/pywikipedia/category/README
Copied: trunk/pywikipedia/cache/README (from rev 9049, trunk/pywikipedia/botlists/README)
===================================================================
--- trunk/pywikipedia/cache/README (rev 0)
+++ trunk/pywikipedia/cache/README 2011-03-13 12:28:06 UTC (rev 9054)
@@ -0,0 +1,2 @@
+This directory/folder is empty when you get the package. It is used by the
+robots.
Copied: trunk/pywikipedia/category/README (from rev 9049, trunk/pywikipedia/botlists/README)
===================================================================
--- trunk/pywikipedia/category/README (rev 0)
+++ trunk/pywikipedia/category/README 2011-03-13 12:28:06 UTC (rev 9054)
@@ -0,0 +1,2 @@
+This directory/folder is empty when you get the package. It is used by the
+robots.
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9053
Revision: 9053
Author: xqt
Date: 2011-03-13 12:24:49 +0000 (Sun, 13 Mar 2011)
Log Message:
-----------
stripped trailing whitespace
Modified Paths:
--------------
trunk/pywikipedia/commonsdelinker/checkusage.py
trunk/pywikipedia/commonsdelinker/delinker.py
trunk/pywikipedia/commonsdelinker/image_replacer.py
trunk/pywikipedia/commonsdelinker/threadpool.py
Modified: trunk/pywikipedia/commonsdelinker/checkusage.py
===================================================================
--- trunk/pywikipedia/commonsdelinker/checkusage.py 2011-03-13 12:19:20 UTC (rev 9052)
+++ trunk/pywikipedia/commonsdelinker/checkusage.py 2011-03-13 12:24:49 UTC (rev 9053)
@@ -1,32 +1,32 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-This module provides a way for users of the Wikimedia toolserver to check the
+This module provides a way for users of the Wikimedia toolserver to check the
use of images from Commons on other Wikimedia wikis. It supports both running
-checkusage against the database and against the live wikis. It is very
-efficient as it only creates one HTTP connection and one MySQL connection
+checkusage against the database and against the live wikis. It is very
+efficient as it only creates one HTTP connection and one MySQL connection
during its lifetime. It is not suitable for multithreading!
-
+
The CheckUsage class' constructor accepts as parameters the maximum number of
wikis that should be checked, an option to use it only live and the parameters
-to connect to the MySQL database. The top wikis in size will be checked. The
+to connect to the MySQL database. The top wikis in size will be checked. The
class provides multiple methods:
-
+
get_usage(image)
-This method will return a generator object that generates the usage of the
+This method will return a generator object that generates the usage of the
image, returned as the following tuple: (page_namespace, page_title,
full_title). page_namespace is the numeric namespace, page_title the page title
without namespace, full_title the page title including localized namespace.
-
+
get_usage_db(dbname, image), get_usage_live(domain, image)
Those methods allow querying a specific wiki, respectively against the database
and against the live wiki. They accept respectively the database name and the
-domain name. They return a generator which generates the same results as
+domain name. They return a generator which generates the same results as
get_usage().
-
+
get_usage_multi(images)
Calls get_usage for each image and returns a dictionary with usages.
-
+
get_replag(dbname)
Returns the time in seconds since the latest known edit of dbname.
"""
@@ -37,13 +37,13 @@
#
__version__ = '$Id$'
#
-
+
import httplib, urlparse, socket, time
from urllib import urlencode
import simplejson
import wikipedia, family
-
+
try:
import MySQLdb
except ImportError:
@@ -53,7 +53,7 @@
except ImportError:
pass
__ver__ = '0.4c'
-
+
def strip_ns(title):
title = title.replace(' ', '_')
if title.find(':') != -1:
@@ -63,14 +63,14 @@
if title.startswith('Image:'):
return strip_ns(title)
return title
-
+
def family(domain):
if domain is None:
raise RuntimeError('None is not a valid family')
-
+
wiki = domain.split('.')
# Standard family
- if wiki[1] in ('wikipedia', 'wiktionary', 'wikibooks',
+ if wiki[1] in ('wikipedia', 'wiktionary', 'wikibooks',
'wikiquote', 'wikisource', 'wikinews', 'wikiversity'):
return wiki[0], wiki[1]
# Family on own domain
@@ -92,7 +92,7 @@
#self._conn.set_debuglevel(100)
self._conn.connect()
- def request(self, method, path, headers, data):
+ def request(self, method, path, headers, data):
if not headers: headers = {}
if not data: data = ''
headers['Connection'] = 'Keep-Alive'
@@ -143,28 +143,28 @@
data = simplejson.load(res)
finally:
res.close()
-
+
if 'error' in data:
if data['error']['code'] == u'internal_api_error_DBConnectionError':
return self.query_api(host, path, **kwargs)
- raise wikipedia.Error(data['error']['code'],
+ raise wikipedia.Error(data['error']['code'],
data['error']['info'])
-
+
return data
def close(self):
self._conn.close()
class HTTPPool(list):
- def __init__(self, retry_timeout = 10, max_retries = -1,
+ def __init__(self, retry_timeout = 10, max_retries = -1,
callback = lambda *args: None):
-
+
self.retry_timeout = retry_timeout
self.max_retries = -1
self.callback = callback
self.current_retry = 0
-
+
list.__init__(self, ())
-
+
def query_api(self, host, path, **kwargs):
conn = self.find_conn(host)
while True:
@@ -180,7 +180,7 @@
self.wait()
conn = self.find_conn(host)
-
+
def find_conn(self, host):
for conn in self:
if host in conn.hosts:
@@ -199,37 +199,37 @@
conn.hosts = []
self.append(conn)
return self
-
+
def wait(self):
if self.current_retry > self.max_retries and self.max_retries != -1:
raise RuntimeError('Maximum retries exceeded')
if self.current_retry:
self.callback(self)
- time.sleep(self.current_retry * self.retry_timeout)
+ time.sleep(self.current_retry * self.retry_timeout)
self.current_retry += 1
-
+
def close(self):
for conn in self:
conn.close()
del self[:]
-
+
class CheckUsage(object):
- def __init__(self, limit = 100,
+ def __init__(self, limit = 100,
mysql_default_server = 3, mysql_host_prefix = 'sql-s', mysql_host_suffix = '',
- mysql_kwargs = {}, no_db = False, use_autoconn = False,
-
- http_retry_timeout = 30, http_max_retries = -1,
+ mysql_kwargs = {}, no_db = False, use_autoconn = False,
+
+ http_retry_timeout = 30, http_max_retries = -1,
http_callback = lambda *args: None,
-
+
mysql_retry_timeout = 60,
mysql_max_retries = -1, mysql_callback = lambda *args: None):
-
- self.http = None
+
+ self.http = None
self.http_retry_timeout = http_retry_timeout
self.http_max_retries = http_max_retries
self.http_callback = http_callback
-
+
if no_db: return
self.mysql_host_prefix = mysql_host_prefix
@@ -239,18 +239,18 @@
self.mysql_retry_timeout = mysql_retry_timeout
self.mysql_max_retries = mysql_max_retries
self.mysql_callback = mysql_callback
-
+
self.connections = []
-
+
# Mapping database name -> mysql connection
self.databases = {}
# Mapping server id -> mysql connection
self.servers = {}
# Mapping database name -> (lang, family)
self.sites = {}
-
+
self.domains = {}
-
+
self.unknown_families = []
# Mapping family name -> family object
self.known_families = {}
@@ -263,7 +263,7 @@
for dbname, domain, server in cursor.fetchall():
if server not in self.servers:
self.servers[server] = self.connect_mysql(mysql_host_prefix + str(server) + mysql_host_suffix)
-
+
# FIXME: wikimediafoundation!
# TODO: This is one big mess
try:
@@ -275,7 +275,7 @@
else:
self.sites[dbname] = (lang, fam)
self.databases[dbname] = self.servers[server]
-
+
self.domains[dbname] = domain
@@ -286,8 +286,8 @@
if self.use_autoconn:
database = mysql_autoconnection.connect(
use_unicode = False, host = host,
- retry_timeout = self.mysql_retry_timeout,
- max_retries = self.mysql_max_retries,
+ retry_timeout = self.mysql_retry_timeout,
+ max_retries = self.mysql_max_retries,
callback = self.mysql_callback,
**self.mysql_kwargs)
else:
@@ -298,7 +298,7 @@
return database, cursor
def connect_http(self):
if not self.http:
- self.http = HTTPPool(retry_timeout = self.http_retry_timeout,
+ self.http = HTTPPool(retry_timeout = self.http_retry_timeout,
max_retries = self.http_max_retries, callback = self.http_callback)
def get_usage(self, image):
@@ -311,14 +311,14 @@
#image = strip_image(image)
lang, family_name = self.sites[dbname]
family = self.known_families[family_name]
-
+
if family.shared_image_repository(lang) != (lang, family_name) and shared:
left_join = 'LEFT JOIN %s.image ON (il_to = img_name) WHERE img_name IS NULL AND' % dbname
else:
left_join = 'WHERE';
query = """SELECT page_namespace, page_title FROM %s.page, %s.imagelinks
%s page_id = il_from AND il_to = %%s"""
- self.databases[dbname][1].execute(query % (dbname, dbname, left_join),
+ self.databases[dbname][1].execute(query % (dbname, dbname, left_join),
(image.encode('utf-8', 'ignore'), ))
for page_namespace, page_title in self.databases[dbname][1]:
stripped_title = page_title.decode('utf-8', 'ignore')
@@ -330,32 +330,32 @@
def get_usage_live(self, site, image, shared = False):
self.connect_http()
-
+
if type(site) is str:
hostname = site
apipath = '/w/api.php'
else:
hostname = site.hostname()
apipath = site.apipath()
-
+
# FIXME: Use continue
kwargs = {'action': 'query', 'iutitle': u'Image:' + image,
'titles': u'Image:' + image, 'prop': 'info'}
kwargs['list'] = 'imageusage'
kwargs['iulimit'] = '500'
-
+
res = self.http.query_api(hostname, apipath,
**kwargs)
if '-1' not in res['query']['pages'] and shared:
return
-
+
usages = res['query'].get('imageusage')
if not usages: return
-
+
# Apparently this changed from dict to list at some point?
if type(usages) is dict:
usages = usages.values()
-
+
for usage in usages:
title = usage['title'].replace(' ', '_')
namespace = usage['ns']
@@ -365,7 +365,7 @@
stripped_title = title
yield namespace, stripped_title, title
-
+
def exists(self, site, image):
self.connect_http()
# Check whether the image still is deleted on Commons.
@@ -375,8 +375,8 @@
# BUG: This is ugly.
return '-1' not in self.http.query_api(site.hostname(), site.apipath(),
action = 'query', titles = 'Image:' + image)['query']['pages']
-
-
+
+
def close(self):
if getattr(self, 'http'):
self.http.close()
@@ -384,7 +384,6 @@
for connection, cursor in self.databases.itervalues():
try:
connection.close()
- except:
+ except:
pass
-
\ No newline at end of file
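The module docstring above sketches how CheckUsage is meant to be consumed. A short example of a live-only query, assuming no toolserver database is available (no_db=True skips the MySQL setup in CheckUsage.__init__); the host and image name are placeholders:

    import checkusage

    cu = checkusage.CheckUsage(limit=100, no_db=True)
    # get_usage_live() accepts a hostname string (or a Site object) and
    # yields (page_namespace, page_title, full_title) tuples.
    for ns, title, full_title in cu.get_usage_live('commons.wikimedia.org',
                                                   u'Example.jpg'):
        print ns, full_title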
Modified: trunk/pywikipedia/commonsdelinker/delinker.py
===================================================================
--- trunk/pywikipedia/commonsdelinker/delinker.py 2011-03-13 12:19:20 UTC (rev 9052)
+++ trunk/pywikipedia/commonsdelinker/delinker.py 2011-03-13 12:24:49 UTC (rev 9053)
@@ -1,7 +1,7 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-This script keeps track of image deletions and delinks removed files
+This script keeps track of image deletions and delinks removed files
from (any) wiki. Protected pages and pages containing blacklisted
external links cannot be processed.
@@ -15,7 +15,7 @@
Please refer to delinker.txt for full documentation.
"""
#
-#
+#
# (C) Kyle/Orgullomoore, 2006-2007
# (C) Siebrand Mazeland, 2006-2007
# (C) Bryan Tong Minh, 2007-2008
@@ -55,7 +55,7 @@
output(u'%s Connection has been lost in %s. Attempting reconnection.' % (threading.currentThread(), repr(object)), False)
if hasattr(object, 'error'):
output(u'Error was %s: %s' % tuple(object.error))
-
+
def universal_unicode(s):
if type(s) is str:
return s.decode('utf-8', 'ignore')
@@ -75,11 +75,11 @@
# the standard MySQL character set.
kwargs['use_unicode'] = False
kwargs['callback'] = wait_callback
-
+
return mysql_autoconnection.connect(**kwargs)
# TODO: Add support for sqlite3
raise RuntimeError('Unsupported database engine %s' % engine)
-
+
class ImmutableByReference(object):
def __init__(self, data):
self.data = data
@@ -100,30 +100,30 @@
threadpool.Thread.__init__(self, pool)
self.CommonsDelinker = CommonsDelinker
self.sql_layout = self.CommonsDelinker.config.get('sql_layout', 'new')
-
+
def delink_image(self, image, usage, timestamp, admin, reason, replacement = None):
""" Performs the delink for image on usage. """
output(u'%s Usage of %s: %s' % (self, image, usage))
if self.CommonsDelinker.exec_hook('before_delink',
(image, usage, timestamp, admin, reason, replacement)) is False:
return
-
+
skipped_images = {}
for (lang, family), pages in usage.iteritems():
site = self.CommonsDelinker.get_site(lang, family)
if not site:
output(u'%s Warning! Unknown site %s:%s' % (self, family, lang))
continue
-
+
try:
summary = self.get_summary(site, image, admin, reason, replacement)
-
+
for page_namespace, page_title, title in pages:
if (site.lang, site.family.name) == (self.CommonsDelinker.site.lang,
self.CommonsDelinker.site.family.name) and \
(page_namespace, page_title) == (6, image):
continue
-
+
if self.CommonsDelinker.set_edit(str(site), title):
# The page is currently being edited. Postpone.
if (lang, family) not in skipped_images:
@@ -133,7 +133,7 @@
else:
# Delink the image
output(u'%s Delinking %s from %s' % (self, image, site))
-
+
try:
try:
result = self.replace_image(image, site, title, summary, replacement)
@@ -147,14 +147,14 @@
(page_namespace, page_title, title))
finally:
self.CommonsDelinker.unset_edit(str(site), title)
-
+
# Add to logging queue
if self.sql_layout == 'new':
- self.CommonsDelinker.Loggers.append((timestamp, image,
+ self.CommonsDelinker.Loggers.append((timestamp, image,
site.lang, site.family.name, page_namespace, page_title,
result, replacement))
else:
- self.CommonsDelinker.Loggers.append((timestamp, image, site.hostname(),
+ self.CommonsDelinker.Loggers.append((timestamp, image, site.hostname(),
page_namespace, page_title, result, replacement))
finally:
self.CommonsDelinker.unlock_site(site)
@@ -168,14 +168,14 @@
elif replacement:
# Let them know that we are done replacing.
self.CommonsDelinker.Loggers.append((timestamp, image, replacement))
-
+
def replace_image(self, image, site, page_title, summary, replacement = None):
""" The actual replacement. Giving None as argument for replacement
will delink instead of replace."""
-
+
page = wikipedia.Page(site, page_title)
hook = None
-
+
# TODO: Per site config.
if page.namespace() in self.CommonsDelinker.config['delink_namespaces']:
try:
@@ -183,25 +183,25 @@
except wikipedia.NoPage:
return 'failed'
new_text = text
-
+
m_image = ImmutableByReference(image)
m_replacement = ImmutableByReference(replacement)
- self.CommonsDelinker.exec_hook('before_replace',
+ self.CommonsDelinker.exec_hook('before_replace',
(page, summary, m_image, m_replacement))
image = m_image.get()
replacement = m_replacement.get()
-
+
def create_regex(s):
first, other = re.escape(s[0]), re.escape(s[1:])
return ur'(?:[%s%s]%s)' % (first.upper(), first.lower(), other)
def create_regex_i(s):
return ur'(?:%s)' % u''.join([u'[%s%s]' % (c.upper(), c.lower()) for c in s])
-
+
namespaces = site.namespace(6, all = True) + site.namespace(-2, all = True)
r_namespace = ur'\s*(?:%s)\s*\:\s*' % u'|'.join(map(create_regex_i, namespaces))
# Note that this regex creates a group!
r_image = u'(%s)' % create_regex(image).replace(r'\_', '[ _]')
-
+
def simple_replacer(match):
m_replacement = ImmutableByReference(replacement)
groups = list(match.groups())
@@ -209,21 +209,21 @@
if False is self.CommonsDelinker.exec_hook('%s_replace' % hook,
(page, summary, image, m_replacement, match, groups)):
return u''.join(groups)
-
+
if m_replacement.get() is None:
return u''
else:
groups[1] = m_replacement.get()
return u''.join(groups)
-
- # Previously, links in image descriptions would cause
+
+ # Previously, links in image descriptions would cause
# unexpected behaviour: [[Image:image.jpg|thumb|[[link]] in description]]
# would truncate at the first occurrence of ]]. This cannot be
# fixed using one regular expression.
# This means that all ]] after the start of the image
# must be located. If it then does not have an associated
# [[, this one is the closure of the image.
-
+
r_simple_s = u'(\[\[%s)%s' % (r_namespace, r_image)
r_s = '\[\['
r_e = '\]\]'
@@ -231,25 +231,25 @@
image_starts = [match.start() for match in re.finditer(r_simple_s, text)]
link_starts = [match.start() for match in re.finditer(r_s, text)]
link_ends = [match.end() for match in re.finditer(r_e, text)]
-
+
r_simple = u'(\[\[%s)%s(.*)' % (r_namespace, r_image)
hook = 'simple'
replacements = []
for image_start in image_starts:
- current_link_starts = [link_start for link_start in link_starts
+ current_link_starts = [link_start for link_start in link_starts
if link_start > image_start]
- current_link_ends = [link_end for link_end in link_ends
+ current_link_ends = [link_end for link_end in link_ends
if link_end > image_start]
end = image_start
if current_link_ends: end = current_link_ends[0]
-
+
while current_link_starts and current_link_ends:
start = current_link_starts.pop(0)
end = current_link_ends.pop(0)
if end <= start and end > image_start:
# Found the end of the image
break
-
+
# Check whether this image is the first one on the line
if image_start == 0:
prev = ''
@@ -262,38 +262,38 @@
end += 1
else:
break
-
+
# Add the replacement to the todo list. Doing the
# replacement right know would alter the indices.
replacements.append((new_text[image_start:end],
- re.sub(r_simple, simple_replacer,
+ re.sub(r_simple, simple_replacer,
new_text[image_start:end])))
-
+
# Perform the replacements
for old, new in replacements:
if old: new_text = new_text.replace(old, new)
-
+
# Remove the image from galleries
hook = 'gallery'
- r_galleries = ur'(?s)(\<%s\>)(.*?)(\<\/%s\>)' % (create_regex_i('gallery'),
+ r_galleries = ur'(?s)(\<%s\>)(.*?)(\<\/%s\>)' % (create_regex_i('gallery'),
create_regex_i('gallery'))
r_gallery = ur'(?m)^((?:%s)?)%s(\s*(?:\|.*?)?\s*$)' % (r_namespace, r_image)
def gallery_replacer(match):
- return ur'%s%s%s' % (match.group(1), re.sub(r_gallery,
+ return ur'%s%s%s' % (match.group(1), re.sub(r_gallery,
simple_replacer, match.group(2)), match.group(3))
new_text = re.sub(r_galleries, gallery_replacer, new_text)
-
+
if text == new_text or self.CommonsDelinker.config.get('force_complex', False):
# All previous steps did not work, so the image is
# likely embedded in a complicated template.
hook = 'complex'
r_templates = ur'(?s)(\{\{.*?\}\})'
r_complicated = u'(?s)(?<=[|{=])[\s\u200E\uFEFF\u200B\u200C]*((?:%s)?)%s[\u200E\uFEFF\u200B\u200C]*' % (r_namespace, r_image)
-
+
def template_replacer(match):
return re.sub(r_complicated, simple_replacer, match.group(1))
new_text = re.sub(r_templates, template_replacer, text)
-
+
if text != new_text:
# Save to the wiki
# Code for checking user page existence has been moved
@@ -304,7 +304,7 @@
if False is self.CommonsDelinker.exec_hook('before_save',
(page, text, new_text, m_summary)):
return 'skipped'
-
+
is_retry = False
while True:
try:
@@ -330,18 +330,18 @@
else:
return 'skipped'
return 'skipped'
-
-
-
+
+
+
def do(self, args):
try:
self.delink_image(*args)
except:
output(u'An exception occured in %s' % self, False)
traceback.print_exc(file = sys.stderr)
-
+
def get_summary(self, site, image, admin, reason, replacement):
- """ Get the summary template and substitute the
+ """ Get the summary template and substitute the
correct values."""
# FIXME: Hardcode is EVIL, but now only the global bot uses this
if (site.lang != 'commons' and self.CommonsDelinker.config['global']):
@@ -350,7 +350,7 @@
tlp = self.CommonsDelinker.SummaryCache.get(site, 'replace-I18n')
else:
tlp = self.CommonsDelinker.SummaryCache.get(site, 'summary-I18n')
-
+
tlp = tlp.replace('$1', image)
if replacement:
tlp = tlp.replace('$2', replacement)
@@ -359,23 +359,23 @@
else:
tlp = tlp.replace('$2', unicode(admin))
tlp = tlp.replace('$3', unicode(reason))
-
+
return tlp
-
+
class SummaryCache(object):
""" Object to thread-safe cache summary templates. """
def __init__(self, CommonsDelinker):
self.summaries = {}
self.lock = threading.Lock()
self.CommonsDelinker = CommonsDelinker
-
+
def get(self, site, type, key = None, default = None):
- # This can probably also provide something for
- # localised settings, but then it first needs to
+ # This can probably also provide something for
+ # localised settings, but then it first needs to
# check whether the page is sysop only.
if not key:
key = str(site)
-
+
self.lock.acquire()
try:
if type not in self.summaries:
@@ -385,9 +385,9 @@
self.CommonsDelinker.config['summary_cache']:
# Return cached result
return self.summaries[type][key][0]
-
+
output(u'%s Fetching new summary for %s' % (self, site))
-
+
# FIXME: evil
if self.CommonsDelinker.config['global']:
self.check_user_page(site)
@@ -402,25 +402,25 @@
pass
finally:
self.lock.release()
-
+
# No i18n available, but it may be available in the wikipedia
# of that language. Only do so for wiktionary, wikibooks,
# wikiquote, wikisource, wikinews, wikiversity
# This will cause the bot to function even on special wikis
# like mediawiki.org and meta and species.
output(u'%s Using default summary for %s' % (self, site))
-
+
if default: return default
-
+
if site.family.name != 'wikipedia' and self.CommonsDelinker.config['global']:
- if site.family.name in ('wiktionary', 'wikibooks', 'wikiquote',
+ if site.family.name in ('wiktionary', 'wikibooks', 'wikiquote',
'wikisource', 'wikinews', 'wikiversity'):
if site.lang in config.usernames['wikipedia']:
- newsite = self.CommonsDelinker.get_site(site.lang,
+ newsite = self.CommonsDelinker.get_site(site.lang,
wikipedia.Family('wikipedia'))
return self.get(newsite, type, key = key)
return self.CommonsDelinker.config['default_settings'].get(type, '')
-
+
def check_user_page(self, site):
"Check whether a userpage exists. Only used for CommonsDelinker."
try:
@@ -435,24 +435,24 @@
ftxt = f.read()
f.close()
if not '#' + str(site) in ftxt:
- username = config.usernames[site.family.name][site.lang]
-
+ username = config.usernames[site.family.name][site.lang]
+
userpage = wikipedia.Page(site, 'User:' + username)
- # Removed check for page existence. If it is not in our
+ # Removed check for page existence. If it is not in our
# database we can safely assume that we have no user page
# there. In case there is, we will just overwrite it once.
- # It causes no real problems, but it saves one call to the
+ # It causes no real problems, but it saves one call to the
# servers.
# TODO: Config setting?
userpage.put('#REDIRECT [[m:User:CommonsDelinker]]', '')
-
+
f = open(filename, 'a')
f.write('#' + str(site))
f.close()
except wikipedia.LockedPage:
# User page is protected, continue anyway
- pass
-
+ pass
+
class CheckUsage(threadpool.Thread):
timeout = 120
def __init__(self, pool, CommonsDelinker):
@@ -460,14 +460,14 @@
self.CommonsDelinker = CommonsDelinker
# Not really thread safe, but we should only do read operations...
self.site = CommonsDelinker.site
-
+
def run(self):
try:
self.connect()
except:
return self.exit()
threadpool.Thread.run(self)
-
+
def connect(self):
output(u'%s Connecting to databases' % self)
config = self.CommonsDelinker.config
@@ -475,22 +475,22 @@
# Note: global use requires MySQL
self.CheckUsage = checkusage.CheckUsage(limit = sys.maxint,
mysql_kwargs = config['sql_config'],
- use_autoconn = True,
+ use_autoconn = True,
http_callback = wait_callback,
mysql_callback = wait_callback,
mysql_host_suffix = '-fast')
else:
self.CheckUsage = checkusage.CheckUsage(sys.maxint,
http_callback = wait_callback, no_db = True)
-
-
+
+
def check_usage(self, image, timestamp, admin, reason, replacement):
""" Check whether this image needs to be delinked. """
-
+
# Check whether the image still is deleted on Commons.
# BUG: This also returns true for images with a page, but
# without the image itself. Can be fixed by querying query.php
- # instead of api.php. Also, should this be made an exists()
+ # instead of api.php. Also, should this be made an exists()
# method of checkusage.CheckUsage?
if self.site.shared_image_repository() != (None, None):
shared_image_repository = self.CommonsDelinker.get_site(*self.site.shared_image_repository())
@@ -505,12 +505,12 @@
not bool(replacement):
output(u'%s %s exists again!' % (self, image))
return
-
-
+
+
if self.CommonsDelinker.config['global']:
usage = self.CheckUsage.get_usage(image)
usage_domains = {}
-
+
count = 0
# Sort usage per domain
for (lang, family), (page_namespace, page_title, title) in usage:
@@ -520,21 +520,21 @@
count += 1
else:
#FIX!
- usage_domains = {(self.site.lang, self.site.family.name):
- list(self.CheckUsage.get_usage_live(self.site,
+ usage_domains = {(self.site.lang, self.site.family.name):
+ list(self.CheckUsage.get_usage_live(self.site,
image))}
count = len(usage_domains[(self.site.lang, self.site.family.name)])
-
+
output(u'%s %s used on %s pages' % (self, image, count))
-
+
if count:
# Pass the usage to the Delinker pool along with other arguments
- self.CommonsDelinker.Delinkers.append((image, usage_domains,
+ self.CommonsDelinker.Delinkers.append((image, usage_domains,
timestamp, admin, reason, replacement))
elif replacement:
# Record replacement done
self.CommonsDelinker.Loggers.append((timestamp, image, replacement))
-
+
def do(self, args):
try:
self.check_usage(*args)
@@ -544,12 +544,12 @@
traceback.print_exc(file = sys.stderr)
self.exit()
self.CommonsDelinker.thread_died()
-
+
def starve(self):
self.pool.jobLock.acquire()
try:
if self.pool[id(self)].isSet(): return False
-
+
output(u'%s Starving' % self)
self.CheckUsage.close()
del self.pool[id(self)]
@@ -557,66 +557,66 @@
return True
finally:
self.pool.jobLock.release()
-
+
class Logger(threadpool.Thread):
timeout = 360
-
+
def __init__(self, pool, CommonsDelinker):
threadpool.Thread.__init__(self, pool)
self.CommonsDelinker = CommonsDelinker
self.sql_layout = self.CommonsDelinker.config.get('sql_layout', 'new')
self.enabled = self.CommonsDelinker.config.get('enable_logging', True)
-
+
def run(self):
self.connect()
threadpool.Thread.run(self)
-
+
def connect(self):
output(u'%s Connecting to log database' % self)
self.database = connect_database()
self.cursor = self.database.cursor()
-
-
+
+
def log_result_legacy(self, timestamp, image, domain, namespace, page, status = "ok", newimage = None):
# TODO: Make sqlite3 ready
-
+
# The original delinker code cached log results,
# in order to limit the number of connections.
# However, since we are now using persistent
# connections, we can safely insert the result
# on the fly.
output(u'%s Logging %s for %s on %s' % (self, repr(status), image, page))
-
+
# There is no need to escape each parameter if
- # a parametrized call is made.
+ # a parametrized call is made.
self.cursor.execute("""INSERT INTO %s (timestamp, img, wiki, page_title,
namespace, status, newimg) VALUES
(%%s, %%s, %%s, %%s, %%s, %%s, %%s)""" % self.CommonsDelinker.config['log_table'],
(timestamp, image, domain, page, namespace, status, newimage))
self.database.commit()
-
- def log_result_new(self, timestamp, image, site_lang, site_family,
+
+ def log_result_new(self, timestamp, image, site_lang, site_family,
page_namespace, page_title, status = 'ok', new_image = None):
-
+
output(u'%s Logging %s for %s on %s' % (self, repr(status), image, page_title))
self.cursor.execute("""INSERT INTO %s (timestamp, image, site_lang, site_family,
page_namespace, page_title, status, new_image) VALUES
(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)""" % self.CommonsDelinker.config['log_table'],
- (timestamp, image, site_lang, site_family, page_namespace, page_title,
+ (timestamp, image, site_lang, site_family, page_namespace, page_title,
status, new_image))
self.database.commit()
-
+
def log_replacement(self, timestamp, old_image, new_image):
# TODO: Same as above
-
+
output(u'Replacing %s by %s done' % (old_image, new_image))
- self.cursor.execute("""UPDATE %s SET status = 'done' WHERE
- timestamp = %%s AND old_image = %%s AND
+ self.cursor.execute("""UPDATE %s SET status = 'done' WHERE
+ timestamp = %%s AND old_image = %%s AND
new_image = %%s""" % self.CommonsDelinker.config['replacer_table'],
(timestamp, old_image, new_image))
self.database.commit()
-
+
def do(self, args):
if not self.enabled: return
try:
@@ -633,12 +633,12 @@
traceback.print_exc(file = sys.stderr)
self.exit()
self.CommonsDelinker.thread_died()
-
+
def starve(self):
self.pool.jobLock.acquire()
try:
if self.pool[id(self)].isSet(): return False
-
+
output(u'%s Starving' % self)
self.database.close()
del self.pool[id(self)]
@@ -653,7 +653,7 @@
self.config = config.CommonsDelinker
self.site = wikipedia.getSite()
self.site.forceLogin()
-
+
# Initialize workers
self.CheckUsages = threadpool.ThreadPool(CheckUsage, self.config['checkusage_instances'], self)
self.Delinkers = threadpool.ThreadPool(Delinker, self.config['delinker_instances'], self)
@@ -661,34 +661,34 @@
self.Loggers = threadpool.ThreadPool(Logger, self.config['logger_instances'], self)
else:
self.Loggers = threadpool.ThreadPool(Logger, 1, self)
-
+
self.http = checkusage.HTTP(self.site.hostname())
-
+
self.edit_list = []
self.editLock = threading.Lock()
-
+
self.sites = {}
self.siteLock = threading.Lock()
-
+
self.SummaryCache = SummaryCache(self)
-
+
if self.config.get('enable_replacer', False):
self.connect_mysql()
-
+
if self.config.get('no_sysop', False):
# Don't edit as sysop
if hasattr(config, 'sysopnames'):
config.sysopnames = dict([(fam, {}) for fam in config.sysopnames.keys()])
-
+
self.last_check = time.time()
-
+
#if 'bot' in self.site.userGroups:
# self.log_limit = '5000'
#else:
# self.log_limit = '500'
self.log_limit = '500'
self.init_plugins()
-
+
def init_plugins(self, do_reload = False):
import plugins
self.hooks = {}
@@ -705,7 +705,7 @@
self.hooks[plugin.hook].append(plugin)
output(u"%s Loaded plugin %s for hook '%s'" % \
(self, plugin, plugin.hook))
-
+
def exec_hook(self, name, args):
# TODO: Threadsafety!
if name in self.hooks:
@@ -729,16 +729,16 @@
self.hooks[name].remove(plugin)
finally:
self.siteLock.release()
-
+
def reload_plugins(signalnum, stack):
pass
-
+
def connect_mysql(self):
self.database = connect_database()
self.cursor = self.database.cursor()
-
+
def set_edit(self, domain, page):
- """ Make sure the bot does not create edit
+ """ Make sure the bot does not create edit
conflicts with itself."""
self.editLock.acquire()
being_editted = (domain, page) in self.edit_list
@@ -751,9 +751,9 @@
self.editLock.acquire()
self.edit_list.remove((domain, page))
self.editLock.release()
-
+
def get_site(self, code, fam):
- # Threadsafe replacement of wikipedia.getSite
+ # Threadsafe replacement of wikipedia.getSite
key = '%s:%s' % (code, fam)
self.siteLock.acquire()
try:
@@ -779,34 +779,34 @@
self.sites[key][self.sites[key].index((site, True))] = (site, False)
finally:
self.siteLock.release()
-
-
+
+
def read_deletion_log(self):
ts_format = '%Y-%m-%dT%H:%M:%SZ'
wait = self.config['delink_wait']
exclusion = self.config['exclude_string']
-
+
ts_from = self.last_check
# Truncate -> int()
ts_end = int(time.time())
self.last_check = ts_end
-
+
# Format as a MediaWiki timestamp and subtract a
# certain wait period.
ts_from_s = time.strftime(ts_format, time.gmtime(ts_from - wait + 1))
ts_end_s = time.strftime(ts_format, time.gmtime(ts_end - wait))
-
+
try:
# Assume fewer than 500 deletions have been made between
- # this and the previous check of the log. If this is not
+ # this and the previous check of the log. If this is not
# the case, timeout should be set lower.
result = self.http.query_api(self.site.hostname(), self.site.apipath(),
- action = 'query', list = 'logevents', letype = 'delete',
- lelimit = self.log_limit, lestart = ts_from_s, leend = ts_end_s,
+ action = 'query', list = 'logevents', letype = 'delete',
+ lelimit = self.log_limit, lestart = ts_from_s, leend = ts_end_s,
ledir = 'newer')
logevents = result['query']['logevents']
except Exception, e:
- if type(e) in (SystemError, KeyboardInterrupt): raise
+ if type(e) in (SystemError, KeyboardInterrupt): raise
# Something happened, but since it is a network error,
# it will not be critical. In order to prevent data loss
# the last_check timestamp has to be set correctly.
@@ -814,7 +814,7 @@
output('Warning! Unable to read deletion logs', False)
output('%s: %s' % (e.__class__.__name__, str(e)), False)
return time.sleep(self.config['timeout'])
-
+
for logevent in logevents:
if logevent['ns'] == 6 and logevent['action'] == 'delete':
if exclusion not in logevent.get('comment', ''):
@@ -823,14 +823,14 @@
timestamp = timestamp.replace(':', '')
timestamp = timestamp.replace('T', '')
timestamp = timestamp.replace('Z', '')
-
+
output(u'Deleted image: %s' % logevent['title'])
self.CheckUsages.append((checkusage.strip_ns(logevent['title']),
timestamp, logevent['user'], logevent.get('comment', ''),
None))
else:
output(u'Skipping deleted image: %s' % logevent['title'])
-
+
def read_replacement_log(self):
# TODO: Make sqlite3 ready
# TODO: Single process replacer
@@ -845,22 +845,22 @@
self.CheckUsages.append((old_image, timestamp, user, comment, new_image))
output(u'Replacing %s by %s' % (old_image, new_image))
self.cursor.execute(update, ('ok', id))
-
+
self.database.commit()
-
+
def start(self):
# Gracefully exit all threads on SIG_INT or SIG_TERM
threadpool.catch_signals()
-
+
# Start threads
self.Loggers.start()
self.Delinkers.start()
self.CheckUsages.start()
-
+
# Give threads some time to initialize
time.sleep(self.config['timeout'])
output(u'All workers started')
-
+
# Main loop
while True:
if self.config.get('enable_delinker', True):
@@ -871,17 +871,17 @@
self.read_deletion_log()
if self.config.get('enable_replacer', False):
self.read_replacement_log()
-
+
time.sleep(self.config['timeout'])
-
+
def thread_died(self):
# Obsolete
return
-
+
@staticmethod
def output(*args):
return output(*args)
-
+
def output(message, toStdout = True):
message = time.strftime('[%Y-%m-%d %H:%M:%S] ') + message
wikipedia.output(message, toStdout = toStdout)
@@ -895,16 +895,16 @@
output(u'Running ' + __version__)
CD = CommonsDelinker()
output(u'This bot runs from: ' + str(CD.site))
-
+
re._MAXCACHE = 4
-
+
args = wikipedia.handleArgs()
if '-since' in args:
# NOTE: Untested
ts_format = '%Y-%m-%d %H:%M:%S'
try:
since = time.strptime(
- args[args.index('-since') + 1],
+ args[args.index('-since') + 1],
ts_format)
except ValueError:
if args[args.index('-since') + 1][0] == '[' and \
@@ -917,7 +917,7 @@
output(u'Reading deletion log since [%s]' %\
time.strftime(ts_format, since))
CD.last_check = time.mktime(since)
-
+
try:
try:
CD.start()
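The delinking in replace_image() above hinges on two small regex helpers: create_regex() folds the case of only the first character (MediaWiki titles are case-sensitive except for their first letter), and create_regex_i() folds every character for namespace names. A standalone sketch of the first helper under Python 2 (matching the ur'' literals in the codebase), with a made-up file name; note how the later .replace(r'\_', '[ _]') makes spaces and underscores interchangeable:

    # -*- coding: utf-8 -*-
    import re

    def create_regex(s):
        # Fold case on the first character only.
        first, other = re.escape(s[0]), re.escape(s[1:])
        return ur'(?:[%s%s]%s)' % (first.upper(), first.lower(), other)

    # Python 2's re.escape() turns '_' into '\_', which the replace
    # then widens to the character class '[ _]'.
    r_image = create_regex(u'Mona_Lisa.jpg').replace(r'\_', '[ _]')

    print re.search(r_image, u'[[Image:mona Lisa.jpg|thumb]]') is not None  # True
    print re.search(r_image, u'[[Image:MONA_LISA.JPG]]') is not None        # False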
Modified: trunk/pywikipedia/commonsdelinker/image_replacer.py
===================================================================
--- trunk/pywikipedia/commonsdelinker/image_replacer.py 2011-03-13 12:19:20 UTC (rev 9052)
+++ trunk/pywikipedia/commonsdelinker/image_replacer.py 2011-03-13 12:24:49 UTC (rev 9053)
@@ -4,7 +4,7 @@
Please refer to delinker.txt for full documentation.
"""
#
-#
+#
# (C) Bryan Tong Minh, 2007
#
# Distributed under the terms of the MIT license.
@@ -43,44 +43,44 @@
self.config.update(getattr(config, 'Replacer', ()))
self.template = re.compile(r'\{\{%s\|([^|]*?)\|([^|]*?)(?:(?:\|reason\=(.*?))?)\}\}' % \
self.config['replace_template'])
- self.disallowed_replacements = [(re.compile(i[0], re.I), re.compile(i[1], re.I))
+ self.disallowed_replacements = [(re.compile(i[0], re.I), re.compile(i[1], re.I))
for i in self.config.get('disallowed_replacements', ())]
-
+
self.site = wikipedia.getSite(persistent_http = True)
self.site.forceLogin()
-
+
self.database = connect_database()
self.cursor = self.database.cursor()
-
+
self.first_revision = 0
if self.config.get('replacer_report_replacements', False):
self.reporters = threadpool.ThreadPool(Reporter, 1, self.site, self.config)
self.reporters.start()
-
-
+
+
def read_replace_log(self):
""" The actual worker method """
-
+
# FIXME: Make sqlite3 compatible
- insert = """INSERT INTO %s (timestamp, old_image, new_image,
+ insert = """INSERT INTO %s (timestamp, old_image, new_image,
status, user, comment) VALUES (%%s, %%s, %%s,
'pending', %%s, %%s)""" % self.config['replacer_table']
-
+
page = wikipedia.Page(self.site, self.config['command_page'])
-
+
# Get last revision date
- if self.cursor.execute("""SELECT timestamp FROM %s
+ if self.cursor.execute("""SELECT timestamp FROM %s
ORDER BY timestamp DESC LIMIT 1""" % \
self.config['replacer_table']):
since = mw_timestamp(self.cursor.fetchone()[0])
else:
since = None
-
+
if self.config.get('clean_list', False):
username = config.sysopnames[self.site.family.name][self.site.lang]
else:
username = None
-
+
try:
# Fetch revision history
revisions = self.get_history(page.title(), since, username)
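The command template compiled in __init__ above captures the old image, the
new image and an optional reason. A standalone sketch of how it matches;
'Universal replace' is an illustrative name, the real one comes from
config['replace_template']:

    import re

    # Same pattern as in __init__, with an assumed template name.
    template = re.compile(
        r'\{\{%s\|([^|]*?)\|([^|]*?)(?:(?:\|reason\=(.*?))?)\}\}'
        % 'Universal replace')
    m = template.search(u'{{Universal replace|Old.jpg|New.jpg|reason=dupe}}')
    print m.groups()   # (u'Old.jpg', u'New.jpg', u'dupe'); reason may be None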
@@ -95,18 +95,18 @@
#self.site.conn.close()
#self.site.conn.connect()
return time.sleep(self.config['timeout'])
-
+
# We're being killed
if '{{stop}}' in text.lower():
output(u'Found {{stop}} on command page. Not replacing anything.')
return time.sleep(self.config['timeout'])
-
+
# Sort oldest first
revisions.sort(key = lambda rev: rev['timestamp'])
-
+
# Find all commands
replacements = self.template.finditer(text)
-
+
remove_from_list = []
count = 0
for replacement in replacements:
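The kill switch checked earlier in this hunk is a plain substring test on the
lower-cased page text, so {{STOP}} and {{Stop}} work as well:

    text = u'... commands ...\n{{STOP}}'
    if '{{stop}}' in text.lower():
        print u'Found {{stop}} on command page. Not replacing anything.'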
@@ -122,10 +122,10 @@
remove_from_list.append(replacement.group(0))
output('Replacing %s by %s: %s' % replacement.groups())
count += 1
-
+
# Save all replaces to database
self.database.commit()
-
+
if remove_from_list and self.config.get('clean_list', False):
# Cleanup the command page
while True:
@@ -144,10 +144,10 @@
except wikipedia.EditConflict:
# Try again
text = page.get()
-
+
def get_history(self, title, since, username):
""" Fetch the last 50 revisions using the API """
-
+
address = self.site.api_address()
predata = [
('action', 'query'),
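The hunk cuts the predata list off after its first entries. Given the
docstring ("the last 50 revisions") and the fields callers read
(revision['timestamp'], revision['user'], revision['*']), a plausible
completion is the following; this is an assumption, not the original list:

    import urllib

    predata = [
        ('action', 'query'),
        ('prop', 'revisions'),
        ('titles', 'User:CommonsDelinker/commands'),  # assumed command page
        ('rvlimit', '50'),                            # per the docstring
        ('rvprop', 'timestamp|user|content'),         # fields read by callers
        ('format', 'json'),
    ]
    print urllib.urlencode(predata)   # query string for api.php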
@@ -170,10 +170,10 @@
if 'missing' in page:
raise Exception('Missing page!')
return page.get('revisions', [])
-
+
def examine_revision_history(self, revisions, replacement, username):
""" Find out who is to blame for a replacement """
-
+
for revision in revisions:
if replacement.group(0) in revision['*']:
db_time = db_timestamp(revision['timestamp'])
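Because read_replace_log() sorts revisions oldest first, the scan above
attributes each command to the earliest revision containing it. Stripped to
its core (find_author is an illustrative name):

    def find_author(revisions, command_text):
        # revisions must be sorted oldest first
        for revision in revisions:
            if command_text in revision['*']:
                return revision['user'], revision['timestamp']
        return None   # the caller logs a warning in this case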
@@ -182,52 +182,52 @@
return (db_time, strip_image(replacement.group(1)),
strip_image(replacement.group(2)),
revision['user'], replacement.group(3))
-
+
output('Warning! Could not find out who did %s' % \
repr(replacement.group(0)), False)
return
-
+
def read_finished_replacements(self):
- """ Find out which replacements have been completed and add them to
+ """ Find out which replacements have been completed and add them to
the reporters queue. """
-
+
self.cursor.execute('START TRANSACTION WITH CONSISTENT SNAPSHOT')
self.cursor.execute("""SELECT old_image, new_image, user, comment FROM
%s WHERE status = 'done' AND timestamp >= %i""" % \
(self.config['replacer_table'], self.first_revision))
finished_images = list(self.cursor)
- self.cursor.execute("""UPDATE %s SET status = 'reported'
+ self.cursor.execute("""UPDATE %s SET status = 'reported'
WHERE status = 'done' AND timestamp >= %i""" % \
(self.config['replacer_table'], self.first_revision))
        self.database.commit()  # commit() belongs to the connection, not the cursor
-
+
for old_image, new_image, user, comment in finished_images:
- self.cursor.execute("""SELECT wiki, namespace, page_title
- FROM %s WHERE img = %%s AND status <> 'ok'""" %
+ self.cursor.execute("""SELECT wiki, namespace, page_title
+ FROM %s WHERE img = %%s AND status <> 'ok'""" %
self.config['log_table'], (old_image, ))
not_ok = [(wiki, namespace, page_title.decode('utf-8', 'ignore'))
for wiki, namespace, page_title in self.cursor]
-
+
if not comment: comment = ''
-
+
self.reporters.append((old_image.decode('utf-8', 'ignore'),
- new_image.decode('utf-8', 'ignore'),
- user.decode('utf-8', 'ignore'),
+ new_image.decode('utf-8', 'ignore'),
+ user.decode('utf-8', 'ignore'),
comment.decode('utf-8', 'ignore'), not_ok))
-
-
+
+
def start(self):
while True:
self.read_replace_log()
if self.config.get('replacer_report_replacements', False):
self.read_finished_replacements()
-
+
# Replacer should not loop as often as delinker
time.sleep(self.config['timeout'] * 2)
-
+
def allowed_replacement(self, replacement):
""" Method to prevent World War III """
-
+
for source, target in self.disallowed_replacements:
if source.search(replacement.group(1)) and \
target.search(replacement.group(2)):
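allowed_replacement() walks the configured pattern pairs; the return
statements fall outside this hunk, but it presumably refuses a replacement
when both patterns match. A self-contained sketch with invented patterns:

    import re

    # Invented example pair: never replace one national flag by another.
    disallowed = [(re.compile(r'^Flag of', re.I),
                   re.compile(r'^Flag of', re.I))]

    def allowed(old_image, new_image):
        for source, target in disallowed:
            if source.search(old_image) and target.search(new_image):
                return False
        return True

    print allowed(u'Flag of A.svg', u'Flag of B.svg')   # False -> refused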
@@ -236,14 +236,14 @@
class Reporter(threadpool.Thread):
""" Asynchronous worker to report finished replacements to file pages. """
-
+
def __init__(self, pool, site, config):
self.site = wikipedia.getSite(site.lang, site.family,
site.user, True)
self.config = config
-
+
threadpool.Thread.__init__(self, pool)
-
+
def do(self, args):
try:
self.report(args)
@@ -254,7 +254,7 @@
sys.stderr.flush()
self.exit()
os.kill(0, signal.SIGTERM)
-
+
def report(self, (old_image, new_image, user, comment, not_ok)):
not_ok_items = []
for wiki, namespace, page_title in not_ok:
@@ -265,7 +265,7 @@
namespace_name = namespace_name + u':'
else:
namespace_name = u''
-
+
if unicode(site) == unicode(self.site):
if (namespace, page_title) != (6, old_image):
not_ok_items.append(u'[[:%s%s]]' % \
@@ -273,13 +273,13 @@
else:
not_ok_items.append(u'[[:%s:%s%s]]' % (site_prefix(site),
namespace_name, page_title))
-
+
template = u'{{%s|new_image=%s|user=%s|comment=%s|not_ok=%s}}' % \
(self.config['replacer_report_template'],
- new_image, user, comment,
+ new_image, user, comment,
self.config.get('replacer_report_seperator', u', ').join(not_ok_items))
page = wikipedia.Page(self.site, u'Image:' + old_image)
-
+
try:
text = page.get()
except wikipedia.NoPage:
@@ -289,7 +289,7 @@
output(u'Warning! %s is a redirect; not reporting replacement!' % old_image)
return
try:
- page.put(u'%s\n%s' % (template, text),
+ page.put(u'%s\n%s' % (template, text),
comment = u'This image has been replaced by ' + new_image)
except wikipedia.PageNotSaved, e:
output(u'Warning! Unable to report replacement to %s.' % old_image, False)
@@ -301,11 +301,11 @@
else:
output(u'Reporting replacement of %s by %s.' % \
(old_image, new_image))
-
+
def main():
global R
-
+
import sys, traceback
wikipedia.handleArgs()
output(u'Running ' + __version__)
@@ -327,5 +327,5 @@
except:
pass
wikipedia.stopme()
-
+
if __name__ == '__main__': main()
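mw_timestamp() and db_timestamp(), used throughout image_replacer.py, are not
part of this diff. Presumably they convert between MediaWiki's ISO 8601
timestamps and the 14-digit form stored in the database; a hypothetical
sketch, the real helpers may differ:

    import time

    def db_timestamp(mw_ts):
        # hypothetical: '2011-03-13T12:24:49Z' -> '20110313122449'
        return time.strftime('%Y%m%d%H%M%S',
                             time.strptime(mw_ts, '%Y-%m-%dT%H:%M:%SZ'))

    def mw_timestamp(db_ts):
        # hypothetical: '20110313122449' -> '2011-03-13T12:24:49Z'
        return time.strftime('%Y-%m-%dT%H:%M:%SZ',
                             time.strptime(str(db_ts), '%Y%m%d%H%M%S'))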
Modified: trunk/pywikipedia/commonsdelinker/threadpool.py
===================================================================
--- trunk/pywikipedia/commonsdelinker/threadpool.py 2011-03-13 12:19:20 UTC (rev 9052)
+++ trunk/pywikipedia/commonsdelinker/threadpool.py 2011-03-13 12:24:49 UTC (rev 9053)
@@ -1,19 +1,19 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-This module implements a threadpool which gives scripts that need to
+This module implements a threadpool which gives scripts that need to
perform concurrent jobs an efficient and thread-safe way to do so.
-
-The two classes available are ThreadPool and Thread. ThreadPool is the
+
+The two classes available are ThreadPool and Thread. ThreadPool is the
controller class; it holds a collection of Thread objects, and Thread itself
must be subclassed.
Any thread can add a job to the ThreadPool by calling its append() method.
The pool adds the job to the jobqueue and wakes a sleeping thread, if one is
available. If no thread is free at that moment, the job will be handled by
the first thread that becomes free.
-
+
The Thread class must be subclassed and passed to the ThreadPool's constructor.
-The subclass should implement a do(args) method, which will receive as its
+The subclass should implement a do(args) method, which will receive as its
argument the job. Please note that passing mutable objects through the
jobqueue may lead to thread-safety problems!
"""
@@ -24,9 +24,9 @@
#
__version__ = '$Id$'
#
-
+
import sys, threading, os
-
+
class ThreadPool(dict):
pools = []
def __init__(self, worker, max_threads, *args, **kwargs):
@@ -36,7 +36,7 @@
self.jobQueue = []
self.worker = worker
self.threads = []
-
+
self.max_threads = max_threads
self.args = args
self.kwargs = kwargs
@@ -50,7 +50,7 @@
self.jobQueue.append(job)
# The amount of workers needed to be unlocked
unlock_workers = len(self.jobQueue)
-
+
for event in self.itervalues():
if not event.isSet():
event.set()
@@ -62,7 +62,7 @@
if counter == 0 and len(self.threads) < self.max_threads:
self.add_thread()
self.start()
-
+
def add_thread(self):
self.jobLock.acquire()
try:
@@ -71,7 +71,7 @@
self[id(thread)] = threading.Event()
finally:
self.jobLock.release()
-
+
def start(self):
for thread in self.threads:
if not thread.isAlive():
@@ -92,23 +92,23 @@
threading.Thread.__init__(self)
self.pool = pool
self.quit = False
-
+
def run(self):
while True:
            # No try..finally: lock.release() here:
            # the lock could otherwise be released twice. If
-            # this thread is waiting on an event, a race
+            # this thread is waiting on an event, a race
            # condition could release a lock that has
-            # meanwhile been acquired by another thread.
+            # meanwhile been acquired by another thread.
self.pool.jobLock.acquire()
-
+
if self.quit and not self.pool.jobQueue:
# Only return once the jobQueue is empty.
self.pool.jobLock.release()
return
-
+
if not self.pool.jobQueue:
- # In case no job is available, wait for the pool
+ # In case no job is available, wait for the pool
# to call and do not start a busy while loop.
event = self.pool[id(self)]
self.pool.jobLock.release()
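The run() loop implements by hand what the stdlib Queue module provides: a
get() that blocks until a job arrives. For comparison only, not a drop-in
replacement for the Event-per-thread protocol above:

    import Queue, threading

    jobs = Queue.Queue()

    def worker():
        while True:
            job = jobs.get()      # blocks until a job is available
            if job is None:       # sentinel: time to exit
                return
            print 'Working on', job

    t = threading.Thread(target=worker)
    t.start()
    jobs.put('a job')
    jobs.put(None)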
@@ -119,9 +119,9 @@
continue
job = self.pool.jobQueue.pop(0)
self.pool.jobLock.release()
-
+
self.do(job)
-
+
def exit(self):
self.pool.jobLock.acquire()
try:
@@ -133,7 +133,7 @@
self.pool.threads.remove(self)
finally:
self.pool.jobLock.release()
-
+
def starve(self):
pass
@@ -146,12 +146,12 @@
import signal
for pool in ThreadPool.pools:
pool.exit()
-
+
if signalnum == signal.SIGINT:
raise KeyboardInterrupt
if signalnum == signal.SIGTERM:
raise SystemExit
-
+
def terminate():
# Maybe not a good idea, will also kill child processes
import signal
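The handler body shown above (every pool asked to exit, then the usual
exception re-raised) belongs to catch_signals(), which delinker.py calls at
startup. Registering it would look roughly like this; the handler name and
the signal.signal() calls are inferred, not shown in the diff:

    import signal

    def exit_handler(signalnum, frame):   # name is an assumption
        for pool in ThreadPool.pools:
            pool.exit()
        if signalnum == signal.SIGINT:
            raise KeyboardInterrupt
        if signalnum == signal.SIGTERM:
            raise SystemExit

    signal.signal(signal.SIGINT, exit_handler)
    signal.signal(signal.SIGTERM, exit_handler)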
@@ -160,24 +160,24 @@
if __name__ == '__main__':
import time
# Test cases
-
+
class Worker(Thread):
def do(self, args):
print 'Working', self
time.sleep(10)
print 'Done', self
-
+
    pool = ThreadPool(Worker, 5)  # max_threads is required by __init__
print 'Spawning 5 threads'
[pool.add_thread() for i in xrange(5)]
pool.start()
-
+
print 'Doing 25 jobs'
for i in xrange(25):
print 'Job', i
pool.append(i)
time.sleep(i % 6)
-
+
for thread in pool.threads:
thread.exit()
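Run directly (python threadpool.py), the self-test spawns five workers and
feeds them 25 jobs; each do() sleeps ten seconds, so the queue drains slowly,
and the final loop asks every thread to exit once the job queue is empty.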