http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9060
Revision: 9060
Author: xqt
Date: 2011-03-13 14:10:11 +0000 (Sun, 13 Mar 2011)
Log Message:
-----------
eol-style
Modified Paths:
--------------
branches/rewrite/scripts/redirect.py
Property Changed:
----------------
branches/rewrite/scripts/redirect.py
Modified: branches/rewrite/scripts/redirect.py
===================================================================
--- branches/rewrite/scripts/redirect.py 2011-03-13 14:06:08 UTC (rev 9059)
+++ branches/rewrite/scripts/redirect.py 2011-03-13 14:10:11 UTC (rev 9060)
@@ -1,708 +1,708 @@
-# -*- coding: utf-8 -*-
-"""
-Script to resolve double redirects, and to delete broken redirects. Requires
-access to MediaWiki's maintenance pages or to an XML dump file. Delete
-function requires adminship.
-
-Syntax:
-
- python redirect.py action [-arguments ...]
-
-where action can be one of these:
-
-double Fix redirects which point to other redirects
-broken Delete redirects where targets don\'t exist. Requires adminship.
-both Both of the above. Permitted only with -api. Implies -api.
-
-and arguments can be:
-
--moves Use the page move log to find double-redirect candidates. Only
- works with action "double".
-
--namespace:n Namespace to process. Can be given multiple times, for several
- namespaces. If omitted, only the main (article) namespace is
- treated.
-
--offset:n With -moves, the number of hours ago to start scanning moved
- pages. Otherwise, ignored.
-
--start:title The starting page title in each namespace. Page need not exist.
-
--until:title The possible last page title in each namespace. Page need not
- exist.
-
--total:n The maximum count of redirects to work upon. If omitted, there
- is no limit.
-
--always Don't prompt you for each replacement.
-
-"""
-
-# XML not yet implemented: deleted help text follows
-##-xml Retrieve information from a local XML dump
-## (http://download.wikimedia.org). Argument can also be given as
-## "-xml:filename.xml". Cannot be used with -api or -moves.
-## If neither of -xml -api -moves is given, info will be loaded
-## from a special page of the live wiki.
-
-#
-# (C) Daniel Herding, 2004.
-# (C) Purodha Blissenbach, 2009.
-# (C) xqt, 2009-2010
-# (C) Pywikipedia bot team, 2004-2010
-#
-# Distributed under the terms of the MIT license.
-#
-__version__='$Id: redirect.py 7789 2009-12-17 19:20:12Z xqt $'
-#
-import re, sys, datetime
-import pywikibot
-from pywikibot import config, i18n
-# import xmlreader
-
-
-class RedirectGenerator:
- def __init__(self, xmlFilename=None, namespaces=[], offset=-1,
- use_move_log=False, use_api=False, start=None, until=None,
- number=None):
- self.site = pywikibot.getSite()
-## self.xmlFilename = xmlFilename
- self.namespaces = namespaces
- if use_api and self.namespaces == []:
- self.namespaces = [ 0 ]
- self.offset = offset
- self.use_move_log = use_move_log
- self.use_api = use_api
- self.api_start = start
- self.api_until = until
- self.api_number = number
-
-# note: rewrite branch does not yet support XML dumps, so this is commented out
-# until that support is added
-## def get_redirects_from_dump(self, alsoGetPageTitles=False):
-## '''
-## Load a local XML dump file, look at all pages which have the
-## redirect flag set, and find out where they're pointing at. Return
-## a dictionary where the redirect names are the keys and the redirect
-## targets are the values.
-## '''
-## xmlFilename = self.xmlFilename
-## redict = {}
-## # open xml dump and read page titles out of it
-## dump = xmlreader.XmlDump(xmlFilename)
-## redirR = self.site.redirectRegex()
-## readPagesCount = 0
-## if alsoGetPageTitles:
-## pageTitles = set()
-## for entry in dump.parse():
-## readPagesCount += 1
-## # always print status message after 10000 pages
-## if readPagesCount % 10000 == 0:
-## pywikibot.output(u'%i pages read...' % readPagesCount)
-## if len(self.namespaces) > 0:
-## if pywikibot.Page(self.site, entry.title).namespace() \
-## not in self.namespaces:
-## continue
-## if alsoGetPageTitles:
-## pageTitles.add(entry.title.replace(' ', '_'))
-##
-## m = redirR.match(entry.text)
-## if m:
-## target = m.group(1)
-## # There might be redirects to another wiki. Ignore these.
-## for code in self.site.family.langs.keys():
-## if target.startswith('%s:' % code) \
-## or target.startswith(':%s:' % code):
-## if code == self.site.language():
-## # link to our wiki, but with the lang prefix
-## target = target[(len(code)+1):]
-## if target.startswith(':'):
-## target = target[1:]
-## else:
-## pywikibot.output(
-## u'NOTE: Ignoring %s which is a redirect to %s:'
-## % (entry.title, code))
-## target = None
-## break
-## # if the redirect does not link to another wiki
-## if target:
-## source = entry.title.replace(' ', '_')
-## target = target.replace(' ', '_')
-## # remove leading and trailing whitespace
-## target = target.strip('_')
-## # capitalize the first letter
-## if not pywikibot.getSite().nocapitalize:
-## source = source[:1].upper() + source[1:]
-## target = target[:1].upper() + target[1:]
-## if '#' in target:
-## target = target[:target.index('#')].rstrip("_")
-## if '|' in target:
-## pywikibot.output(
-## u'HINT: %s is a redirect with a pipelink.'
-## % entry.title)
-## target = target[:target.index('|')].rstrip("_")
-## if target: # in case preceding steps left nothing
-## redict[source] = target
-## if alsoGetPageTitles:
-## return redict, pageTitles
-## else:
-## return redict
-##
- def get_redirect_pages_via_api(self):
- """Return generator that yields
- Pages that are redirects.
-
- """
- for ns in self.namespaces:
- done = False
- gen = self.site.allpages(start=self.api_start,
- namespace=ns,
- filterredir=True)
- if self.api_number:
- gen.set_maximum_items(self.api_number)
- for p in gen:
- done = self.api_until \
- and p.title(withNamespace=False) >= self.api_until
- if done:
- return
- yield p
-
- def _next_redirect_group(self):
- """
- Return a generator that retrieves pageids from the API 500 at a time
- and yields them as a list
- """
- apiQ = []
- for page in self.get_redirect_pages_via_api():
- apiQ.append(str(page._pageid))
- if len(apiQ) >= 500:
- yield apiQ
- apiQ = []
- if apiQ:
- yield apiQ
-
-    def get_redirects_via_api(self, maxlen=8):
-        """
-        Return a generator that yields tuples of data about redirect Pages:
-            0 - page title of a redirect page
-            1 - type of redirect:
-                0 - broken redirect, target page title missing
-                1 - normal redirect, target page exists and is not a
-                    redirect
-                2..maxlen - start of a redirect chain of that many redirects
-                            (currently, the API seems not to return sufficient
-                            data to make these return values possible, but
-                            that may change)
-                maxlen+1 - start of an even longer chain, or a loop
-                           (currently, the API seems not to return sufficient
-                           data to allow these return values, but that may
-                           change)
-                None - start of a redirect chain of unknown length, or loop
-            2 - target page title of the redirect, or chain (may not exist)
-            3 - target page of the redirect, or end of chain, or page title
-                where chain or loop detection was halted, or None if unknown
-        """
- for apiQ in self._next_redirect_group():
- gen = pywikibot.data.api.Request(action="query", redirects="",
- pageids=apiQ)
- data = gen.submit()
- if 'error' in data:
- raise RuntimeError("API query error: %s" % data)
- if data == [] or 'query' not in data:
- raise RuntimeError("No results given.")
- redirects = {}
- pages = {}
- redirects = dict((x['from'], x['to'])
- for x in data['query']['redirects'])
-
- for pagetitle in data['query']['pages'].values():
- if 'missing' in pagetitle and 'pageid' not in pagetitle:
- pages[pagetitle['title']] = False
- else:
- pages[pagetitle['title']] = True
- for redirect in redirects:
- target = redirects[redirect]
- result = 0
- final = None
- try:
- if pages[target]:
- final = target
- try:
- while result <= maxlen:
- result += 1
- final = redirects[final]
- # result = None
- except KeyError:
- pass
- except KeyError:
- result = None
- pass
- yield (redirect, result, target, final)
-
- def retrieve_broken_redirects(self):
- if self.use_api:
- count = 0
- for (pagetitle, type, target, final) \
- in self.get_redirects_via_api(maxlen=2):
- if type == 0:
- yield pagetitle
- if self.api_number:
- count += 1
- if count >= self.api_number:
- break
-# TODO: add XML dump support
-## elif self.xmlFilename == None:
-## # retrieve information from the live wiki's maintenance page
-## # broken redirect maintenance page's URL
-## path = self.site.broken_redirects_address(default_limit=False)
-## pywikibot.output(u'Retrieving special page...')
-## maintenance_txt = self.site.getUrl(path)
-##
-## # regular expression which finds redirects which point to a
-## # non-existing page inside the HTML
-## Rredir = re.compile('\<li\>\<a href=".+?" title="(.*?)"')
-##
-## redir_names = Rredir.findall(maintenance_txt)
-## pywikibot.output(u'Retrieved %d redirects from special page.\n'
-## % len(redir_names))
-## for redir_name in redir_names:
-## yield redir_name
-## else:
-## # retrieve information from XML dump
-## pywikibot.output(
-## u'Getting a list of all redirects and of all page titles...')
-## redirs, pageTitles = self.get_redirects_from_dump(
-## alsoGetPageTitles=True)
-## for (key, value) in redirs.iteritems():
-## if value not in pageTitles:
-## yield key
-
- def retrieve_double_redirects(self):
- if self.use_move_log:
- for redir_page in self.get_moved_pages_redirects():
- yield redir_page.title()
- return
- else:
- count = 0
- for (pagetitle, type, target, final) \
- in self.get_redirects_via_api(maxlen=2):
- if type != 0 and type != 1:
- yield pagetitle
- if self.api_number:
- count += 1
- if count >= self.api_number:
- break
-
-# TODO: API cannot yet deliver contents of "special" pages
-## elif self.xmlFilename == None:
-## # retrieve information from the live wiki's maintenance page
-## # double redirect maintenance page's URL
-### pywikibot.config.special_page_limit = 1000
-## path = self.site.double_redirects_address(default_limit = False)
-## pywikibot.output(u'Retrieving special page...')
-## maintenance_txt = self.site.getUrl(path)
-##
-## # regular expression which finds redirects which point to
-## # another redirect inside the HTML
-## Rredir = re.compile('\<li\>\<a href=".+?" title="(.*?)">')
-## redir_names = Rredir.findall(maintenance_txt)
-## pywikibot.output(u'Retrieved %i redirects from special page.\n'
-## % len(redir_names))
-## for redir_name in redir_names:
-## yield redir_name
-## else:
-## redict = self.get_redirects_from_dump()
-## num = 0
-## for (key, value) in redict.iteritems():
-## num += 1
-## # check if the value - that is, the redirect target - is a
-## # redirect as well
-## if num > self.offset and value in redict:
-## yield key
-## pywikibot.output(u'\nChecking redirect %i of %i...'
-## % (num + 1, len(redict)))
-
- def get_moved_pages_redirects(self):
- '''generate redirects to recently-moved pages'''
- # this will run forever, until user interrupts it
-
- if self.offset <= 0:
- self.offset = 1
- start = datetime.datetime.utcnow() \
- - datetime.timedelta(0, self.offset*3600)
- # self.offset hours ago
- offset_time = start.strftime("%Y%m%d%H%M%S")
-
- move_gen = self.site.logevents(logtype="move", start=offset_time)
- if self.api_number:
- move_gen.set_maximum_items(self.api_number)
- for logentry in move_gen:
- moved_page = logentry.title()
- try:
- if not moved_page.isRedirectPage():
- continue
- except pywikibot.BadTitle:
- continue
- except pywikibot.ServerError:
- continue
- # moved_page is now a redirect, so any redirects pointing
- # to it need to be changed
- try:
- for page in moved_page.getReferences(follow_redirects=True,
- redirectsOnly=True):
- yield page
- except pywikibot.NoPage:
- # original title must have been deleted after move
- continue
-
-
-class RedirectRobot:
- def __init__(self, action, generator, always=False, number=None):
- self.site = pywikibot.getSite()
- self.action = action
- self.generator = generator
- self.always = always
- self.number = number
- self.exiting = False
-
- def prompt(self, question):
- if not self.always:
- choice = pywikibot.inputChoice(question,
- ['Yes', 'No', 'All', 'Quit'],
- ['y', 'N', 'a', 'q'], 'N')
- if choice == 'n':
- return False
- elif choice == 'q':
- self.exiting = True
- return False
- elif choice == 'a':
- self.always = True
- return True
-
- def delete_broken_redirects(self):
- # get reason for deletion text
- reason = i18n.twtranslate(self.site, 'redirect-remove-broken')
- for redir_name in self.generator.retrieve_broken_redirects():
- self.delete_1_broken_redirect(redir_name, reason)
- if self.exiting:
- break
-
- def delete_1_broken_redirect(self, redir_name, reason):
- redir_page = pywikibot.Page(self.site, redir_name)
- # Show the title of the page we're working on.
- # Highlight the title in purple.
- pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
- % redir_page.title())
- try:
- targetPage = redir_page.getRedirectTarget()
- except pywikibot.IsNotRedirectPage:
- pywikibot.output(u'%s is not a redirect.' % redir_page.title())
- except pywikibot.NoPage:
- pywikibot.output(u'%s doesn\'t exist.' % redir_page.title())
- else:
- try:
- targetPage.get()
- except pywikibot.NoPage:
- if self.prompt(
- u'Redirect target %s does not exist. Do you want to delete %s?'
- % (targetPage.title(asLink=True),
- redir_page.title(asLink=True))):
- try:
- redir_page.delete(reason, prompt = False)
- except pywikibot.NoUsername:
- if i18n.twhas_key(
- targetPage.site.lang,
- 'redirect-broken-redirect-template') and \
- i18n.twhas_key(targetPage.site.lang,
- 'redirect-remove-broken'):
- pywikibot.output(
- u"No sysop in user-config.py, put page to speedy deletion.")
- content = redir_page.get(get_redirect=True)
- ### TODO: Add bot's signature if needed
- ### Not supported via TW yet
- content = i18n.twtranslate(
- targetPage.site.lang,
- 'redirect-broken-redirect-template'
- ) + "\n" + content
- redir_page.put(content, reason)
- except pywikibot.IsRedirectPage:
- pywikibot.output(
- u'Redirect target %s is also a redirect! Won\'t delete anything.'
- % targetPage.title(asLink=True))
- else:
-                # we successfully got the target page, meaning that
-                # it exists and is not a redirect: no reason to touch it.
- pywikibot.output(
- u'Redirect target %s does exist! Won\'t delete anything.'
- % targetPage.title(asLink=True))
- pywikibot.output(u'')
-
- def fix_double_redirects(self):
- for redir_name in self.generator.retrieve_double_redirects():
- self.fix_1_double_redirect(redir_name)
- if self.exiting:
- break
-
- def fix_1_double_redirect(self, redir_name):
- redir = pywikibot.Page(self.site, redir_name)
- # Show the title of the page we're working on.
- # Highlight the title in purple.
- pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
- % redir.title())
- newRedir = redir
- redirList = [] # bookkeeping to detect loops
- while True:
- redirList.append(u'%s:%s' % (newRedir.site.lang,
- newRedir.title(withSection=False)))
- try:
- targetPage = newRedir.getRedirectTarget()
- except pywikibot.IsNotRedirectPage:
- if len(redirList) == 1:
- pywikibot.output(u'Skipping: Page %s is not a redirect.'
- % redir.title(asLink=True))
- break #do nothing
- elif len(redirList) == 2:
- pywikibot.output(
- u'Skipping: Redirect target %s is not a redirect.'
- % newRedir.title(asLink=True))
- break # do nothing
- else:
- pass # target found
- except pywikibot.SectionError:
- pywikibot.output(
- u'Warning: Redirect target section %s doesn\'t exist.'
- % newRedir.title(asLink=True))
- except pywikibot.CircularRedirect, e:
- pywikibot.warning(u"Skipping circular redirect: [[%s]]"
- % str(e))
- break
- except pywikibot.BadTitle, e:
- # str(e) is in the format 'BadTitle: [[Foo]]'
- pywikibot.output(
- u'Warning: Redirect target %s is not a valid page title.'
- % str(e)[10:])
- break
- except pywikibot.NoPage:
- if len(redirList) == 1:
- pywikibot.output(u'Skipping: Page %s does not exist.'
- % redir.title(asLink=True))
- break
- else:
- if self.always:
- pywikibot.output(
- u"Skipping: Redirect target %s doesn't exist."
- % newRedir.title(asLink=True))
- break # skip if automatic
- else:
- pywikibot.output(
- u"Warning: Redirect target %s doesn't exist."
- % newRedir.title(asLink=True))
- except pywikibot.ServerError:
- pywikibot.output(u'Skipping: Server Error')
- break
- else:
- pywikibot.output(
- u' Links to: %s.'
- % targetPage.title(asLink=True))
- if targetPage.site.sitename() == 'wikipedia:en':
- mw_msg = targetPage.site.mediawiki_message(
- 'wikieditor-toolbar-tool-redirect-example')
- if targetPage.title() == mw_msg:
- pywikibot.output(
- u"Skipping toolbar example: Redirect source is potentially vandalized.")
- break
- if targetPage.site != self.site:
- pywikibot.output(
- u'Warning: redirect target (%s) is on a different site.'
- % targetPage.title(asLink=True))
- if self.always:
- break # skip if automatic
- # watch out for redirect loops
- if redirList.count(u'%s:%s'
- % (targetPage.site.lang,
- targetPage.title(withSection=False))
- ) > 0:
- pywikibot.output(
- u'Warning: Redirect target %s forms a redirect loop.'
- % targetPage.title(asLink=True))
- break ### doesn't work. edits twice!
-## try:
-## content = targetPage.get(get_redirect=True)
-## except pywikibot.SectionError:
-## content = pywikibot.Page(
-## targetPage.site,
-## targetPage.title(withSection=False)
-## ).get(get_redirect=True)
-## if i18n.twhas_key(
-## targetPage.site.lang,
-## 'redirect-broken-redirect-template') and \
-## i18n.twhas_key(targetPage.site.lang,
-## 'redirect-remove-loop'):
-## pywikibot.output(u"Tagging redirect for deletion")
-## # Delete the two redirects
-## content = i18n.twtranslate(
-## targetPage.site.lang,
-## 'redirect-remove-loop',
-## ) + "\n" + content
-## summ = i18n.twtranslate(
-## targetPage.site.lang,
-## 'redirect-broken-redirect-template')
-## targetPage.put(content, summ)
-## redir.put(content, summ)
-## break # TODO Better implement loop redirect
- else: # redirect target found
- if targetPage.isStaticRedirect():
- pywikibot.output(
- u" Redirect target is STATICREDIRECT.")
- pass
- else:
- newRedir = targetPage
- continue
- try:
- oldText = redir.get(get_redirect=True)
- except pywikibot.BadTitle:
- pywikibot.output(u"Bad Title Error")
- break
- text = self.site.redirectRegex().sub(
- '#%s %s' % (self.site.redirect(True),
- targetPage.title(asLink=True)), oldText)
- if text == oldText:
- pywikibot.output(u"Note: Nothing left to do on %s"
- % redir.title(asLink=True))
- break
- summary = i18n.twtranslate(self.site, 'redirect-fix-double',
- {'to': targetPage.title(asLink=True)}
- )
- pywikibot.showDiff(oldText, text)
- if self.prompt(u'Do you want to accept the changes?'):
- try:
- redir.put(text, summary)
- except pywikibot.LockedPage:
- pywikibot.output(u'%s is locked.' % redir.title())
- except pywikibot.SpamfilterError, error:
- pywikibot.output(
- u"Saving page [[%s]] prevented by spam filter: %s"
- % (redir.title(), error.url))
- except pywikibot.PageNotSaved, error:
- pywikibot.output(u"Saving page [[%s]] failed: %s"
- % (redir.title(), error))
- except pywikibot.NoUsername:
- pywikibot.output(
- u"Page [[%s]] not saved; sysop privileges required."
- % redir.title())
- except pywikibot.Error, error:
- pywikibot.output(
- u"Unexpected error occurred trying to save [[%s]]: %s"
- % (redir.title(), error))
- break
-
- def fix_double_or_delete_broken_redirects(self):
- # TODO: part of this should be moved to generator, the rest merged into self.run()
- # get reason for deletion text
- delete_reason = i18n.twtranslate(self.site, 'redirect-remove-broken')
- count = 0
- for (redir_name, code, target, final)\
- in self.generator.get_redirects_via_api(maxlen=2):
- if code == 1:
- continue
- elif code == 0:
- self.delete_1_broken_redirect(redir_name, delete_reason)
- count += 1
- else:
- self.fix_1_double_redirect(redir_name)
- count += 1
- if self.exiting or (self.number and count >= self.number):
- break
-
- def run(self):
- # TODO: make all generators return a redirect type indicator,
- # thus make them usable with 'both'
- if self.action == 'double':
- self.fix_double_redirects()
- elif self.action == 'broken':
- self.delete_broken_redirects()
- elif self.action == 'both':
- self.fix_double_or_delete_broken_redirects()
-
-def main(*args):
- # read command line parameters
- # what the bot should do (either resolve double redirs, or delete broken
- # redirs)
- action = None
-    # where the bot should get its information from (either None to load the
- # maintenance special page from the live wiki, or the filename of a
- # local XML dump file)
- xmlFilename = None
-    # Which namespace should be processed when using an XML dump
- # default to -1 which means all namespaces will be processed
- namespaces = []
- # at which redirect shall we start searching double redirects again
- # (only with dump); default to -1 which means all redirects are checked
- offset = -1
- moved_pages = False
- api = True # rewrite always uses api, probably should get rid of this
- start = ''
- until = ''
- number = None
- always = False
- for arg in pywikibot.handleArgs(*args):
- if arg == 'double' or arg == 'do':
- action = 'double'
- elif arg == 'broken' or arg == 'br':
- action = 'broken'
- elif arg == 'both':
- action = 'both'
- elif arg.startswith('-xml'):
- if len(arg) == 4:
- xmlFilename = pywikibot.input(
- u'Please enter the XML dump\'s filename: ')
- else:
- xmlFilename = arg[5:]
- elif arg.startswith('-moves'):
- moved_pages = True
- elif arg.startswith('-namespace:'):
- ns = arg[11:]
- if ns == '':
- ## "-namespace:" does NOT yield -namespace:0 further down the road!
- ns = pywikibot.input(
- u'Please enter a namespace by its number: ')
-# u'Please enter a namespace by its name or number: ')
-# TODO! at least for some generators.
- if ns == '':
- ns = '0'
- try:
- ns = int(ns)
- except ValueError:
-#-namespace:all Process all namespaces. Works only with the API read interface.
- pass
-            if ns not in namespaces:
- namespaces.append(ns)
- elif arg.startswith('-offset:'):
- offset = int(arg[8:])
- elif arg.startswith('-start:'):
- start = arg[7:]
- elif arg.startswith('-until:'):
- until = arg[7:]
- elif arg.startswith('-total:'):
-            number = int(arg[7:])  # '-total:' is 7 characters long
- elif arg == '-always':
- always = True
- else:
- pywikibot.output(u'Unknown argument: %s' % arg)
-
- if xmlFilename:
- pywikibot.error(u"Sorry, xmlreader is not yet implemented in rewrite")
- elif not action: # or (xmlFilename and moved_pages)
- # or (api and xmlFilename):
- pywikibot.showHelp('redirect')
- else:
- gen = RedirectGenerator(xmlFilename, namespaces, offset, moved_pages,
- api, start, until, number)
- bot = RedirectRobot(action, gen, always, number)
- bot.run()
-
-if __name__ == '__main__':
- try:
- main()
- finally:
- pywikibot.stopme()
+# -*- coding: utf-8 -*-
+"""
+Script to resolve double redirects, and to delete broken redirects. Requires
+access to MediaWiki's maintenance pages or to an XML dump file. Delete
+function requires adminship.
+
+Syntax:
+
+ python redirect.py action [-arguments ...]
+
+where action can be one of these:
+
+double Fix redirects which point to other redirects
+broken Delete redirects where targets don\'t exist. Requires adminship.
+both Both of the above. Permitted only with -api. Implies -api.
+
+and arguments can be:
+
+-moves Use the page move log to find double-redirect candidates. Only
+ works with action "double".
+
+-namespace:n Namespace to process. Can be given multiple times, for several
+ namespaces. If omitted, only the main (article) namespace is
+ treated.
+
+-offset:n With -moves, the number of hours ago to start scanning moved
+ pages. Otherwise, ignored.
+
+-start:title The starting page title in each namespace. Page need not exist.
+
+-until:title The possible last page title in each namespace. Page need not
+ exist.
+
+-total:n The maximum count of redirects to work upon. If omitted, there
+ is no limit.
+
+-always Don't prompt you for each replacement.
+
+"""
+
+# XML not yet implemented: deleted help text follows
+##-xml Retrieve information from a local XML dump
+## (http://download.wikimedia.org). Argument can also be given as
+## "-xml:filename.xml". Cannot be used with -api or -moves.
+## If neither of -xml -api -moves is given, info will be loaded
+## from a special page of the live wiki.
+
+#
+# (C) Daniel Herding, 2004.
+# (C) Purodha Blissenbach, 2009.
+# (C) xqt, 2009-2010
+# (C) Pywikipedia bot team, 2004-2010
+#
+# Distributed under the terms of the MIT license.
+#
+__version__='$Id: redirect.py 7789 2009-12-17 19:20:12Z xqt $'
+#
+import re, sys, datetime
+import pywikibot
+from pywikibot import config, i18n
+# import xmlreader
+
+
+class RedirectGenerator:
+ def __init__(self, xmlFilename=None, namespaces=[], offset=-1,
+ use_move_log=False, use_api=False, start=None, until=None,
+ number=None):
+ self.site = pywikibot.getSite()
+## self.xmlFilename = xmlFilename
+ self.namespaces = namespaces
+ if use_api and self.namespaces == []:
+ self.namespaces = [ 0 ]
+ self.offset = offset
+ self.use_move_log = use_move_log
+ self.use_api = use_api
+ self.api_start = start
+ self.api_until = until
+ self.api_number = number
+
+# note: rewrite branch does not yet support XML dumps, so this is commented out
+# until that support is added
+## def get_redirects_from_dump(self, alsoGetPageTitles=False):
+## '''
+## Load a local XML dump file, look at all pages which have the
+## redirect flag set, and find out where they're pointing at. Return
+## a dictionary where the redirect names are the keys and the redirect
+## targets are the values.
+## '''
+## xmlFilename = self.xmlFilename
+## redict = {}
+## # open xml dump and read page titles out of it
+## dump = xmlreader.XmlDump(xmlFilename)
+## redirR = self.site.redirectRegex()
+## readPagesCount = 0
+## if alsoGetPageTitles:
+## pageTitles = set()
+## for entry in dump.parse():
+## readPagesCount += 1
+## # always print status message after 10000 pages
+## if readPagesCount % 10000 == 0:
+## pywikibot.output(u'%i pages read...' % readPagesCount)
+## if len(self.namespaces) > 0:
+## if pywikibot.Page(self.site, entry.title).namespace() \
+## not in self.namespaces:
+## continue
+## if alsoGetPageTitles:
+## pageTitles.add(entry.title.replace(' ', '_'))
+##
+## m = redirR.match(entry.text)
+## if m:
+## target = m.group(1)
+## # There might be redirects to another wiki. Ignore these.
+## for code in self.site.family.langs.keys():
+## if target.startswith('%s:' % code) \
+## or target.startswith(':%s:' % code):
+## if code == self.site.language():
+## # link to our wiki, but with the lang prefix
+## target = target[(len(code)+1):]
+## if target.startswith(':'):
+## target = target[1:]
+## else:
+## pywikibot.output(
+## u'NOTE: Ignoring %s which is a redirect to %s:'
+## % (entry.title, code))
+## target = None
+## break
+## # if the redirect does not link to another wiki
+## if target:
+## source = entry.title.replace(' ', '_')
+## target = target.replace(' ', '_')
+## # remove leading and trailing whitespace
+## target = target.strip('_')
+## # capitalize the first letter
+## if not pywikibot.getSite().nocapitalize:
+## source = source[:1].upper() + source[1:]
+## target = target[:1].upper() + target[1:]
+## if '#' in target:
+## target = target[:target.index('#')].rstrip("_")
+## if '|' in target:
+## pywikibot.output(
+## u'HINT: %s is a redirect with a pipelink.'
+## % entry.title)
+## target = target[:target.index('|')].rstrip("_")
+## if target: # in case preceding steps left nothing
+## redict[source] = target
+## if alsoGetPageTitles:
+## return redict, pageTitles
+## else:
+## return redict
+##
+ def get_redirect_pages_via_api(self):
+ """Return generator that yields
+ Pages that are redirects.
+
+ """
+ for ns in self.namespaces:
+ done = False
+ gen = self.site.allpages(start=self.api_start,
+ namespace=ns,
+ filterredir=True)
+ if self.api_number:
+ gen.set_maximum_items(self.api_number)
+ for p in gen:
+ done = self.api_until \
+ and p.title(withNamespace=False) >= self.api_until
+ if done:
+ return
+ yield p
+
+ def _next_redirect_group(self):
+ """
+ Return a generator that retrieves pageids from the API 500 at a time
+ and yields them as a list
+ """
+ apiQ = []
+ for page in self.get_redirect_pages_via_api():
+ apiQ.append(str(page._pageid))
+ if len(apiQ) >= 500:
+ yield apiQ
+ apiQ = []
+ if apiQ:
+ yield apiQ
+
+    def get_redirects_via_api(self, maxlen=8):
+        """
+        Return a generator that yields tuples of data about redirect Pages:
+            0 - page title of a redirect page
+            1 - type of redirect:
+                0 - broken redirect, target page title missing
+                1 - normal redirect, target page exists and is not a
+                    redirect
+                2..maxlen - start of a redirect chain of that many redirects
+                            (currently, the API seems not to return sufficient
+                            data to make these return values possible, but
+                            that may change)
+                maxlen+1 - start of an even longer chain, or a loop
+                           (currently, the API seems not to return sufficient
+                           data to allow these return values, but that may
+                           change)
+                None - start of a redirect chain of unknown length, or loop
+            2 - target page title of the redirect, or chain (may not exist)
+            3 - target page of the redirect, or end of chain, or page title
+                where chain or loop detection was halted, or None if unknown
+        """
+ for apiQ in self._next_redirect_group():
+ gen = pywikibot.data.api.Request(action="query", redirects="",
+ pageids=apiQ)
+ data = gen.submit()
+ if 'error' in data:
+ raise RuntimeError("API query error: %s" % data)
+ if data == [] or 'query' not in data:
+ raise RuntimeError("No results given.")
+ redirects = {}
+ pages = {}
+ redirects = dict((x['from'], x['to'])
+ for x in data['query']['redirects'])
+
+ for pagetitle in data['query']['pages'].values():
+ if 'missing' in pagetitle and 'pageid' not in pagetitle:
+ pages[pagetitle['title']] = False
+ else:
+ pages[pagetitle['title']] = True
+ for redirect in redirects:
+ target = redirects[redirect]
+ result = 0
+ final = None
+ try:
+ if pages[target]:
+ final = target
+ try:
+ while result <= maxlen:
+ result += 1
+ final = redirects[final]
+ # result = None
+ except KeyError:
+ pass
+ except KeyError:
+ result = None
+ pass
+ yield (redirect, result, target, final)
+
+ def retrieve_broken_redirects(self):
+ if self.use_api:
+ count = 0
+ for (pagetitle, type, target, final) \
+ in self.get_redirects_via_api(maxlen=2):
+ if type == 0:
+ yield pagetitle
+ if self.api_number:
+ count += 1
+ if count >= self.api_number:
+ break
+# TODO: add XML dump support
+## elif self.xmlFilename == None:
+## # retrieve information from the live wiki's maintenance page
+## # broken redirect maintenance page's URL
+## path = self.site.broken_redirects_address(default_limit=False)
+## pywikibot.output(u'Retrieving special page...')
+## maintenance_txt = self.site.getUrl(path)
+##
+## # regular expression which finds redirects which point to a
+## # non-existing page inside the HTML
+## Rredir = re.compile('\<li\>\<a href=".+?" title="(.*?)"')
+##
+## redir_names = Rredir.findall(maintenance_txt)
+## pywikibot.output(u'Retrieved %d redirects from special page.\n'
+## % len(redir_names))
+## for redir_name in redir_names:
+## yield redir_name
+## else:
+## # retrieve information from XML dump
+## pywikibot.output(
+## u'Getting a list of all redirects and of all page titles...')
+## redirs, pageTitles = self.get_redirects_from_dump(
+## alsoGetPageTitles=True)
+## for (key, value) in redirs.iteritems():
+## if value not in pageTitles:
+## yield key
+
+ def retrieve_double_redirects(self):
+ if self.use_move_log:
+ for redir_page in self.get_moved_pages_redirects():
+ yield redir_page.title()
+ return
+ else:
+ count = 0
+ for (pagetitle, type, target, final) \
+ in self.get_redirects_via_api(maxlen=2):
+ if type != 0 and type != 1:
+ yield pagetitle
+ if self.api_number:
+ count += 1
+ if count >= self.api_number:
+ break
+
+# TODO: API cannot yet deliver contents of "special" pages
+## elif self.xmlFilename == None:
+## # retrieve information from the live wiki's maintenance page
+## # double redirect maintenance page's URL
+### pywikibot.config.special_page_limit = 1000
+## path = self.site.double_redirects_address(default_limit = False)
+## pywikibot.output(u'Retrieving special page...')
+## maintenance_txt = self.site.getUrl(path)
+##
+## # regular expression which finds redirects which point to
+## # another redirect inside the HTML
+## Rredir = re.compile('\<li\>\<a href=".+?" title="(.*?)">')
+## redir_names = Rredir.findall(maintenance_txt)
+## pywikibot.output(u'Retrieved %i redirects from special page.\n'
+## % len(redir_names))
+## for redir_name in redir_names:
+## yield redir_name
+## else:
+## redict = self.get_redirects_from_dump()
+## num = 0
+## for (key, value) in redict.iteritems():
+## num += 1
+## # check if the value - that is, the redirect target - is a
+## # redirect as well
+## if num > self.offset and value in redict:
+## yield key
+## pywikibot.output(u'\nChecking redirect %i of %i...'
+## % (num + 1, len(redict)))
+
+ def get_moved_pages_redirects(self):
+ '''generate redirects to recently-moved pages'''
+ # this will run forever, until user interrupts it
+
+ if self.offset <= 0:
+ self.offset = 1
+ start = datetime.datetime.utcnow() \
+ - datetime.timedelta(0, self.offset*3600)
+ # self.offset hours ago
+ offset_time = start.strftime("%Y%m%d%H%M%S")
+
+ move_gen = self.site.logevents(logtype="move", start=offset_time)
+ if self.api_number:
+ move_gen.set_maximum_items(self.api_number)
+ for logentry in move_gen:
+ moved_page = logentry.title()
+ try:
+ if not moved_page.isRedirectPage():
+ continue
+ except pywikibot.BadTitle:
+ continue
+ except pywikibot.ServerError:
+ continue
+ # moved_page is now a redirect, so any redirects pointing
+ # to it need to be changed
+ try:
+ for page in moved_page.getReferences(follow_redirects=True,
+ redirectsOnly=True):
+ yield page
+ except pywikibot.NoPage:
+ # original title must have been deleted after move
+ continue
+
+
+class RedirectRobot:
+ def __init__(self, action, generator, always=False, number=None):
+ self.site = pywikibot.getSite()
+ self.action = action
+ self.generator = generator
+ self.always = always
+ self.number = number
+ self.exiting = False
+
+ def prompt(self, question):
+ if not self.always:
+ choice = pywikibot.inputChoice(question,
+ ['Yes', 'No', 'All', 'Quit'],
+ ['y', 'N', 'a', 'q'], 'N')
+ if choice == 'n':
+ return False
+ elif choice == 'q':
+ self.exiting = True
+ return False
+ elif choice == 'a':
+ self.always = True
+ return True
+
+ def delete_broken_redirects(self):
+ # get reason for deletion text
+ reason = i18n.twtranslate(self.site, 'redirect-remove-broken')
+ for redir_name in self.generator.retrieve_broken_redirects():
+ self.delete_1_broken_redirect(redir_name, reason)
+ if self.exiting:
+ break
+
+ def delete_1_broken_redirect(self, redir_name, reason):
+ redir_page = pywikibot.Page(self.site, redir_name)
+ # Show the title of the page we're working on.
+ # Highlight the title in purple.
+ pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
+ % redir_page.title())
+ try:
+ targetPage = redir_page.getRedirectTarget()
+ except pywikibot.IsNotRedirectPage:
+ pywikibot.output(u'%s is not a redirect.' % redir_page.title())
+ except pywikibot.NoPage:
+ pywikibot.output(u'%s doesn\'t exist.' % redir_page.title())
+ else:
+ try:
+ targetPage.get()
+ except pywikibot.NoPage:
+ if self.prompt(
+ u'Redirect target %s does not exist. Do you want to delete %s?'
+ % (targetPage.title(asLink=True),
+ redir_page.title(asLink=True))):
+ try:
+ redir_page.delete(reason, prompt = False)
+ except pywikibot.NoUsername:
+ if i18n.twhas_key(
+ targetPage.site.lang,
+ 'redirect-broken-redirect-template') and \
+ i18n.twhas_key(targetPage.site.lang,
+ 'redirect-remove-broken'):
+ pywikibot.output(
+ u"No sysop in user-config.py, put page to speedy deletion.")
+ content = redir_page.get(get_redirect=True)
+ ### TODO: Add bot's signature if needed
+ ### Not supported via TW yet
+ content = i18n.twtranslate(
+ targetPage.site.lang,
+ 'redirect-broken-redirect-template'
+ ) + "\n" + content
+ redir_page.put(content, reason)
+ except pywikibot.IsRedirectPage:
+ pywikibot.output(
+ u'Redirect target %s is also a redirect! Won\'t delete anything.'
+ % targetPage.title(asLink=True))
+ else:
+                # we successfully got the target page, meaning that
+                # it exists and is not a redirect: no reason to touch it.
+ pywikibot.output(
+ u'Redirect target %s does exist! Won\'t delete anything.'
+ % targetPage.title(asLink=True))
+ pywikibot.output(u'')
+
+ def fix_double_redirects(self):
+ for redir_name in self.generator.retrieve_double_redirects():
+ self.fix_1_double_redirect(redir_name)
+ if self.exiting:
+ break
+
+ def fix_1_double_redirect(self, redir_name):
+ redir = pywikibot.Page(self.site, redir_name)
+ # Show the title of the page we're working on.
+ # Highlight the title in purple.
+ pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
+ % redir.title())
+ newRedir = redir
+ redirList = [] # bookkeeping to detect loops
+ while True:
+ redirList.append(u'%s:%s' % (newRedir.site.lang,
+ newRedir.title(withSection=False)))
+ try:
+ targetPage = newRedir.getRedirectTarget()
+ except pywikibot.IsNotRedirectPage:
+ if len(redirList) == 1:
+ pywikibot.output(u'Skipping: Page %s is not a redirect.'
+ % redir.title(asLink=True))
+ break #do nothing
+ elif len(redirList) == 2:
+ pywikibot.output(
+ u'Skipping: Redirect target %s is not a redirect.'
+ % newRedir.title(asLink=True))
+ break # do nothing
+ else:
+ pass # target found
+ except pywikibot.SectionError:
+ pywikibot.output(
+ u'Warning: Redirect target section %s doesn\'t exist.'
+ % newRedir.title(asLink=True))
+ except pywikibot.CircularRedirect, e:
+ pywikibot.warning(u"Skipping circular redirect: [[%s]]"
+ % str(e))
+ break
+ except pywikibot.BadTitle, e:
+ # str(e) is in the format 'BadTitle: [[Foo]]'
+ pywikibot.output(
+ u'Warning: Redirect target %s is not a valid page title.'
+ % str(e)[10:])
+ break
+ except pywikibot.NoPage:
+ if len(redirList) == 1:
+ pywikibot.output(u'Skipping: Page %s does not exist.'
+ % redir.title(asLink=True))
+ break
+ else:
+ if self.always:
+ pywikibot.output(
+ u"Skipping: Redirect target %s doesn't exist."
+ % newRedir.title(asLink=True))
+ break # skip if automatic
+ else:
+ pywikibot.output(
+ u"Warning: Redirect target %s doesn't exist."
+ % newRedir.title(asLink=True))
+ except pywikibot.ServerError:
+ pywikibot.output(u'Skipping: Server Error')
+ break
+ else:
+ pywikibot.output(
+ u' Links to: %s.'
+ % targetPage.title(asLink=True))
+ if targetPage.site.sitename() == 'wikipedia:en':
+ mw_msg = targetPage.site.mediawiki_message(
+ 'wikieditor-toolbar-tool-redirect-example')
+ if targetPage.title() == mw_msg:
+ pywikibot.output(
+ u"Skipping toolbar example: Redirect source is potentially vandalized.")
+ break
+ if targetPage.site != self.site:
+ pywikibot.output(
+ u'Warning: redirect target (%s) is on a different site.'
+ % targetPage.title(asLink=True))
+ if self.always:
+ break # skip if automatic
+ # watch out for redirect loops
+ if redirList.count(u'%s:%s'
+ % (targetPage.site.lang,
+ targetPage.title(withSection=False))
+ ) > 0:
+ pywikibot.output(
+ u'Warning: Redirect target %s forms a redirect loop.'
+ % targetPage.title(asLink=True))
+ break ### doesn't work. edits twice!
+## try:
+## content = targetPage.get(get_redirect=True)
+## except pywikibot.SectionError:
+## content = pywikibot.Page(
+## targetPage.site,
+## targetPage.title(withSection=False)
+## ).get(get_redirect=True)
+## if i18n.twhas_key(
+## targetPage.site.lang,
+## 'redirect-broken-redirect-template') and \
+## i18n.twhas_key(targetPage.site.lang,
+## 'redirect-remove-loop'):
+## pywikibot.output(u"Tagging redirect for deletion")
+## # Delete the two redirects
+## content = i18n.twtranslate(
+## targetPage.site.lang,
+## 'redirect-remove-loop',
+## ) + "\n" + content
+## summ = i18n.twtranslate(
+## targetPage.site.lang,
+## 'redirect-broken-redirect-template')
+## targetPage.put(content, summ)
+## redir.put(content, summ)
+## break # TODO Better implement loop redirect
+ else: # redirect target found
+ if targetPage.isStaticRedirect():
+ pywikibot.output(
+ u" Redirect target is STATICREDIRECT.")
+ pass
+ else:
+ newRedir = targetPage
+ continue
+ try:
+ oldText = redir.get(get_redirect=True)
+ except pywikibot.BadTitle:
+ pywikibot.output(u"Bad Title Error")
+ break
+ text = self.site.redirectRegex().sub(
+ '#%s %s' % (self.site.redirect(True),
+ targetPage.title(asLink=True)), oldText)
+ if text == oldText:
+ pywikibot.output(u"Note: Nothing left to do on %s"
+ % redir.title(asLink=True))
+ break
+ summary = i18n.twtranslate(self.site, 'redirect-fix-double',
+ {'to': targetPage.title(asLink=True)}
+ )
+ pywikibot.showDiff(oldText, text)
+ if self.prompt(u'Do you want to accept the changes?'):
+ try:
+ redir.put(text, summary)
+ except pywikibot.LockedPage:
+ pywikibot.output(u'%s is locked.' % redir.title())
+ except pywikibot.SpamfilterError, error:
+ pywikibot.output(
+ u"Saving page [[%s]] prevented by spam filter: %s"
+ % (redir.title(), error.url))
+ except pywikibot.PageNotSaved, error:
+ pywikibot.output(u"Saving page [[%s]] failed: %s"
+ % (redir.title(), error))
+ except pywikibot.NoUsername:
+ pywikibot.output(
+ u"Page [[%s]] not saved; sysop privileges required."
+ % redir.title())
+ except pywikibot.Error, error:
+ pywikibot.output(
+ u"Unexpected error occurred trying to save [[%s]]: %s"
+ % (redir.title(), error))
+ break
+
+ def fix_double_or_delete_broken_redirects(self):
+ # TODO: part of this should be moved to generator, the rest merged into self.run()
+ # get reason for deletion text
+ delete_reason = i18n.twtranslate(self.site, 'redirect-remove-broken')
+ count = 0
+ for (redir_name, code, target, final)\
+ in self.generator.get_redirects_via_api(maxlen=2):
+ if code == 1:
+ continue
+ elif code == 0:
+ self.delete_1_broken_redirect(redir_name, delete_reason)
+ count += 1
+ else:
+ self.fix_1_double_redirect(redir_name)
+ count += 1
+ if self.exiting or (self.number and count >= self.number):
+ break
+
+ def run(self):
+ # TODO: make all generators return a redirect type indicator,
+ # thus make them usable with 'both'
+ if self.action == 'double':
+ self.fix_double_redirects()
+ elif self.action == 'broken':
+ self.delete_broken_redirects()
+ elif self.action == 'both':
+ self.fix_double_or_delete_broken_redirects()
+
+def main(*args):
+ # read command line parameters
+ # what the bot should do (either resolve double redirs, or delete broken
+ # redirs)
+ action = None
+    # where the bot should get its information from (either None to load the
+ # maintenance special page from the live wiki, or the filename of a
+ # local XML dump file)
+ xmlFilename = None
+    # Which namespace should be processed when using an XML dump
+ # default to -1 which means all namespaces will be processed
+ namespaces = []
+ # at which redirect shall we start searching double redirects again
+ # (only with dump); default to -1 which means all redirects are checked
+ offset = -1
+ moved_pages = False
+ api = True # rewrite always uses api, probably should get rid of this
+ start = ''
+ until = ''
+ number = None
+ always = False
+ for arg in pywikibot.handleArgs(*args):
+ if arg == 'double' or arg == 'do':
+ action = 'double'
+ elif arg == 'broken' or arg == 'br':
+ action = 'broken'
+ elif arg == 'both':
+ action = 'both'
+ elif arg.startswith('-xml'):
+ if len(arg) == 4:
+ xmlFilename = pywikibot.input(
+ u'Please enter the XML dump\'s filename: ')
+ else:
+ xmlFilename = arg[5:]
+ elif arg.startswith('-moves'):
+ moved_pages = True
+ elif arg.startswith('-namespace:'):
+ ns = arg[11:]
+ if ns == '':
+ ## "-namespace:" does NOT yield -namespace:0 further down the road!
+ ns = pywikibot.input(
+ u'Please enter a namespace by its number: ')
+# u'Please enter a namespace by its name or number: ')
+# TODO! at least for some generators.
+ if ns == '':
+ ns = '0'
+ try:
+ ns = int(ns)
+ except ValueError:
+#-namespace:all Process all namespaces. Works only with the API read interface.
+ pass
+            if ns not in namespaces:
+ namespaces.append(ns)
+ elif arg.startswith('-offset:'):
+ offset = int(arg[8:])
+ elif arg.startswith('-start:'):
+ start = arg[7:]
+ elif arg.startswith('-until:'):
+ until = arg[7:]
+ elif arg.startswith('-total:'):
+            number = int(arg[7:])  # '-total:' is 7 characters long
+ elif arg == '-always':
+ always = True
+ else:
+ pywikibot.output(u'Unknown argument: %s' % arg)
+
+ if xmlFilename:
+ pywikibot.error(u"Sorry, xmlreader is not yet implemented in rewrite")
+ elif not action: # or (xmlFilename and moved_pages)
+ # or (api and xmlFilename):
+ pywikibot.showHelp('redirect')
+ else:
+ gen = RedirectGenerator(xmlFilename, namespaces, offset, moved_pages,
+ api, start, until, number)
+ bot = RedirectRobot(action, gen, always, number)
+ bot.run()
+
+if __name__ == '__main__':
+ try:
+ main()
+ finally:
+ pywikibot.stopme()
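
The _next_redirect_group() method in the listing above buffers page ids and
flushes them to the API 500 at a time, the MediaWiki maximum for clients with
the apihighlimits right. A minimal standalone sketch of that chunking pattern
(the batched() name is invented here; the 500 figure and the Request call are
taken from the script):

    def batched(items, size=500):
        # Accumulate items from any iterable and yield them in lists
        # of at most `size`; the final batch may be shorter.
        batch = []
        for item in items:
            batch.append(item)
            if len(batch) >= size:
                yield batch
                batch = []
        if batch:
            yield batch

    # One query per id group, as in get_redirects_via_api():
    # for group in batched(str(p._pageid) for p in redirect_pages):
    #     req = pywikibot.data.api.Request(action="query",
    #                                      redirects="", pageids=group)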
Property changes on: branches/rewrite/scripts/redirect.py
___________________________________________________________________
Added: svn:eol-style
+ native
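
For readers tracing get_redirects_via_api() in the diff above: the method
turns the API's 'redirects' answer into a from/to mapping, then follows each
entry hop by hop until the title drops out of the mapping, and the number of
hops becomes the classification code documented in its docstring (0 broken,
1 normal, 2 and up the start of a chain). A rough self-contained sketch of
that walk over plain dicts (classify() and the sample data are invented for
illustration):

    def classify(redirects, pages, maxlen=8):
        # redirects maps source title -> target title;
        # pages maps title -> True (exists) or False (missing).
        for source, target in redirects.items():
            if target not in pages:
                yield (source, None, target, None)   # nothing known
                continue
            code, final = 0, None                    # 0 = broken redirect
            if pages[target]:
                code, final = 1, target              # at least a normal redirect
                while final in redirects and code <= maxlen:
                    code += 1                        # one more hop in the chain
                    final = redirects[final]
            yield (source, code, target, final)

    # classify({'A': 'B', 'B': 'C'}, {'B': True, 'C': True}) yields
    # ('A', 2, 'B', 'C')  - a double redirect -  and ('B', 1, 'C', 'C').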
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9056
Revision: 9056
Author: xqt
Date: 2011-03-13 12:46:39 +0000 (Sun, 13 Mar 2011)
Log Message:
-----------
eol-style
Modified Paths:
--------------
branches/rewrite/COPYING
branches/rewrite/INSTALL
branches/rewrite/distribute_setup.py
Property Changed:
----------------
branches/rewrite/COPYING
branches/rewrite/INSTALL
branches/rewrite/README
branches/rewrite/distribute_setup.py
Modified: branches/rewrite/COPYING
===================================================================
--- branches/rewrite/COPYING 2011-03-13 12:39:28 UTC (rev 9055)
+++ branches/rewrite/COPYING 2011-03-13 12:46:39 UTC (rev 9056)
@@ -1,23 +1,23 @@
-Copyright (c) 2004-2010 Pywikipedia bot team
-
-Permission is hereby granted, free of charge, to any person
-obtaining a copy of this software and associated documentation
-files (the "Software"), to deal in the Software without
-restriction, including without limitation the rights to use,
-copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the
-Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
-
+Copyright (c) 2004-2010 Pywikipedia bot team
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
Property changes on: branches/rewrite/COPYING
___________________________________________________________________
Added: svn:eol-style
+ native
Modified: branches/rewrite/INSTALL
===================================================================
--- branches/rewrite/INSTALL 2011-03-13 12:39:28 UTC (rev 9055)
+++ branches/rewrite/INSTALL 2011-03-13 12:46:39 UTC (rev 9056)
@@ -1,21 +1,21 @@
-To install the Pywikipediabot framework:
-
-1) Extract/unzip this package to a directory on your computer.
-2) Open a command prompt in that directory, and run the command:
-
- python setup.py install
-
-This will install the package to the "site-packages" directory of your Python
-installation. Linux/Unix users may need to add "sudo" at the beginning of the
-command line to get access to the site-packages directory; or, they can
-install the package to a different location by running:
-
- python setup.py install --home=/path/to/location
-
-where "/path/to/location" is the directory that you want the Pywikipediabot
-files installed to.
-
-Note: You will need to have an active Internet connection when you run the
-setup script, in case the installer needs to download other modules required
-by the framework.
-
+To install the Pywikipediabot framework:
+
+1) Extract/unzip this package to a directory on your computer.
+2) Open a command prompt in that directory, and run the command:
+
+ python setup.py install
+
+This will install the package to the "site-packages" directory of your Python
+installation. Linux/Unix users may need to add "sudo" at the beginning of the
+command line to get access to the site-packages directory; or, they can
+install the package to a different location by running:
+
+ python setup.py install --home=/path/to/location
+
+where "/path/to/location" is the directory that you want the Pywikipediabot
+files installed to.
+
+Note: You will need to have an active Internet connection when you run the
+setup script, in case the installer needs to download other modules required
+by the framework.
+
Property changes on: branches/rewrite/INSTALL
___________________________________________________________________
Added: svn:eol-style
+ native
Property changes on: branches/rewrite/README
___________________________________________________________________
Added: svn:eol-style
+ native
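
Both revisions in this digest change only the svn:eol-style property (hence
the one-word log message), which is presumably why each file appears in the
diff as fully removed and re-added even though its text is unchanged: every
line ending gets renormalized when the property is applied. With a stock
Subversion client the same property can be set and inspected like this
(paths taken from the commit):

    svn propset svn:eol-style native branches/rewrite/COPYING
    svn proplist -v branches/rewrite/COPYING

With svn:eol-style native, the repository stores the file with LF line
endings and each working copy checks it out using the local platform's
convention.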
Modified: branches/rewrite/distribute_setup.py
===================================================================
--- branches/rewrite/distribute_setup.py 2011-03-13 12:39:28 UTC (rev 9055)
+++ branches/rewrite/distribute_setup.py 2011-03-13 12:46:39 UTC (rev 9056)
@@ -1,477 +1,477 @@
-#!python
-"""Bootstrap distribute installation
-
-If you want to use setuptools in your package's setup.py, just include this
-file in the same directory with it, and add this to the top of your setup.py::
-
- from distribute_setup import use_setuptools
- use_setuptools()
-
-If you want to require a specific version of setuptools, set a download
-mirror, or use an alternate download directory, you can do so by supplying
-the appropriate options to ``use_setuptools()``.
-
-This file can also be run as a script to install or upgrade setuptools.
-"""
-import os
-import sys
-import time
-import fnmatch
-import tempfile
-import tarfile
-from distutils import log
-
-try:
- from site import USER_SITE
-except ImportError:
- USER_SITE = None
-
-try:
- import subprocess
-
- def _python_cmd(*args):
- args = (sys.executable,) + args
- return subprocess.call(args) == 0
-
-except ImportError:
- # will be used for python 2.3
- def _python_cmd(*args):
- args = (sys.executable,) + args
- # quoting arguments if windows
- if sys.platform == 'win32':
- def quote(arg):
- if ' ' in arg:
- return '"%s"' % arg
- return arg
- args = [quote(arg) for arg in args]
- return os.spawnl(os.P_WAIT, sys.executable, *args) == 0
-
-DEFAULT_VERSION = "0.6.10"
-DEFAULT_URL = "http://pypi.python.org/packages/source/d/distribute/"
-SETUPTOOLS_FAKED_VERSION = "0.6c11"
-
-SETUPTOOLS_PKG_INFO = """\
-Metadata-Version: 1.0
-Name: setuptools
-Version: %s
-Summary: xxxx
-Home-page: xxx
-Author: xxx
-Author-email: xxx
-License: xxx
-Description: xxx
-""" % SETUPTOOLS_FAKED_VERSION
-
-
-def _install(tarball):
- # extracting the tarball
- tmpdir = tempfile.mkdtemp()
- log.warn('Extracting in %s', tmpdir)
- old_wd = os.getcwd()
- try:
- os.chdir(tmpdir)
- tar = tarfile.open(tarball)
- _extractall(tar)
- tar.close()
-
- # going in the directory
- subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0])
- os.chdir(subdir)
- log.warn('Now working in %s', subdir)
-
- # installing
- log.warn('Installing Distribute')
- if not _python_cmd('setup.py', 'install'):
- log.warn('Something went wrong during the installation.')
- log.warn('See the error message above.')
- finally:
- os.chdir(old_wd)
-
-
-def _build_egg(egg, tarball, to_dir):
- # extracting the tarball
- tmpdir = tempfile.mkdtemp()
- log.warn('Extracting in %s', tmpdir)
- old_wd = os.getcwd()
- try:
- os.chdir(tmpdir)
- tar = tarfile.open(tarball)
- _extractall(tar)
- tar.close()
-
- # going in the directory
- subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0])
- os.chdir(subdir)
- log.warn('Now working in %s', subdir)
-
- # building an egg
- log.warn('Building a Distribute egg in %s', to_dir)
- _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir)
-
- finally:
- os.chdir(old_wd)
- # returning the result
- log.warn(egg)
- if not os.path.exists(egg):
- raise IOError('Could not build the egg.')
-
-
-def _do_download(version, download_base, to_dir, download_delay):
- egg = os.path.join(to_dir, 'distribute-%s-py%d.%d.egg'
- % (version, sys.version_info[0], sys.version_info[1]))
- if not os.path.exists(egg):
- tarball = download_setuptools(version, download_base,
- to_dir, download_delay)
- _build_egg(egg, tarball, to_dir)
- sys.path.insert(0, egg)
- import setuptools
- setuptools.bootstrap_install_from = egg
-
-
-def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,
- to_dir=os.curdir, download_delay=15, no_fake=True):
- # making sure we use the absolute path
- to_dir = os.path.abspath(to_dir)
- was_imported = 'pkg_resources' in sys.modules or \
- 'setuptools' in sys.modules
- try:
- try:
- import pkg_resources
- if not hasattr(pkg_resources, '_distribute'):
- if not no_fake:
- _fake_setuptools()
- raise ImportError
- except ImportError:
- return _do_download(version, download_base, to_dir, download_delay)
- try:
- pkg_resources.require("distribute>="+version)
- return
- except pkg_resources.VersionConflict:
- e = sys.exc_info()[1]
- if was_imported:
- sys.stderr.write(
- "The required version of distribute (>=%s) is not available,\n"
- "and can't be installed while this script is running. Please\n"
- "install a more recent version first, using\n"
- "'easy_install -U distribute'."
- "\n\n(Currently using %r)\n" % (version, e.args[0]))
- sys.exit(2)
- else:
- del pkg_resources, sys.modules['pkg_resources'] # reload ok
- return _do_download(version, download_base, to_dir,
- download_delay)
- except pkg_resources.DistributionNotFound:
- return _do_download(version, download_base, to_dir,
- download_delay)
- finally:
- if not no_fake:
- _create_fake_setuptools_pkg_info(to_dir)
-
-def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,
- to_dir=os.curdir, delay=15):
- """Download distribute from a specified location and return its filename
-
- `version` should be a valid distribute version number that is available
- as an egg for download under the `download_base` URL (which should end
- with a '/'). `to_dir` is the directory where the egg will be downloaded.
- `delay` is the number of seconds to pause before an actual download
- attempt.
- """
- # making sure we use the absolute path
- to_dir = os.path.abspath(to_dir)
- try:
- from urllib.request import urlopen
- except ImportError:
- from urllib2 import urlopen
- tgz_name = "distribute-%s.tar.gz" % version
- url = download_base + tgz_name
- saveto = os.path.join(to_dir, tgz_name)
- src = dst = None
- if not os.path.exists(saveto): # Avoid repeated downloads
- try:
- log.warn("Downloading %s", url)
- src = urlopen(url)
- # Read/write all in one block, so we don't create a corrupt file
- # if the download is interrupted.
- data = src.read()
- dst = open(saveto, "wb")
- dst.write(data)
- finally:
- if src:
- src.close()
- if dst:
- dst.close()
- return os.path.realpath(saveto)
-
-
-def _patch_file(path, content):
- """Will backup the file then patch it"""
- existing_content = open(path).read()
- if existing_content == content:
- # already patched
- log.warn('Already patched.')
- return False
- log.warn('Patching...')
- _rename_path(path)
- f = open(path, 'w')
- try:
- f.write(content)
- finally:
- f.close()
- return True
-
-
-def _same_content(path, content):
- return open(path).read() == content
-
-def _no_sandbox(function):
- def __no_sandbox(*args, **kw):
- try:
- from setuptools.sandbox import DirectorySandbox
- def violation(*args):
- pass
- DirectorySandbox._old = DirectorySandbox._violation
- DirectorySandbox._violation = violation
- patched = True
- except ImportError:
- patched = False
-
- try:
- return function(*args, **kw)
- finally:
- if patched:
- DirectorySandbox._violation = DirectorySandbox._old
- del DirectorySandbox._old
-
- return __no_sandbox
-
-@_no_sandbox
-def _rename_path(path):
- new_name = path + '.OLD.%s' % time.time()
- log.warn('Renaming %s into %s', path, new_name)
- os.rename(path, new_name)
- return new_name
-
-def _remove_flat_installation(placeholder):
- if not os.path.isdir(placeholder):
- log.warn('Unknown installation at %s', placeholder)
- return False
- found = False
- for file in os.listdir(placeholder):
- if fnmatch.fnmatch(file, 'setuptools*.egg-info'):
- found = True
- break
- if not found:
- log.warn('Could not locate setuptools*.egg-info')
- return
-
- log.warn('Removing elements out of the way...')
- pkg_info = os.path.join(placeholder, file)
- if os.path.isdir(pkg_info):
- patched = _patch_egg_dir(pkg_info)
- else:
- patched = _patch_file(pkg_info, SETUPTOOLS_PKG_INFO)
-
- if not patched:
- log.warn('%s already patched.', pkg_info)
- return False
- # now let's move the files out of the way
- for element in ('setuptools', 'pkg_resources.py', 'site.py'):
- element = os.path.join(placeholder, element)
- if os.path.exists(element):
- _rename_path(element)
- else:
- log.warn('Could not find the %s element of the '
- 'Setuptools distribution', element)
- return True
-
-
-def _after_install(dist):
- log.warn('After install bootstrap.')
- placeholder = dist.get_command_obj('install').install_purelib
- _create_fake_setuptools_pkg_info(placeholder)
-
-@_no_sandbox
-def _create_fake_setuptools_pkg_info(placeholder):
- if not placeholder or not os.path.exists(placeholder):
- log.warn('Could not find the install location')
- return
- pyver = '%s.%s' % (sys.version_info[0], sys.version_info[1])
- setuptools_file = 'setuptools-%s-py%s.egg-info' % \
- (SETUPTOOLS_FAKED_VERSION, pyver)
- pkg_info = os.path.join(placeholder, setuptools_file)
- if os.path.exists(pkg_info):
- log.warn('%s already exists', pkg_info)
- return
-
- log.warn('Creating %s', pkg_info)
- f = open(pkg_info, 'w')
- try:
- f.write(SETUPTOOLS_PKG_INFO)
- finally:
- f.close()
-
- pth_file = os.path.join(placeholder, 'setuptools.pth')
- log.warn('Creating %s', pth_file)
- f = open(pth_file, 'w')
- try:
- f.write(os.path.join(os.curdir, setuptools_file))
- finally:
- f.close()
-
-def _patch_egg_dir(path):
- # let's check if it's already patched
- pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO')
- if os.path.exists(pkg_info):
- if _same_content(pkg_info, SETUPTOOLS_PKG_INFO):
- log.warn('%s already patched.', pkg_info)
- return False
- _rename_path(path)
- os.mkdir(path)
- os.mkdir(os.path.join(path, 'EGG-INFO'))
- pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO')
- f = open(pkg_info, 'w')
- try:
- f.write(SETUPTOOLS_PKG_INFO)
- finally:
- f.close()
- return True
-
-
-def _before_install():
- log.warn('Before install bootstrap.')
- _fake_setuptools()
-
-
-def _under_prefix(location):
- if 'install' not in sys.argv:
- return True
- args = sys.argv[sys.argv.index('install')+1:]
- for index, arg in enumerate(args):
- for option in ('--root', '--prefix'):
- if arg.startswith('%s=' % option):
- top_dir = arg.split('%s=' % option)[-1]
- return location.startswith(top_dir)
- elif arg == option:
- if len(args) > index:
- top_dir = args[index+1]
- return location.startswith(top_dir)
- elif option == '--user' and USER_SITE is not None:
- return location.startswith(USER_SITE)
- return True
-
-
-def _fake_setuptools():
- log.warn('Scanning installed packages')
- try:
- import pkg_resources
- except ImportError:
- # we're cool
- log.warn('Setuptools or Distribute does not seem to be installed.')
- return
- ws = pkg_resources.working_set
- try:
- setuptools_dist = ws.find(pkg_resources.Requirement.parse('setuptools',
- replacement=False))
- except TypeError:
- # old distribute API
- setuptools_dist = ws.find(pkg_resources.Requirement.parse('setuptools'))
-
- if setuptools_dist is None:
- log.warn('No setuptools distribution found')
- return
- # detecting if it was already faked
- setuptools_location = setuptools_dist.location
- log.warn('Setuptools installation detected at %s', setuptools_location)
-
- # if --root or --prefix was provided, and if
- # setuptools is not located in them, we don't patch it
- if not _under_prefix(setuptools_location):
- log.warn('Not patching, --root or --prefix is installing Distribute'
- ' in another location')
- return
-
- # let's see if it's an egg
- if not setuptools_location.endswith('.egg'):
- log.warn('Non-egg installation')
- res = _remove_flat_installation(setuptools_location)
- if not res:
- return
- else:
- log.warn('Egg installation')
- pkg_info = os.path.join(setuptools_location, 'EGG-INFO', 'PKG-INFO')
- if (os.path.exists(pkg_info) and
- _same_content(pkg_info, SETUPTOOLS_PKG_INFO)):
- log.warn('Already patched.')
- return
- log.warn('Patching...')
- # let's create a fake egg replacing the setuptools one
- res = _patch_egg_dir(setuptools_location)
- if not res:
- return
- log.warn('Patching done.')
- _relaunch()
-
-
-def _relaunch():
- log.warn('Relaunching...')
- # we have to relaunch the process
- args = [sys.executable] + sys.argv
- sys.exit(subprocess.call(args))
-
-
-def _extractall(self, path=".", members=None):
- """Extract all members from the archive to the current working
- directory and set owner, modification time and permissions on
- directories afterwards. `path' specifies a different directory
- to extract to. `members' is optional and must be a subset of the
- list returned by getmembers().
- """
- import copy
- import operator
- from tarfile import ExtractError
- directories = []
-
- if members is None:
- members = self
-
- for tarinfo in members:
- if tarinfo.isdir():
- # Extract directories with a safe mode.
- directories.append(tarinfo)
- tarinfo = copy.copy(tarinfo)
- tarinfo.mode = 448 # decimal for oct 0700
- self.extract(tarinfo, path)
-
- # Reverse sort directories.
- if sys.version_info < (2, 4):
- def sorter(dir1, dir2):
- return cmp(dir1.name, dir2.name)
- directories.sort(sorter)
- directories.reverse()
- else:
- directories.sort(key=operator.attrgetter('name'), reverse=True)
-
- # Set correct owner, mtime and filemode on directories.
- for tarinfo in directories:
- dirpath = os.path.join(path, tarinfo.name)
- try:
- self.chown(tarinfo, dirpath)
- self.utime(tarinfo, dirpath)
- self.chmod(tarinfo, dirpath)
- except ExtractError:
- e = sys.exc_info()[1]
- if self.errorlevel > 1:
- raise
- else:
- self._dbg(1, "tarfile: %s" % e)
-
-
-def main(argv, version=DEFAULT_VERSION):
- """Install or upgrade setuptools and EasyInstall"""
- tarball = download_setuptools()
- _install(tarball)
-
-
-if __name__ == '__main__':
- main(sys.argv[1:])
+#!python
+"""Bootstrap distribute installation
+
+If you want to use setuptools in your package's setup.py, just include this
+file in the same directory with it, and add this to the top of your setup.py::
+
+ from distribute_setup import use_setuptools
+ use_setuptools()
+
+If you want to require a specific version of setuptools, set a download
+mirror, or use an alternate download directory, you can do so by supplying
+the appropriate options to ``use_setuptools()``.
+
+This file can also be run as a script to install or upgrade setuptools.
+"""
+import os
+import sys
+import time
+import fnmatch
+import tempfile
+import tarfile
+from distutils import log
+
+try:
+ from site import USER_SITE
+except ImportError:
+ USER_SITE = None
+
+try:
+ import subprocess
+
+ def _python_cmd(*args):
+ args = (sys.executable,) + args
+ return subprocess.call(args) == 0
+
+except ImportError:
+ # will be used for python 2.3
+ def _python_cmd(*args):
+ args = (sys.executable,) + args
+ # quoting arguments if windows
+ if sys.platform == 'win32':
+ def quote(arg):
+ if ' ' in arg:
+ return '"%s"' % arg
+ return arg
+ args = [quote(arg) for arg in args]
+ return os.spawnl(os.P_WAIT, sys.executable, *args) == 0
+
+DEFAULT_VERSION = "0.6.10"
+DEFAULT_URL = "http://pypi.python.org/packages/source/d/distribute/"
+SETUPTOOLS_FAKED_VERSION = "0.6c11"
+
+SETUPTOOLS_PKG_INFO = """\
+Metadata-Version: 1.0
+Name: setuptools
+Version: %s
+Summary: xxxx
+Home-page: xxx
+Author: xxx
+Author-email: xxx
+License: xxx
+Description: xxx
+""" % SETUPTOOLS_FAKED_VERSION
+
+
+def _install(tarball):
+ # extracting the tarball
+ tmpdir = tempfile.mkdtemp()
+ log.warn('Extracting in %s', tmpdir)
+ old_wd = os.getcwd()
+ try:
+ os.chdir(tmpdir)
+ tar = tarfile.open(tarball)
+ _extractall(tar)
+ tar.close()
+
+ # going in the directory
+ subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0])
+ os.chdir(subdir)
+ log.warn('Now working in %s', subdir)
+
+ # installing
+ log.warn('Installing Distribute')
+ if not _python_cmd('setup.py', 'install'):
+ log.warn('Something went wrong during the installation.')
+ log.warn('See the error message above.')
+ finally:
+ os.chdir(old_wd)
+
+
+def _build_egg(egg, tarball, to_dir):
+ # extracting the tarball
+ tmpdir = tempfile.mkdtemp()
+ log.warn('Extracting in %s', tmpdir)
+ old_wd = os.getcwd()
+ try:
+ os.chdir(tmpdir)
+ tar = tarfile.open(tarball)
+ _extractall(tar)
+ tar.close()
+
+ # going in the directory
+ subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0])
+ os.chdir(subdir)
+ log.warn('Now working in %s', subdir)
+
+ # building an egg
+ log.warn('Building a Distribute egg in %s', to_dir)
+ _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir)
+
+ finally:
+ os.chdir(old_wd)
+ # checking the result
+ log.warn(egg)
+ if not os.path.exists(egg):
+ raise IOError('Could not build the egg.')
+
+
+def _do_download(version, download_base, to_dir, download_delay):
+ egg = os.path.join(to_dir, 'distribute-%s-py%d.%d.egg'
+ % (version, sys.version_info[0], sys.version_info[1]))
+ if not os.path.exists(egg):
+ tarball = download_setuptools(version, download_base,
+ to_dir, download_delay)
+ _build_egg(egg, tarball, to_dir)
+ sys.path.insert(0, egg)
+ import setuptools
+ setuptools.bootstrap_install_from = egg
+
+
+def use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,
+ to_dir=os.curdir, download_delay=15, no_fake=True):
+ # making sure we use the absolute path
+ to_dir = os.path.abspath(to_dir)
+ was_imported = 'pkg_resources' in sys.modules or \
+ 'setuptools' in sys.modules
+ try:
+ try:
+ import pkg_resources
+ if not hasattr(pkg_resources, '_distribute'):
+ if not no_fake:
+ _fake_setuptools()
+ raise ImportError
+ except ImportError:
+ return _do_download(version, download_base, to_dir, download_delay)
+ try:
+ pkg_resources.require("distribute>="+version)
+ return
+ except pkg_resources.VersionConflict:
+ e = sys.exc_info()[1]
+ if was_imported:
+ sys.stderr.write(
+ "The required version of distribute (>=%s) is not available,\n"
+ "and can't be installed while this script is running. Please\n"
+ "install a more recent version first, using\n"
+ "'easy_install -U distribute'."
+ "\n\n(Currently using %r)\n" % (version, e.args[0]))
+ sys.exit(2)
+ else:
+ del pkg_resources, sys.modules['pkg_resources'] # reload ok
+ return _do_download(version, download_base, to_dir,
+ download_delay)
+ except pkg_resources.DistributionNotFound:
+ return _do_download(version, download_base, to_dir,
+ download_delay)
+ finally:
+ if not no_fake:
+ _create_fake_setuptools_pkg_info(to_dir)
+
+def download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,
+ to_dir=os.curdir, delay=15):
+ """Download distribute from a specified location and return its filename
+
+ `version` should be a valid distribute version number that is available
+ as a source tarball for download under the `download_base` URL (which
+ should end with a '/'). `to_dir` is the directory where the tarball will
+ be downloaded. `delay` is the number of seconds to pause before an actual
+ download attempt.
+ """
+ # making sure we use the absolute path
+ to_dir = os.path.abspath(to_dir)
+ try:
+ from urllib.request import urlopen
+ except ImportError:
+ from urllib2 import urlopen
+ tgz_name = "distribute-%s.tar.gz" % version
+ url = download_base + tgz_name
+ saveto = os.path.join(to_dir, tgz_name)
+ src = dst = None
+ if not os.path.exists(saveto): # Avoid repeated downloads
+ try:
+ log.warn("Downloading %s", url)
+ src = urlopen(url)
+ # Read/write all in one block, so we don't create a corrupt file
+ # if the download is interrupted.
+ data = src.read()
+ dst = open(saveto, "wb")
+ dst.write(data)
+ finally:
+ if src:
+ src.close()
+ if dst:
+ dst.close()
+ return os.path.realpath(saveto)
+
+
+def _patch_file(path, content):
+ """Will backup the file then patch it"""
+ existing_content = open(path).read()
+ if existing_content == content:
+ # already patched
+ log.warn('Already patched.')
+ return False
+ log.warn('Patching...')
+ _rename_path(path)
+ f = open(path, 'w')
+ try:
+ f.write(content)
+ finally:
+ f.close()
+ return True
+
+
+def _same_content(path, content):
+ return open(path).read() == content
+
+def _no_sandbox(function):
+ def __no_sandbox(*args, **kw):
+ try:
+ from setuptools.sandbox import DirectorySandbox
+ def violation(*args):
+ pass
+ DirectorySandbox._old = DirectorySandbox._violation
+ DirectorySandbox._violation = violation
+ patched = True
+ except ImportError:
+ patched = False
+
+ try:
+ return function(*args, **kw)
+ finally:
+ if patched:
+ DirectorySandbox._violation = DirectorySandbox._old
+ del DirectorySandbox._old
+
+ return __no_sandbox
+
+@_no_sandbox
+def _rename_path(path):
+ new_name = path + '.OLD.%s' % time.time()
+ log.warn('Renaming %s into %s', path, new_name)
+ os.rename(path, new_name)
+ return new_name
+
+def _remove_flat_installation(placeholder):
+ if not os.path.isdir(placeholder):
+ log.warn('Unknown installation at %s', placeholder)
+ return False
+ found = False
+ for file in os.listdir(placeholder):
+ if fnmatch.fnmatch(file, 'setuptools*.egg-info'):
+ found = True
+ break
+ if not found:
+ log.warn('Could not locate setuptools*.egg-info')
+ return
+
+ log.warn('Removing elements out of the way...')
+ pkg_info = os.path.join(placeholder, file)
+ if os.path.isdir(pkg_info):
+ patched = _patch_egg_dir(pkg_info)
+ else:
+ patched = _patch_file(pkg_info, SETUPTOOLS_PKG_INFO)
+
+ if not patched:
+ log.warn('%s already patched.', pkg_info)
+ return False
+ # now let's move the files out of the way
+ for element in ('setuptools', 'pkg_resources.py', 'site.py'):
+ element = os.path.join(placeholder, element)
+ if os.path.exists(element):
+ _rename_path(element)
+ else:
+ log.warn('Could not find the %s element of the '
+ 'Setuptools distribution', element)
+ return True
+
+
+def _after_install(dist):
+ log.warn('After install bootstrap.')
+ placeholder = dist.get_command_obj('install').install_purelib
+ _create_fake_setuptools_pkg_info(placeholder)
+
+@_no_sandbox
+def _create_fake_setuptools_pkg_info(placeholder):
+ if not placeholder or not os.path.exists(placeholder):
+ log.warn('Could not find the install location')
+ return
+ pyver = '%s.%s' % (sys.version_info[0], sys.version_info[1])
+ setuptools_file = 'setuptools-%s-py%s.egg-info' % \
+ (SETUPTOOLS_FAKED_VERSION, pyver)
+ pkg_info = os.path.join(placeholder, setuptools_file)
+ if os.path.exists(pkg_info):
+ log.warn('%s already exists', pkg_info)
+ return
+
+ log.warn('Creating %s', pkg_info)
+ f = open(pkg_info, 'w')
+ try:
+ f.write(SETUPTOOLS_PKG_INFO)
+ finally:
+ f.close()
+
+ pth_file = os.path.join(placeholder, 'setuptools.pth')
+ log.warn('Creating %s', pth_file)
+ f = open(pth_file, 'w')
+ try:
+ f.write(os.path.join(os.curdir, setuptools_file))
+ finally:
+ f.close()
+
+def _patch_egg_dir(path):
+ # let's check if it's already patched
+ pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO')
+ if os.path.exists(pkg_info):
+ if _same_content(pkg_info, SETUPTOOLS_PKG_INFO):
+ log.warn('%s already patched.', pkg_info)
+ return False
+ _rename_path(path)
+ os.mkdir(path)
+ os.mkdir(os.path.join(path, 'EGG-INFO'))
+ pkg_info = os.path.join(path, 'EGG-INFO', 'PKG-INFO')
+ f = open(pkg_info, 'w')
+ try:
+ f.write(SETUPTOOLS_PKG_INFO)
+ finally:
+ f.close()
+ return True
+
+
+def _before_install():
+ log.warn('Before install bootstrap.')
+ _fake_setuptools()
+
+
+def _under_prefix(location):
+ if 'install' not in sys.argv:
+ return True
+ args = sys.argv[sys.argv.index('install')+1:]
+ for index, arg in enumerate(args):
+ for option in ('--root', '--prefix'):
+ if arg.startswith('%s=' % option):
+ top_dir = arg.split('%s=' % option)[-1]
+ return location.startswith(top_dir)
+ elif arg == option:
+ if len(args) > index:
+ top_dir = args[index+1]
+ return location.startswith(top_dir)
+ elif option == '--user' and USER_SITE is not None:
+ return location.startswith(USER_SITE)
+ return True
+
+
+def _fake_setuptools():
+ log.warn('Scanning installed packages')
+ try:
+ import pkg_resources
+ except ImportError:
+ # we're cool
+ log.warn('Setuptools or Distribute does not seem to be installed.')
+ return
+ ws = pkg_resources.working_set
+ try:
+ setuptools_dist = ws.find(pkg_resources.Requirement.parse('setuptools',
+ replacement=False))
+ except TypeError:
+ # old distribute API
+ setuptools_dist = ws.find(pkg_resources.Requirement.parse('setuptools'))
+
+ if setuptools_dist is None:
+ log.warn('No setuptools distribution found')
+ return
+ # detecting if it was already faked
+ setuptools_location = setuptools_dist.location
+ log.warn('Setuptools installation detected at %s', setuptools_location)
+
+ # if --root or --prefix was provided, and if
+ # setuptools is not located in them, we don't patch it
+ if not _under_prefix(setuptools_location):
+ log.warn('Not patching, --root or --prefix is installing Distribute'
+ ' in another location')
+ return
+
+ # let's see if it's an egg
+ if not setuptools_location.endswith('.egg'):
+ log.warn('Non-egg installation')
+ res = _remove_flat_installation(setuptools_location)
+ if not res:
+ return
+ else:
+ log.warn('Egg installation')
+ pkg_info = os.path.join(setuptools_location, 'EGG-INFO', 'PKG-INFO')
+ if (os.path.exists(pkg_info) and
+ _same_content(pkg_info, SETUPTOOLS_PKG_INFO)):
+ log.warn('Already patched.')
+ return
+ log.warn('Patching...')
+ # let's create a fake egg replacing the setuptools one
+ res = _patch_egg_dir(setuptools_location)
+ if not res:
+ return
+ log.warn('Patching done.')
+ _relaunch()
+
+
+def _relaunch():
+ log.warn('Relaunching...')
+ # we have to relaunch the process
+ args = [sys.executable] + sys.argv
+ sys.exit(subprocess.call(args))
+
+
+def _extractall(self, path=".", members=None):
+ """Extract all members from the archive to the current working
+ directory and set owner, modification time and permissions on
+ directories afterwards. `path' specifies a different directory
+ to extract to. `members' is optional and must be a subset of the
+ list returned by getmembers().
+ """
+ import copy
+ import operator
+ from tarfile import ExtractError
+ directories = []
+
+ if members is None:
+ members = self
+
+ for tarinfo in members:
+ if tarinfo.isdir():
+ # Extract directories with a safe mode.
+ directories.append(tarinfo)
+ tarinfo = copy.copy(tarinfo)
+ tarinfo.mode = 448 # decimal for oct 0700
+ self.extract(tarinfo, path)
+
+ # Reverse sort directories.
+ if sys.version_info < (2, 4):
+ def sorter(dir1, dir2):
+ return cmp(dir1.name, dir2.name)
+ directories.sort(sorter)
+ directories.reverse()
+ else:
+ directories.sort(key=operator.attrgetter('name'), reverse=True)
+
+ # Set correct owner, mtime and filemode on directories.
+ for tarinfo in directories:
+ dirpath = os.path.join(path, tarinfo.name)
+ try:
+ self.chown(tarinfo, dirpath)
+ self.utime(tarinfo, dirpath)
+ self.chmod(tarinfo, dirpath)
+ except ExtractError:
+ e = sys.exc_info()[1]
+ if self.errorlevel > 1:
+ raise
+ else:
+ self._dbg(1, "tarfile: %s" % e)
+
+
+def main(argv, version=DEFAULT_VERSION):
+ """Install or upgrade setuptools and EasyInstall"""
+ tarball = download_setuptools()
+ _install(tarball)
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
Property changes on: branches/rewrite/distribute_setup.py
___________________________________________________________________
Added: svn:eol-style
+ native
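The bootstrap file above is driven from a project's own setup.py, as the module docstring describes. A minimal sketch of that usage, assuming distribute_setup.py sits next to setup.py; the project name and module below are made up for illustration:

    # setup.py -- lives in the same directory as distribute_setup.py
    from distribute_setup import use_setuptools

    # Bootstrap distribute before importing setuptools; version and
    # download_delay are parameters of use_setuptools() above.
    use_setuptools(version='0.6.10', download_delay=15)

    from setuptools import setup

    setup(
        name='examplebot',          # hypothetical project name
        version='0.1',
        py_modules=['examplebot'],
    )

If distribute is already importable at the required version, use_setuptools() returns immediately; otherwise it downloads the tarball, builds an egg in the download directory, and inserts that egg at the front of sys.path.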
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9054
Revision: 9054
Author: xqt
Date: 2011-03-13 12:28:06 +0000 (Sun, 13 Mar 2011)
Log Message:
-----------
readme files for subdirs
Added Paths:
-----------
trunk/pywikipedia/cache/README
trunk/pywikipedia/category/README
Copied: trunk/pywikipedia/cache/README (from rev 9049, trunk/pywikipedia/botlists/README)
===================================================================
--- trunk/pywikipedia/cache/README (rev 0)
+++ trunk/pywikipedia/cache/README 2011-03-13 12:28:06 UTC (rev 9054)
@@ -0,0 +1,2 @@
+This directory/folder is empty when you get the package. It is used by the
+robots.
Copied: trunk/pywikipedia/category/README (from rev 9049, trunk/pywikipedia/botlists/README)
===================================================================
--- trunk/pywikipedia/category/README (rev 0)
+++ trunk/pywikipedia/category/README 2011-03-13 12:28:06 UTC (rev 9054)
@@ -0,0 +1,2 @@
+This directory/folder is empty when you get the package. It is used by the
+robots.
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9053
Revision: 9053
Author: xqt
Date: 2011-03-13 12:24:49 +0000 (Sun, 13 Mar 2011)
Log Message:
-----------
stripped trailing whitespace
Modified Paths:
--------------
trunk/pywikipedia/commonsdelinker/checkusage.py
trunk/pywikipedia/commonsdelinker/delinker.py
trunk/pywikipedia/commonsdelinker/image_replacer.py
trunk/pywikipedia/commonsdelinker/threadpool.py
Modified: trunk/pywikipedia/commonsdelinker/checkusage.py
===================================================================
--- trunk/pywikipedia/commonsdelinker/checkusage.py 2011-03-13 12:19:20 UTC (rev 9052)
+++ trunk/pywikipedia/commonsdelinker/checkusage.py 2011-03-13 12:24:49 UTC (rev 9053)
@@ -1,32 +1,32 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-This module provides a way for users of the Wikimedia toolserver to check the
+This module provides a way for users of the Wikimedia toolserver to check the
use of images from Commons on other Wikimedia wikis. It supports both running
-checkusage against the database and against the live wikis. It is very
-efficient as it only creates one HTTP connection and one MySQL connection
+checkusage against the database and against the live wikis. It is very
+efficient as it only creates one HTTP connection and one MySQL connection
during its lifetime. It is not suitable for multithreading!
-
+
The CheckUsage class' constructor accepts as parameters the maximum number of
wikis that should be checked, an option to use it only live and the parameters
-to connect to the MySQL database. The top wikis in size will be checked. The
+to connect to the MySQL database. The top wikis in size will be checked. The
class provides multiple methods:
-
+
get_usage(image)
-This method will return a generator object that generates the usage of the
+This method will return a generator object that generates the usage of the
image, returned as the following tuple: (page_namespace, page_title,
full_title). page_namespace is the numeric namespace, page_title the page title
without namespace, full_title the page title including localized namespace.
-
+
get_usage_db(dbname, image), get_usage_live(domain, image)
Those methods allow querying a specific wiki, respectively against the database
and against the live wiki. They accept respectively the database name and the
-domain name. They return a generator which generates the same results as
+domain name. They return a generator which generates the same results as
get_usage().
-
+
get_usage_multi(images)
Calls get_usage for each image and returns a dictionary with usages.
-
+
get_replag(dbname)
Returns the time in seconds since the latest known edit of dbname.
"""
@@ -37,13 +37,13 @@
#
__version__ = '$Id$'
#
-
+
import httplib, urlparse, socket, time
from urllib import urlencode
import simplejson
import wikipedia, family
-
+
try:
import MySQLdb
except ImportError:
@@ -53,7 +53,7 @@
except ImportError:
pass
__ver__ = '0.4c'
-
+
def strip_ns(title):
title = title.replace(' ', '_')
if title.find(':') != -1:
@@ -63,14 +63,14 @@
if title.startswith('Image:'):
return strip_ns(title)
return title
-
+
def family(domain):
if domain is None:
raise RuntimeError('None is not a valid family')
-
+
wiki = domain.split('.')
# Standard family
- if wiki[1] in ('wikipedia', 'wiktionary', 'wikibooks',
+ if wiki[1] in ('wikipedia', 'wiktionary', 'wikibooks',
'wikiquote', 'wikisource', 'wikinews', 'wikiversity'):
return wiki[0], wiki[1]
# Family on own domain
@@ -92,7 +92,7 @@
#self._conn.set_debuglevel(100)
self._conn.connect()
- def request(self, method, path, headers, data):
+ def request(self, method, path, headers, data):
if not headers: headers = {}
if not data: data = ''
headers['Connection'] = 'Keep-Alive'
@@ -143,28 +143,28 @@
data = simplejson.load(res)
finally:
res.close()
-
+
if 'error' in data:
if data['error']['code'] == u'internal_api_error_DBConnectionError':
return self.query_api(host, path, **kwargs)
- raise wikipedia.Error(data['error']['code'],
+ raise wikipedia.Error(data['error']['code'],
data['error']['info'])
-
+
return data
def close(self):
self._conn.close()
class HTTPPool(list):
- def __init__(self, retry_timeout = 10, max_retries = -1,
+ def __init__(self, retry_timeout = 10, max_retries = -1,
callback = lambda *args: None):
-
+
self.retry_timeout = retry_timeout
self.max_retries = -1
self.callback = callback
self.current_retry = 0
-
+
list.__init__(self, ())
-
+
def query_api(self, host, path, **kwargs):
conn = self.find_conn(host)
while True:
@@ -180,7 +180,7 @@
self.wait()
conn = self.find_conn(host)
-
+
def find_conn(self, host):
for conn in self:
if host in conn.hosts:
@@ -199,37 +199,37 @@
conn.hosts = []
self.append(conn)
return self
-
+
def wait(self):
if self.current_retry > self.max_retries and self.max_retries != -1:
raise RuntimeError('Maximum retries exceeded')
if self.current_retry:
self.callback(self)
- time.sleep(self.current_retry * self.retry_timeout)
+ time.sleep(self.current_retry * self.retry_timeout)
self.current_retry += 1
-
+
def close(self):
for conn in self:
conn.close()
del self[:]
-
+
class CheckUsage(object):
- def __init__(self, limit = 100,
+ def __init__(self, limit = 100,
mysql_default_server = 3, mysql_host_prefix = 'sql-s', mysql_host_suffix = '',
- mysql_kwargs = {}, no_db = False, use_autoconn = False,
-
- http_retry_timeout = 30, http_max_retries = -1,
+ mysql_kwargs = {}, no_db = False, use_autoconn = False,
+
+ http_retry_timeout = 30, http_max_retries = -1,
http_callback = lambda *args: None,
-
+
mysql_retry_timeout = 60,
mysql_max_retries = -1, mysql_callback = lambda *args: None):
-
- self.http = None
+
+ self.http = None
self.http_retry_timeout = http_retry_timeout
self.http_max_retries = http_max_retries
self.http_callback = http_callback
-
+
if no_db: return
self.mysql_host_prefix = mysql_host_prefix
@@ -239,18 +239,18 @@
self.mysql_retry_timeout = mysql_retry_timeout
self.mysql_max_retries = mysql_max_retries
self.mysql_callback = mysql_callback
-
+
self.connections = []
-
+
# Mapping database name -> mysql connection
self.databases = {}
# Mapping server id -> mysql connection
self.servers = {}
# Mapping database name -> (lang, family)
self.sites = {}
-
+
self.domains = {}
-
+
self.unknown_families = []
# Mapping family name -> family object
self.known_families = {}
@@ -263,7 +263,7 @@
for dbname, domain, server in cursor.fetchall():
if server not in self.servers:
self.servers[server] = self.connect_mysql(mysql_host_prefix + str(server) + mysql_host_suffix)
-
+
# FIXME: wikimediafoundation!
# TODO: This is one big mess
try:
@@ -275,7 +275,7 @@
else:
self.sites[dbname] = (lang, fam)
self.databases[dbname] = self.servers[server]
-
+
self.domains[dbname] = domain
@@ -286,8 +286,8 @@
if self.use_autoconn:
database = mysql_autoconnection.connect(
use_unicode = False, host = host,
- retry_timeout = self.mysql_retry_timeout,
- max_retries = self.mysql_max_retries,
+ retry_timeout = self.mysql_retry_timeout,
+ max_retries = self.mysql_max_retries,
callback = self.mysql_callback,
**self.mysql_kwargs)
else:
@@ -298,7 +298,7 @@
return database, cursor
def connect_http(self):
if not self.http:
- self.http = HTTPPool(retry_timeout = self.http_retry_timeout,
+ self.http = HTTPPool(retry_timeout = self.http_retry_timeout,
max_retries = self.http_max_retries, callback = self.http_callback)
def get_usage(self, image):
@@ -311,14 +311,14 @@
#image = strip_image(image)
lang, family_name = self.sites[dbname]
family = self.known_families[family_name]
-
+
if family.shared_image_repository(lang) != (lang, family_name) and shared:
left_join = 'LEFT JOIN %s.image ON (il_to = img_name) WHERE img_name IS NULL AND' % dbname
else:
left_join = 'WHERE';
query = """SELECT page_namespace, page_title FROM %s.page, %s.imagelinks
%s page_id = il_from AND il_to = %%s"""
- self.databases[dbname][1].execute(query % (dbname, dbname, left_join),
+ self.databases[dbname][1].execute(query % (dbname, dbname, left_join),
(image.encode('utf-8', 'ignore'), ))
for page_namespace, page_title in self.databases[dbname][1]:
stripped_title = page_title.decode('utf-8', 'ignore')
@@ -330,32 +330,32 @@
def get_usage_live(self, site, image, shared = False):
self.connect_http()
-
+
if type(site) is str:
hostname = site
apipath = '/w/api.php'
else:
hostname = site.hostname()
apipath = site.apipath()
-
+
# FIXME: Use continue
kwargs = {'action': 'query', 'iutitle': u'Image:' + image,
'titles': u'Image:' + image, 'prop': 'info'}
kwargs['list'] = 'imageusage'
kwargs['iulimit'] = '500'
-
+
res = self.http.query_api(hostname, apipath,
**kwargs)
if '-1' not in res['query']['pages'] and shared:
return
-
+
usages = res['query'].get('imageusage')
if not usages: return
-
+
# Apparently this changed from dict to list at some point?
if type(usages) is dict:
usages = usages.values()
-
+
for usage in usages:
title = usage['title'].replace(' ', '_')
namespace = usage['ns']
@@ -365,7 +365,7 @@
stripped_title = title
yield namespace, stripped_title, title
-
+
def exists(self, site, image):
self.connect_http()
# Check whether the image still is deleted on Commons.
@@ -375,8 +375,8 @@
# BUG: This is ugly.
return '-1' not in self.http.query_api(site.hostname(), site.apipath(),
action = 'query', titles = 'Image:' + image)['query']['pages']
-
-
+
+
def close(self):
if getattr(self, 'http'):
self.http.close()
@@ -384,7 +384,6 @@
for connection, cursor in self.databases.itervalues():
try:
connection.close()
- except:
+ except:
pass
-
\ No newline at end of file
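The module docstring above sketches how CheckUsage is meant to be consumed. A short example of a live-only query, assuming no toolserver database is available (no_db=True skips the MySQL setup in CheckUsage.__init__); the host and image name are placeholders:

    import checkusage

    cu = checkusage.CheckUsage(limit=100, no_db=True)
    # get_usage_live() accepts a hostname string (or a Site object) and
    # yields (page_namespace, page_title, full_title) tuples.
    for ns, title, full_title in cu.get_usage_live('commons.wikimedia.org',
                                                   u'Example.jpg'):
        print ns, full_title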
Modified: trunk/pywikipedia/commonsdelinker/delinker.py
===================================================================
--- trunk/pywikipedia/commonsdelinker/delinker.py 2011-03-13 12:19:20 UTC (rev 9052)
+++ trunk/pywikipedia/commonsdelinker/delinker.py 2011-03-13 12:24:49 UTC (rev 9053)
@@ -1,7 +1,7 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-This script keeps track of image deletions and delinks removed files
+This script keeps track of image deletions and delinks removed files
from (any) wiki. Protected pages and pages containing blacklisted
external links cannot be processed.
@@ -15,7 +15,7 @@
Please refer to delinker.txt for full documentation.
"""
#
-#
+#
# (C) Kyle/Orgullomoore, 2006-2007
# (C) Siebrand Mazeland, 2006-2007
# (C) Bryan Tong Minh, 2007-2008
@@ -55,7 +55,7 @@
output(u'%s Connection has been lost in %s. Attempting reconnection.' % (threading.currentThread(), repr(object)), False)
if hasattr(object, 'error'):
output(u'Error was %s: %s' % tuple(object.error))
-
+
def universal_unicode(s):
if type(s) is str:
return s.decode('utf-8', 'ignore')
@@ -75,11 +75,11 @@
# the standard MySQL character set.
kwargs['use_unicode'] = False
kwargs['callback'] = wait_callback
-
+
return mysql_autoconnection.connect(**kwargs)
# TODO: Add support for sqlite3
raise RuntimeError('Unsupported database engine %s' % engine)
-
+
class ImmutableByReference(object):
def __init__(self, data):
self.data = data
@@ -100,30 +100,30 @@
threadpool.Thread.__init__(self, pool)
self.CommonsDelinker = CommonsDelinker
self.sql_layout = self.CommonsDelinker.config.get('sql_layout', 'new')
-
+
def delink_image(self, image, usage, timestamp, admin, reason, replacement = None):
""" Performs the delink for image on usage. """
output(u'%s Usage of %s: %s' % (self, image, usage))
if self.CommonsDelinker.exec_hook('before_delink',
(image, usage, timestamp, admin, reason, replacement)) is False:
return
-
+
skipped_images = {}
for (lang, family), pages in usage.iteritems():
site = self.CommonsDelinker.get_site(lang, family)
if not site:
output(u'%s Warning! Unknown site %s:%s' % (self, family, lang))
continue
-
+
try:
summary = self.get_summary(site, image, admin, reason, replacement)
-
+
for page_namespace, page_title, title in pages:
if (site.lang, site.family.name) == (self.CommonsDelinker.site.lang,
self.CommonsDelinker.site.family.name) and \
(page_namespace, page_title) == (6, image):
continue
-
+
if self.CommonsDelinker.set_edit(str(site), title):
# The page is currently being edited. Postpone.
if (lang, family) not in skipped_images:
@@ -133,7 +133,7 @@
else:
# Delink the image
output(u'%s Delinking %s from %s' % (self, image, site))
-
+
try:
try:
result = self.replace_image(image, site, title, summary, replacement)
@@ -147,14 +147,14 @@
(page_namespace, page_title, title))
finally:
self.CommonsDelinker.unset_edit(str(site), title)
-
+
# Add to logging queue
if self.sql_layout == 'new':
- self.CommonsDelinker.Loggers.append((timestamp, image,
+ self.CommonsDelinker.Loggers.append((timestamp, image,
site.lang, site.family.name, page_namespace, page_title,
result, replacement))
else:
- self.CommonsDelinker.Loggers.append((timestamp, image, site.hostname(),
+ self.CommonsDelinker.Loggers.append((timestamp, image, site.hostname(),
page_namespace, page_title, result, replacement))
finally:
self.CommonsDelinker.unlock_site(site)
@@ -168,14 +168,14 @@
elif replacement:
# Let them know that we are done replacing.
self.CommonsDelinker.Loggers.append((timestamp, image, replacement))
-
+
def replace_image(self, image, site, page_title, summary, replacement = None):
""" The actual replacement. Giving None as argument for replacement
will delink instead of replace."""
-
+
page = wikipedia.Page(site, page_title)
hook = None
-
+
# TODO: Per site config.
if page.namespace() in self.CommonsDelinker.config['delink_namespaces']:
try:
@@ -183,25 +183,25 @@
except wikipedia.NoPage:
return 'failed'
new_text = text
-
+
m_image = ImmutableByReference(image)
m_replacement = ImmutableByReference(replacement)
- self.CommonsDelinker.exec_hook('before_replace',
+ self.CommonsDelinker.exec_hook('before_replace',
(page, summary, m_image, m_replacement))
image = m_image.get()
replacement = m_replacement.get()
-
+
def create_regex(s):
first, other = re.escape(s[0]), re.escape(s[1:])
return ur'(?:[%s%s]%s)' % (first.upper(), first.lower(), other)
def create_regex_i(s):
return ur'(?:%s)' % u''.join([u'[%s%s]' % (c.upper(), c.lower()) for c in s])
-
+
namespaces = site.namespace(6, all = True) + site.namespace(-2, all = True)
r_namespace = ur'\s*(?:%s)\s*\:\s*' % u'|'.join(map(create_regex_i, namespaces))
# Note that this regex creates a group!
r_image = u'(%s)' % create_regex(image).replace(r'\_', '[ _]')
-
+
def simple_replacer(match):
m_replacement = ImmutableByReference(replacement)
groups = list(match.groups())
@@ -209,21 +209,21 @@
if False is self.CommonsDelinker.exec_hook('%s_replace' % hook,
(page, summary, image, m_replacement, match, groups)):
return u''.join(groups)
-
+
if m_replacement.get() is None:
return u''
else:
groups[1] = m_replacement.get()
return u''.join(groups)
-
- # Previously, links in image descriptions would cause
+
+ # Previously, links in image descriptions would cause
# unexpected behaviour: [[Image:image.jpg|thumb|[[link]] in description]]
# would truncate at the first occurrence of ]]. This cannot be
# fixed using one regular expression.
# This means that all ]] after the start of the image
# must be located. If it then does not have an associated
# [[, this one is the closure of the image.
-
+
r_simple_s = u'(\[\[%s)%s' % (r_namespace, r_image)
r_s = '\[\['
r_e = '\]\]'
@@ -231,25 +231,25 @@
image_starts = [match.start() for match in re.finditer(r_simple_s, text)]
link_starts = [match.start() for match in re.finditer(r_s, text)]
link_ends = [match.end() for match in re.finditer(r_e, text)]
-
+
r_simple = u'(\[\[%s)%s(.*)' % (r_namespace, r_image)
hook = 'simple'
replacements = []
for image_start in image_starts:
- current_link_starts = [link_start for link_start in link_starts
+ current_link_starts = [link_start for link_start in link_starts
if link_start > image_start]
- current_link_ends = [link_end for link_end in link_ends
+ current_link_ends = [link_end for link_end in link_ends
if link_end > image_start]
end = image_start
if current_link_ends: end = current_link_ends[0]
-
+
while current_link_starts and current_link_ends:
start = current_link_starts.pop(0)
end = current_link_ends.pop(0)
if end <= start and end > image_start:
# Found the end of the image
break
-
+
# Check whether this image is the first one on the line
if image_start == 0:
prev = ''
@@ -262,38 +262,38 @@
end += 1
else:
break
-
+
# Add the replacement to the todo list. Doing the
# replacement right know would alter the indices.
replacements.append((new_text[image_start:end],
- re.sub(r_simple, simple_replacer,
+ re.sub(r_simple, simple_replacer,
new_text[image_start:end])))
-
+
# Perform the replacements
for old, new in replacements:
if old: new_text = new_text.replace(old, new)
-
+
# Remove the image from galleries
hook = 'gallery'
- r_galleries = ur'(?s)(\<%s\>)(.*?)(\<\/%s\>)' % (create_regex_i('gallery'),
+ r_galleries = ur'(?s)(\<%s\>)(.*?)(\<\/%s\>)' % (create_regex_i('gallery'),
create_regex_i('gallery'))
r_gallery = ur'(?m)^((?:%s)?)%s(\s*(?:\|.*?)?\s*$)' % (r_namespace, r_image)
def gallery_replacer(match):
- return ur'%s%s%s' % (match.group(1), re.sub(r_gallery,
+ return ur'%s%s%s' % (match.group(1), re.sub(r_gallery,
simple_replacer, match.group(2)), match.group(3))
new_text = re.sub(r_galleries, gallery_replacer, new_text)
-
+
if text == new_text or self.CommonsDelinker.config.get('force_complex', False):
# All previous steps did not work, so the image is
# likely embedded in a complicated template.
hook = 'complex'
r_templates = ur'(?s)(\{\{.*?\}\})'
r_complicated = u'(?s)(?<=[|{=])[\s\u200E\uFEFF\u200B\u200C]*((?:%s)?)%s[\u200E\uFEFF\u200B\u200C]*' % (r_namespace, r_image)
-
+
def template_replacer(match):
return re.sub(r_complicated, simple_replacer, match.group(1))
new_text = re.sub(r_templates, template_replacer, text)
-
+
if text != new_text:
# Save to the wiki
# Code for checking user page existence has been moved
@@ -304,7 +304,7 @@
if False is self.CommonsDelinker.exec_hook('before_save',
(page, text, new_text, m_summary)):
return 'skipped'
-
+
is_retry = False
while True:
try:
@@ -330,18 +330,18 @@
else:
return 'skipped'
return 'skipped'
-
-
-
+
+
+
def do(self, args):
try:
self.delink_image(*args)
except:
output(u'An exception occured in %s' % self, False)
traceback.print_exc(file = sys.stderr)
-
+
def get_summary(self, site, image, admin, reason, replacement):
- """ Get the summary template and substitute the
+ """ Get the summary template and substitute the
correct values."""
# FIXME: Hardcode is EVIL, but now only the global bot uses this
if (site.lang != 'commons' and self.CommonsDelinker.config['global']):
@@ -350,7 +350,7 @@
tlp = self.CommonsDelinker.SummaryCache.get(site, 'replace-I18n')
else:
tlp = self.CommonsDelinker.SummaryCache.get(site, 'summary-I18n')
-
+
tlp = tlp.replace('$1', image)
if replacement:
tlp = tlp.replace('$2', replacement)
@@ -359,23 +359,23 @@
else:
tlp = tlp.replace('$2', unicode(admin))
tlp = tlp.replace('$3', unicode(reason))
-
+
return tlp
-
+
class SummaryCache(object):
""" Object to thread-safe cache summary templates. """
def __init__(self, CommonsDelinker):
self.summaries = {}
self.lock = threading.Lock()
self.CommonsDelinker = CommonsDelinker
-
+
def get(self, site, type, key = None, default = None):
- # This can probably also provide something for
- # localised settings, but then it first needs to
+ # This can probably also provide something for
+ # localised settings, but then it first needs to
# check whether the page is sysop only.
if not key:
key = str(site)
-
+
self.lock.acquire()
try:
if type not in self.summaries:
@@ -385,9 +385,9 @@
self.CommonsDelinker.config['summary_cache']:
# Return cached result
return self.summaries[type][key][0]
-
+
output(u'%s Fetching new summary for %s' % (self, site))
-
+
# FIXME: evil
if self.CommonsDelinker.config['global']:
self.check_user_page(site)
@@ -402,25 +402,25 @@
pass
finally:
self.lock.release()
-
+
# No i18n available, but it may be available in the wikipedia
# of that language. Only do so for wiktionary, wikibooks,
# wikiquote, wikisource, wikinews, wikiversity
# This will cause the bot to function even on special wikis
# like mediawiki.org and meta and species.
output(u'%s Using default summary for %s' % (self, site))
-
+
if default: return default
-
+
if site.family.name != 'wikipedia' and self.CommonsDelinker.config['global']:
- if site.family.name in ('wiktionary', 'wikibooks', 'wikiquote',
+ if site.family.name in ('wiktionary', 'wikibooks', 'wikiquote',
'wikisource', 'wikinews', 'wikiversity'):
if site.lang in config.usernames['wikipedia']:
- newsite = self.CommonsDelinker.get_site(site.lang,
+ newsite = self.CommonsDelinker.get_site(site.lang,
wikipedia.Family('wikipedia'))
return self.get(newsite, type, key = key)
return self.CommonsDelinker.config['default_settings'].get(type, '')
-
+
def check_user_page(self, site):
"Check whether a userpage exists. Only used for CommonsDelinker."
try:
@@ -435,24 +435,24 @@
ftxt = f.read()
f.close()
if not '#' + str(site) in ftxt:
- username = config.usernames[site.family.name][site.lang]
-
+ username = config.usernames[site.family.name][site.lang]
+
userpage = wikipedia.Page(site, 'User:' + username)
- # Removed check for page existence. If it is not in our
+ # Removed check for page existence. If it is not in our
# database we can safely assume that we have no user page
# there. In case there is, we will just overwrite it once.
- # It causes no real problems, but it saves one call to the
+ # It causes no real problems, but it saves one call to the
# servers.
# TODO: Config setting?
userpage.put('#REDIRECT [[m:User:CommonsDelinker]]', '')
-
+
f = open(filename, 'a')
f.write('#' + str(site))
f.close()
except wikipedia.LockedPage:
# User page is protected, continue anyway
- pass
-
+ pass
+
class CheckUsage(threadpool.Thread):
timeout = 120
def __init__(self, pool, CommonsDelinker):
@@ -460,14 +460,14 @@
self.CommonsDelinker = CommonsDelinker
# Not really thread safe, but we should only do read operations...
self.site = CommonsDelinker.site
-
+
def run(self):
try:
self.connect()
except:
return self.exit()
threadpool.Thread.run(self)
-
+
def connect(self):
output(u'%s Connecting to databases' % self)
config = self.CommonsDelinker.config
@@ -475,22 +475,22 @@
# Note: global use requires MySQL
self.CheckUsage = checkusage.CheckUsage(limit = sys.maxint,
mysql_kwargs = config['sql_config'],
- use_autoconn = True,
+ use_autoconn = True,
http_callback = wait_callback,
mysql_callback = wait_callback,
mysql_host_suffix = '-fast')
else:
self.CheckUsage = checkusage.CheckUsage(sys.maxint,
http_callback = wait_callback, no_db = True)
-
-
+
+
def check_usage(self, image, timestamp, admin, reason, replacement):
""" Check whether this image needs to be delinked. """
-
+
# Check whether the image still is deleted on Commons.
# BUG: This also returns true for images with a page, but
# without the image itself. Can be fixed by querying query.php
- # instead of api.php. Also, should this be made an exists()
+ # instead of api.php. Also, should this be made an exists()
# method of checkusage.CheckUsage?
if self.site.shared_image_repository() != (None, None):
shared_image_repository = self.CommonsDelinker.get_site(*self.site.shared_image_repository())
@@ -505,12 +505,12 @@
not bool(replacement):
output(u'%s %s exists again!' % (self, image))
return
-
-
+
+
if self.CommonsDelinker.config['global']:
usage = self.CheckUsage.get_usage(image)
usage_domains = {}
-
+
count = 0
# Sort usage per domain
for (lang, family), (page_namespace, page_title, title) in usage:
@@ -520,21 +520,21 @@
count += 1
else:
#FIX!
- usage_domains = {(self.site.lang, self.site.family.name):
- list(self.CheckUsage.get_usage_live(self.site,
+ usage_domains = {(self.site.lang, self.site.family.name):
+ list(self.CheckUsage.get_usage_live(self.site,
image))}
count = len(usage_domains[(self.site.lang, self.site.family.name)])
-
+
output(u'%s %s used on %s pages' % (self, image, count))
-
+
if count:
# Pass the usage to the Delinker pool along with other arguments
- self.CommonsDelinker.Delinkers.append((image, usage_domains,
+ self.CommonsDelinker.Delinkers.append((image, usage_domains,
timestamp, admin, reason, replacement))
elif replacement:
# Record replacement done
self.CommonsDelinker.Loggers.append((timestamp, image, replacement))
-
+
def do(self, args):
try:
self.check_usage(*args)
@@ -544,12 +544,12 @@
traceback.print_exc(file = sys.stderr)
self.exit()
self.CommonsDelinker.thread_died()
-
+
def starve(self):
self.pool.jobLock.acquire()
try:
if self.pool[id(self)].isSet(): return False
-
+
output(u'%s Starving' % self)
self.CheckUsage.close()
del self.pool[id(self)]
@@ -557,66 +557,66 @@
return True
finally:
self.pool.jobLock.release()
-
+
class Logger(threadpool.Thread):
timeout = 360
-
+
def __init__(self, pool, CommonsDelinker):
threadpool.Thread.__init__(self, pool)
self.CommonsDelinker = CommonsDelinker
self.sql_layout = self.CommonsDelinker.config.get('sql_layout', 'new')
self.enabled = self.CommonsDelinker.config.get('enable_logging', True)
-
+
def run(self):
self.connect()
threadpool.Thread.run(self)
-
+
def connect(self):
output(u'%s Connecting to log database' % self)
self.database = connect_database()
self.cursor = self.database.cursor()
-
-
+
+
def log_result_legacy(self, timestamp, image, domain, namespace, page, status = "ok", newimage = None):
# TODO: Make sqlite3 ready
-
+
# The original delinker code cached log results,
# in order to limit the number of connections.
# However, since we are now using persistent
# connections, we can safely insert the result
# on the fly.
output(u'%s Logging %s for %s on %s' % (self, repr(status), image, page))
-
+
# There is no need to escape each parameter if
- # a parametrized call is made.
+ # a parametrized call is made.
self.cursor.execute("""INSERT INTO %s (timestamp, img, wiki, page_title,
namespace, status, newimg) VALUES
(%%s, %%s, %%s, %%s, %%s, %%s, %%s)""" % self.CommonsDelinker.config['log_table'],
(timestamp, image, domain, page, namespace, status, newimage))
self.database.commit()
-
- def log_result_new(self, timestamp, image, site_lang, site_family,
+
+ def log_result_new(self, timestamp, image, site_lang, site_family,
page_namespace, page_title, status = 'ok', new_image = None):
-
+
output(u'%s Logging %s for %s on %s' % (self, repr(status), image, page_title))
self.cursor.execute("""INSERT INTO %s (timestamp, image, site_lang, site_family,
page_namespace, page_title, status, new_image) VALUES
(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)""" % self.CommonsDelinker.config['log_table'],
- (timestamp, image, site_lang, site_family, page_namespace, page_title,
+ (timestamp, image, site_lang, site_family, page_namespace, page_title,
status, new_image))
self.database.commit()
-
+
def log_replacement(self, timestamp, old_image, new_image):
# TODO: Same as above
-
+
output(u'Replacing %s by %s done' % (old_image, new_image))
- self.cursor.execute("""UPDATE %s SET status = 'done' WHERE
- timestamp = %%s AND old_image = %%s AND
+ self.cursor.execute("""UPDATE %s SET status = 'done' WHERE
+ timestamp = %%s AND old_image = %%s AND
new_image = %%s""" % self.CommonsDelinker.config['replacer_table'],
(timestamp, old_image, new_image))
self.database.commit()
-
+
def do(self, args):
if not self.enabled: return
try:
@@ -633,12 +633,12 @@
traceback.print_exc(file = sys.stderr)
self.exit()
self.CommonsDelinker.thread_died()
-
+
def starve(self):
self.pool.jobLock.acquire()
try:
if self.pool[id(self)].isSet(): return False
-
+
output(u'%s Starving' % self)
self.database.close()
del self.pool[id(self)]
@@ -653,7 +653,7 @@
self.config = config.CommonsDelinker
self.site = wikipedia.getSite()
self.site.forceLogin()
-
+
# Initialize workers
self.CheckUsages = threadpool.ThreadPool(CheckUsage, self.config['checkusage_instances'], self)
self.Delinkers = threadpool.ThreadPool(Delinker, self.config['delinker_instances'], self)
@@ -661,34 +661,34 @@
self.Loggers = threadpool.ThreadPool(Logger, self.config['logger_instances'], self)
else:
self.Loggers = threadpool.ThreadPool(Logger, 1, self)
-
+
self.http = checkusage.HTTP(self.site.hostname())
-
+
self.edit_list = []
self.editLock = threading.Lock()
-
+
self.sites = {}
self.siteLock = threading.Lock()
-
+
self.SummaryCache = SummaryCache(self)
-
+
if self.config.get('enable_replacer', False):
self.connect_mysql()
-
+
if self.config.get('no_sysop', False):
# Don't edit as sysop
if hasattr(config, 'sysopnames'):
config.sysopnames = dict([(fam, {}) for fam in config.sysopnames.keys()])
-
+
self.last_check = time.time()
-
+
#if 'bot' in self.site.userGroups:
# self.log_limit = '5000'
#else:
# self.log_limit = '500'
self.log_limit = '500'
self.init_plugins()
-
+
def init_plugins(self, do_reload = False):
import plugins
self.hooks = {}
@@ -705,7 +705,7 @@
self.hooks[plugin.hook].append(plugin)
output(u"%s Loaded plugin %s for hook '%s'" % \
(self, plugin, plugin.hook))
-
+
def exec_hook(self, name, args):
# TODO: Threadsafety!
if name in self.hooks:
@@ -729,16 +729,16 @@
self.hooks[name].remove(plugin)
finally:
self.siteLock.release()
-
+
def reload_plugins(signalnum, stack):
pass
-
+
def connect_mysql(self):
self.database = connect_database()
self.cursor = self.database.cursor()
-
+
def set_edit(self, domain, page):
- """ Make sure the bot does not create edit
+ """ Make sure the bot does not create edit
conflicts with itself."""
self.editLock.acquire()
being_editted = (domain, page) in self.edit_list
@@ -751,9 +751,9 @@
self.editLock.acquire()
self.edit_list.remove((domain, page))
self.editLock.release()
-
+
def get_site(self, code, fam):
- # Threadsafe replacement of wikipedia.getSite
+ # Threadsafe replacement of wikipedia.getSite
key = '%s:%s' % (code, fam)
self.siteLock.acquire()
try:
@@ -779,34 +779,34 @@
self.sites[key][self.sites[key].index((site, True))] = (site, False)
finally:
self.siteLock.release()
-
-
+
+
def read_deletion_log(self):
ts_format = '%Y-%m-%dT%H:%M:%SZ'
wait = self.config['delink_wait']
exclusion = self.config['exclude_string']
-
+
ts_from = self.last_check
# Truncate -> int()
ts_end = int(time.time())
self.last_check = ts_end
-
+
# Format as a MediaWiki timestamp and subtract a
# certain wait period.
ts_from_s = time.strftime(ts_format, time.gmtime(ts_from - wait + 1))
ts_end_s = time.strftime(ts_format, time.gmtime(ts_end - wait))
-
+
try:
# Assume fewer than 500 deletions have been made between
- # this and the previous check of the log. If this is not
+ # this and the previous check of the log. If this is not
# the case, timeout should be set lower.
result = self.http.query_api(self.site.hostname(), self.site.apipath(),
- action = 'query', list = 'logevents', letype = 'delete',
- lelimit = self.log_limit, lestart = ts_from_s, leend = ts_end_s,
+ action = 'query', list = 'logevents', letype = 'delete',
+ lelimit = self.log_limit, lestart = ts_from_s, leend = ts_end_s,
ledir = 'newer')
logevents = result['query']['logevents']
except Exception, e:
- if type(e) in (SystemError, KeyboardInterrupt): raise
+ if type(e) in (SystemError, KeyboardInterrupt): raise
# Something happened, but since it is a network error,
# it will not be critical. In order to prevent data loss
# the last_check timestamp has to be set correctly.
@@ -814,7 +814,7 @@
output('Warning! Unable to read deletion logs', False)
output('%s: %s' % (e.__class__.__name__, str(e)), False)
return time.sleep(self.config['timeout'])
-
+
for logevent in logevents:
if logevent['ns'] == 6 and logevent['action'] == 'delete':
if exclusion not in logevent.get('comment', ''):
@@ -823,14 +823,14 @@
timestamp = timestamp.replace(':', '')
timestamp = timestamp.replace('T', '')
timestamp = timestamp.replace('Z', '')
-
+
output(u'Deleted image: %s' % logevent['title'])
self.CheckUsages.append((checkusage.strip_ns(logevent['title']),
timestamp, logevent['user'], logevent.get('comment', ''),
None))
else:
output(u'Skipping deleted image: %s' % logevent['title'])
-
+
def read_replacement_log(self):
# TODO: Make sqlite3 ready
# TODO: Single process replacer
@@ -845,22 +845,22 @@
self.CheckUsages.append((old_image, timestamp, user, comment, new_image))
output(u'Replacing %s by %s' % (old_image, new_image))
self.cursor.execute(update, ('ok', id))
-
+
self.database.commit()
-
+
def start(self):
# Gracefully exit all threads on SIG_INT or SIG_TERM
threadpool.catch_signals()
-
+
# Start threads
self.Loggers.start()
self.Delinkers.start()
self.CheckUsages.start()
-
+
# Give threads some time to initialize
time.sleep(self.config['timeout'])
output(u'All workers started')
-
+
# Main loop
while True:
if self.config.get('enable_delinker', True):
@@ -871,17 +871,17 @@
self.read_deletion_log()
if self.config.get('enable_replacer', False):
self.read_replacement_log()
-
+
time.sleep(self.config['timeout'])
-
+
def thread_died(self):
# Obsolete
return
-
+
@staticmethod
def output(*args):
return output(*args)
-
+
def output(message, toStdout = True):
message = time.strftime('[%Y-%m-%d %H:%M:%S] ') + message
wikipedia.output(message, toStdout = toStdout)
@@ -895,16 +895,16 @@
output(u'Running ' + __version__)
CD = CommonsDelinker()
output(u'This bot runs from: ' + str(CD.site))
-
+
re._MAXCACHE = 4
-
+
args = wikipedia.handleArgs()
if '-since' in args:
# NOTE: Untested
ts_format = '%Y-%m-%d %H:%M:%S'
try:
since = time.strptime(
- args[args.index('-since') + 1],
+ args[args.index('-since') + 1],
ts_format)
except ValueError:
if args[args.index('-since') + 1][0] == '[' and \
@@ -917,7 +917,7 @@
output(u'Reading deletion log since [%s]' %\
time.strftime(ts_format, since))
CD.last_check = time.mktime(since)
-
+
try:
try:
CD.start()
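The delinking in replace_image() above hinges on two small regex helpers: create_regex() folds the case of only the first character (MediaWiki titles are case-sensitive except for their first letter), and create_regex_i() folds every character for namespace names. A standalone sketch of the first helper under Python 2 (matching the ur'' literals in the codebase), with a made-up file name; note how the later .replace(r'\_', '[ _]') makes spaces and underscores interchangeable:

    # -*- coding: utf-8 -*-
    import re

    def create_regex(s):
        # Fold case on the first character only.
        first, other = re.escape(s[0]), re.escape(s[1:])
        return ur'(?:[%s%s]%s)' % (first.upper(), first.lower(), other)

    # Python 2's re.escape() turns '_' into '\_', which the replace
    # then widens to the character class '[ _]'.
    r_image = create_regex(u'Mona_Lisa.jpg').replace(r'\_', '[ _]')

    print re.search(r_image, u'[[Image:mona Lisa.jpg|thumb]]') is not None  # True
    print re.search(r_image, u'[[Image:MONA_LISA.JPG]]') is not None        # False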
Modified: trunk/pywikipedia/commonsdelinker/image_replacer.py
===================================================================
--- trunk/pywikipedia/commonsdelinker/image_replacer.py 2011-03-13 12:19:20 UTC (rev 9052)
+++ trunk/pywikipedia/commonsdelinker/image_replacer.py 2011-03-13 12:24:49 UTC (rev 9053)
@@ -4,7 +4,7 @@
Please refer to delinker.txt for full documentation.
"""
#
-#
+#
# (C) Bryan Tong Minh, 2007
#
# Distributed under the terms of the MIT license.
@@ -43,44 +43,44 @@
self.config.update(getattr(config, 'Replacer', ()))
self.template = re.compile(r'\{\{%s\|([^|]*?)\|([^|]*?)(?:(?:\|reason\=(.*?))?)\}\}' % \
self.config['replace_template'])
- self.disallowed_replacements = [(re.compile(i[0], re.I), re.compile(i[1], re.I))
+ self.disallowed_replacements = [(re.compile(i[0], re.I), re.compile(i[1], re.I))
for i in self.config.get('disallowed_replacements', ())]
-
+
self.site = wikipedia.getSite(persistent_http = True)
self.site.forceLogin()
-
+
self.database = connect_database()
self.cursor = self.database.cursor()
-
+
self.first_revision = 0
if self.config.get('replacer_report_replacements', False):
self.reporters = threadpool.ThreadPool(Reporter, 1, self.site, self.config)
self.reporters.start()
-
-
+
+
def read_replace_log(self):
""" The actual worker method """
-
+
# FIXME: Make sqlite3 compatible
- insert = """INSERT INTO %s (timestamp, old_image, new_image,
+ insert = """INSERT INTO %s (timestamp, old_image, new_image,
status, user, comment) VALUES (%%s, %%s, %%s,
'pending', %%s, %%s)""" % self.config['replacer_table']
-
+
page = wikipedia.Page(self.site, self.config['command_page'])
-
+
# Get last revision date
- if self.cursor.execute("""SELECT timestamp FROM %s
+ if self.cursor.execute("""SELECT timestamp FROM %s
ORDER BY timestamp DESC LIMIT 1""" % \
self.config['replacer_table']):
since = mw_timestamp(self.cursor.fetchone()[0])
else:
since = None
-
+
if self.config.get('clean_list', False):
username = config.sysopnames[self.site.family.name][self.site.lang]
else:
username = None
-
+
try:
# Fetch revision history
revisions = self.get_history(page.title(), since, username)
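The command template compiled in __init__ above captures the old image, the
new image and an optional reason. A standalone sketch of how it matches;
'Universal replace' is an illustrative name, the real one comes from
config['replace_template']:

    import re

    # Same pattern as in __init__, with an assumed template name.
    template = re.compile(
        r'\{\{%s\|([^|]*?)\|([^|]*?)(?:(?:\|reason\=(.*?))?)\}\}'
        % 'Universal replace')
    m = template.search(u'{{Universal replace|Old.jpg|New.jpg|reason=dupe}}')
    print m.groups()   # (u'Old.jpg', u'New.jpg', u'dupe'); reason may be None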
@@ -95,18 +95,18 @@
#self.site.conn.close()
#self.site.conn.connect()
return time.sleep(self.config['timeout'])
-
+
# We're being killed
if '{{stop}}' in text.lower():
output(u'Found {{stop}} on command page. Not replacing anything.')
return time.sleep(self.config['timeout'])
-
+
# Sort oldest first
revisions.sort(key = lambda rev: rev['timestamp'])
-
+
# Find all commands
replacements = self.template.finditer(text)
-
+
remove_from_list = []
count = 0
for replacement in replacements:
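The kill switch checked earlier in this hunk is a plain substring test on the
lower-cased page text, so {{STOP}} and {{Stop}} work as well:

    text = u'... commands ...\n{{STOP}}'
    if '{{stop}}' in text.lower():
        print u'Found {{stop}} on command page. Not replacing anything.'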
@@ -122,10 +122,10 @@
remove_from_list.append(replacement.group(0))
output('Replacing %s by %s: %s' % replacement.groups())
count += 1
-
+
# Save all replaces to database
self.database.commit()
-
+
if remove_from_list and self.config.get('clean_list', False):
# Cleanup the command page
while True:
@@ -144,10 +144,10 @@
except wikipedia.EditConflict:
# Try again
text = page.get()
-
+
def get_history(self, title, since, username):
""" Fetch the last 50 revisions using the API """
-
+
address = self.site.api_address()
predata = [
('action', 'query'),
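The hunk cuts the predata list off after its first entries. Given the
docstring ("the last 50 revisions") and the fields callers read
(revision['timestamp'], revision['user'], revision['*']), a plausible
completion is the following; this is an assumption, not the original list:

    import urllib

    predata = [
        ('action', 'query'),
        ('prop', 'revisions'),
        ('titles', 'User:CommonsDelinker/commands'),  # assumed command page
        ('rvlimit', '50'),                            # per the docstring
        ('rvprop', 'timestamp|user|content'),         # fields read by callers
        ('format', 'json'),
    ]
    print urllib.urlencode(predata)   # query string for api.php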
@@ -170,10 +170,10 @@
if 'missing' in page:
raise Exception('Missing page!')
return page.get('revisions', [])
-
+
def examine_revision_history(self, revisions, replacement, username):
""" Find out who is to blame for a replacement """
-
+
for revision in revisions:
if replacement.group(0) in revision['*']:
db_time = db_timestamp(revision['timestamp'])
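Because read_replace_log() sorts revisions oldest first, the scan above
attributes each command to the earliest revision containing it. Stripped to
its core (find_author is an illustrative name):

    def find_author(revisions, command_text):
        # revisions must be sorted oldest first
        for revision in revisions:
            if command_text in revision['*']:
                return revision['user'], revision['timestamp']
        return None   # the caller logs a warning in this case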
@@ -182,52 +182,52 @@
return (db_time, strip_image(replacement.group(1)),
strip_image(replacement.group(2)),
revision['user'], replacement.group(3))
-
+
output('Warning! Could not find out who did %s' % \
repr(replacement.group(0)), False)
return
-
+
def read_finished_replacements(self):
- """ Find out which replacements have been completed and add them to
+ """ Find out which replacements have been completed and add them to
the reporters queue. """
-
+
self.cursor.execute('START TRANSACTION WITH CONSISTENT SNAPSHOT')
self.cursor.execute("""SELECT old_image, new_image, user, comment FROM
%s WHERE status = 'done' AND timestamp >= %i""" % \
(self.config['replacer_table'], self.first_revision))
finished_images = list(self.cursor)
- self.cursor.execute("""UPDATE %s SET status = 'reported'
+ self.cursor.execute("""UPDATE %s SET status = 'reported'
WHERE status = 'done' AND timestamp >= %i""" % \
(self.config['replacer_table'], self.first_revision))
        self.database.commit()  # commit() belongs to the connection, not the cursor
-
+
for old_image, new_image, user, comment in finished_images:
- self.cursor.execute("""SELECT wiki, namespace, page_title
- FROM %s WHERE img = %%s AND status <> 'ok'""" %
+ self.cursor.execute("""SELECT wiki, namespace, page_title
+ FROM %s WHERE img = %%s AND status <> 'ok'""" %
self.config['log_table'], (old_image, ))
not_ok = [(wiki, namespace, page_title.decode('utf-8', 'ignore'))
for wiki, namespace, page_title in self.cursor]
-
+
if not comment: comment = ''
-
+
self.reporters.append((old_image.decode('utf-8', 'ignore'),
- new_image.decode('utf-8', 'ignore'),
- user.decode('utf-8', 'ignore'),
+ new_image.decode('utf-8', 'ignore'),
+ user.decode('utf-8', 'ignore'),
comment.decode('utf-8', 'ignore'), not_ok))
-
-
+
+
def start(self):
while True:
self.read_replace_log()
if self.config.get('replacer_report_replacements', False):
self.read_finished_replacements()
-
+
# Replacer should not loop as often as delinker
time.sleep(self.config['timeout'] * 2)
-
+
def allowed_replacement(self, replacement):
""" Method to prevent World War III """
-
+
for source, target in self.disallowed_replacements:
if source.search(replacement.group(1)) and \
target.search(replacement.group(2)):
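allowed_replacement() walks the configured pattern pairs; the return
statements fall outside this hunk, but it presumably refuses a replacement
when both patterns match. A self-contained sketch with invented patterns:

    import re

    # Invented example pair: never replace one national flag by another.
    disallowed = [(re.compile(r'^Flag of', re.I),
                   re.compile(r'^Flag of', re.I))]

    def allowed(old_image, new_image):
        for source, target in disallowed:
            if source.search(old_image) and target.search(new_image):
                return False
        return True

    print allowed(u'Flag of A.svg', u'Flag of B.svg')   # False -> refused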
@@ -236,14 +236,14 @@
class Reporter(threadpool.Thread):
""" Asynchronous worker to report finished replacements to file pages. """
-
+
def __init__(self, pool, site, config):
self.site = wikipedia.getSite(site.lang, site.family,
site.user, True)
self.config = config
-
+
threadpool.Thread.__init__(self, pool)
-
+
def do(self, args):
try:
self.report(args)
@@ -254,7 +254,7 @@
sys.stderr.flush()
self.exit()
os.kill(0, signal.SIGTERM)
-
+
def report(self, (old_image, new_image, user, comment, not_ok)):
not_ok_items = []
for wiki, namespace, page_title in not_ok:
@@ -265,7 +265,7 @@
namespace_name = namespace_name + u':'
else:
namespace_name = u''
-
+
if unicode(site) == unicode(self.site):
if (namespace, page_title) != (6, old_image):
not_ok_items.append(u'[[:%s%s]]' % \
@@ -273,13 +273,13 @@
else:
not_ok_items.append(u'[[:%s:%s%s]]' % (site_prefix(site),
namespace_name, page_title))
-
+
template = u'{{%s|new_image=%s|user=%s|comment=%s|not_ok=%s}}' % \
(self.config['replacer_report_template'],
- new_image, user, comment,
+ new_image, user, comment,
self.config.get('replacer_report_seperator', u', ').join(not_ok_items))
page = wikipedia.Page(self.site, u'Image:' + old_image)
-
+
try:
text = page.get()
except wikipedia.NoPage:
@@ -289,7 +289,7 @@
output(u'Warning! %s is a redirect; not reporting replacement!' % old_image)
return
try:
- page.put(u'%s\n%s' % (template, text),
+ page.put(u'%s\n%s' % (template, text),
comment = u'This image has been replaced by ' + new_image)
except wikipedia.PageNotSaved, e:
output(u'Warning! Unable to report replacement to %s.' % old_image, False)
@@ -301,11 +301,11 @@
else:
output(u'Reporting replacement of %s by %s.' % \
(old_image, new_image))
-
+
def main():
global R
-
+
import sys, traceback
wikipedia.handleArgs()
output(u'Running ' + __version__)
@@ -327,5 +327,5 @@
except:
pass
wikipedia.stopme()
-
+
if __name__ == '__main__': main()
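mw_timestamp() and db_timestamp(), used throughout image_replacer.py, are not
part of this diff. Presumably they convert between MediaWiki's ISO 8601
timestamps and the 14-digit form stored in the database; a hypothetical
sketch, the real helpers may differ:

    import time

    def db_timestamp(mw_ts):
        # hypothetical: '2011-03-13T12:24:49Z' -> '20110313122449'
        return time.strftime('%Y%m%d%H%M%S',
                             time.strptime(mw_ts, '%Y-%m-%dT%H:%M:%SZ'))

    def mw_timestamp(db_ts):
        # hypothetical: '20110313122449' -> '2011-03-13T12:24:49Z'
        return time.strftime('%Y-%m-%dT%H:%M:%SZ',
                             time.strptime(str(db_ts), '%Y%m%d%H%M%S'))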
Modified: trunk/pywikipedia/commonsdelinker/threadpool.py
===================================================================
--- trunk/pywikipedia/commonsdelinker/threadpool.py 2011-03-13 12:19:20 UTC (rev 9052)
+++ trunk/pywikipedia/commonsdelinker/threadpool.py 2011-03-13 12:24:49 UTC (rev 9053)
@@ -1,19 +1,19 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-This module implements a threadpool which gives scripts that need to
+This module implements a threadpool which gives scripts that need to
perform concurrent jobs an efficient and thread-safe way to do so.
-
-The two classes available are ThreadPool and Thread. ThreadPool is the
+
+The two classes available are ThreadPool and Thread. ThreadPool is the
controller class; it holds a collection of Thread objects, and Thread itself
must be subclassed.
Any thread can add a job to the ThreadPool by calling its append() method.
The pool adds the job to the jobqueue and wakes a sleeping thread, if one is
available. If no thread is free at that moment, the job will be handled by
the first thread that becomes free.
-
+
The Thread class must be subclassed and passed to the ThreadPool's constructor.
-The subclass should implement a do(args) method, which will receive as its
+The subclass should implement a do(args) method, which will receive as its
argument the job. Please note that passing mutable objects through the
jobqueue may lead to thread-safety problems!
"""
@@ -24,9 +24,9 @@
#
__version__ = '$Id$'
#
-
+
import sys, threading, os
-
+
class ThreadPool(dict):
pools = []
def __init__(self, worker, max_threads, *args, **kwargs):
@@ -36,7 +36,7 @@
self.jobQueue = []
self.worker = worker
self.threads = []
-
+
self.max_threads = max_threads
self.args = args
self.kwargs = kwargs
@@ -50,7 +50,7 @@
self.jobQueue.append(job)
# The amount of workers needed to be unlocked
unlock_workers = len(self.jobQueue)
-
+
for event in self.itervalues():
if not event.isSet():
event.set()
@@ -62,7 +62,7 @@
if counter == 0 and len(self.threads) < self.max_threads:
self.add_thread()
self.start()
-
+
def add_thread(self):
self.jobLock.acquire()
try:
@@ -71,7 +71,7 @@
self[id(thread)] = threading.Event()
finally:
self.jobLock.release()
-
+
def start(self):
for thread in self.threads:
if not thread.isAlive():
@@ -92,23 +92,23 @@
threading.Thread.__init__(self)
self.pool = pool
self.quit = False
-
+
def run(self):
while True:
            # No try..finally: lock.release() here:
            # the lock could otherwise be released twice. If
-            # this thread is waiting on an event, a race
+            # this thread is waiting on an event, a race
            # condition could release a lock that has
-            # meanwhile been acquired by another thread.
+            # meanwhile been acquired by another thread.
self.pool.jobLock.acquire()
-
+
if self.quit and not self.pool.jobQueue:
# Only return once the jobQueue is empty.
self.pool.jobLock.release()
return
-
+
if not self.pool.jobQueue:
- # In case no job is available, wait for the pool
+ # In case no job is available, wait for the pool
# to call and do not start a busy while loop.
event = self.pool[id(self)]
self.pool.jobLock.release()
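The run() loop implements by hand what the stdlib Queue module provides: a
get() that blocks until a job arrives. For comparison only, not a drop-in
replacement for the Event-per-thread protocol above:

    import Queue, threading

    jobs = Queue.Queue()

    def worker():
        while True:
            job = jobs.get()      # blocks until a job is available
            if job is None:       # sentinel: time to exit
                return
            print 'Working on', job

    t = threading.Thread(target=worker)
    t.start()
    jobs.put('a job')
    jobs.put(None)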
@@ -119,9 +119,9 @@
continue
job = self.pool.jobQueue.pop(0)
self.pool.jobLock.release()
-
+
self.do(job)
-
+
def exit(self):
self.pool.jobLock.acquire()
try:
@@ -133,7 +133,7 @@
self.pool.threads.remove(self)
finally:
self.pool.jobLock.release()
-
+
def starve(self):
pass
@@ -146,12 +146,12 @@
import signal
for pool in ThreadPool.pools:
pool.exit()
-
+
if signalnum == signal.SIGINT:
raise KeyboardInterrupt
if signalnum == signal.SIGTERM:
raise SystemExit
-
+
def terminate():
# Maybe not a good idea, will also kill child processes
import signal
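The handler body shown above (every pool asked to exit, then the usual
exception re-raised) belongs to catch_signals(), which delinker.py calls at
startup. Registering it would look roughly like this; the handler name and
the signal.signal() calls are inferred, not shown in the diff:

    import signal

    def exit_handler(signalnum, frame):   # name is an assumption
        for pool in ThreadPool.pools:
            pool.exit()
        if signalnum == signal.SIGINT:
            raise KeyboardInterrupt
        if signalnum == signal.SIGTERM:
            raise SystemExit

    signal.signal(signal.SIGINT, exit_handler)
    signal.signal(signal.SIGTERM, exit_handler)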
@@ -160,24 +160,24 @@
if __name__ == '__main__':
import time
# Test cases
-
+
class Worker(Thread):
def do(self, args):
print 'Working', self
time.sleep(10)
print 'Done', self
-
+
    pool = ThreadPool(Worker, 5)  # max_threads is required by __init__
print 'Spawning 5 threads'
[pool.add_thread() for i in xrange(5)]
pool.start()
-
+
print 'Doing 25 jobs'
for i in xrange(25):
print 'Job', i
pool.append(i)
time.sleep(i % 6)
-
+
for thread in pool.threads:
thread.exit()
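Run directly (python threadpool.py), the self-test spawns five workers and
feeds them 25 jobs; each do() sleeps ten seconds, so the queue drains slowly,
and the final loop asks every thread to exit once the job queue is empty.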