http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10384
Revision: 10384
Author:   xqt
Date:     2012-06-20 13:27:42 +0000 (Wed, 20 Jun 2012)
Log Message:
-----------
enable xml-dumps for solving double redirects
Modified Paths:
--------------
    branches/rewrite/scripts/redirect.py
Modified: branches/rewrite/scripts/redirect.py
===================================================================
--- branches/rewrite/scripts/redirect.py	2012-06-20 13:11:29 UTC (rev 10383)
+++ branches/rewrite/scripts/redirect.py	2012-06-20 13:27:42 UTC (rev 10384)
@@ -17,17 +17,25 @@
 
 and arguments can be:
 
+-xml              Retrieve information from a local XML dump
+                  (http://download.wikimedia.org). Argument can also be given as
+                  "-xml:filename.xml". Cannot be used with -fullscan or -moves.
+
 -fullscan         Retrieve redirect pages from live wiki, not from a special page
 
 -moves            Use the page move log to find double-redirect candidates. Only
-                  works with action "double".
+                  works with action "double", does not work with -xml.
 
+                  NOTE: If neither of -xml -fullscan -moves is given, info will be
+                  loaded from a special page of the live wiki.
+
 -namespace:n      Namespace to process. Can be given multiple times, for several
                   namespaces. If omitted, only the main (article) namespace is
                   treated.
 
 -offset:n         With -moves, the number of hours ago to start scanning moved
-                  pages. Otherwise, ignored.
+                  pages. With -xml, the number of the redirect to restart with
+                  (see progress). Otherwise, ignored.
 
 -start:title      The starting page title in each namespace. Page need not exist.
 
@@ -40,14 +48,6 @@
 -always           Don't prompt you for each replacement.
 
 """
-
-# XML not yet implemented: deleted help text follows
-##-xml            Retrieve information from a local XML dump
-##                (http://download.wikimedia.org). Argument can also be given as
-##                "-xml:filename.xml". Cannot be used with -api or -moves.
-##                If neither of -xml -api -moves is given, info will be loaded
-##                from a special page of the live wiki.
-
 #
 # (C) Daniel Herding, 2004.
 # (C) Purodha Blissenbach, 2009.
@@ -62,7 +62,7 @@
 import pywikibot
 from pywikibot import i18n
 from pywikibot import config
-# import xmlreader
+from pywikibot import xmlreader
 
 
 class RedirectGenerator:
@@ -82,77 +82,75 @@
         self.api_number = number
         self.api_step = step
 
-# note: rewrite branch does not yet support XML dumps, so this is commented out
-# until that support is added
-##    def get_redirects_from_dump(self, alsoGetPageTitles=False):
-##        '''
-##        Load a local XML dump file, look at all pages which have the
-##        redirect flag set, and find out where they're pointing at. Return
-##        a dictionary where the redirect names are the keys and the redirect
-##        targets are the values.
-##        '''
-##        xmlFilename = self.xmlFilename
-##        redict = {}
-##        # open xml dump and read page titles out of it
-##        dump = xmlreader.XmlDump(xmlFilename)
-##        redirR = self.site.redirectRegex()
-##        readPagesCount = 0
-##        if alsoGetPageTitles:
-##            pageTitles = set()
-##        for entry in dump.parse():
-##            readPagesCount += 1
-##            # always print status message after 10000 pages
-##            if readPagesCount % 10000 == 0:
-##                pywikibot.output(u'%i pages read...' % readPagesCount)
-##            if len(self.namespaces) > 0:
-##                if pywikibot.Page(self.site, entry.title).namespace() \
-##                        not in self.namespaces:
-##                    continue
-##            if alsoGetPageTitles:
-##                pageTitles.add(entry.title.replace(' ', '_'))
-##
-##            m = redirR.match(entry.text)
-##            if m:
-##                target = m.group(1)
-##                # There might be redirects to another wiki. Ignore these.
-##                for code in self.site.family.langs.keys():
-##                    if target.startswith('%s:' % code) \
-##                            or target.startswith(':%s:' % code):
-##                        if code == self.site.language():
-##                            # link to our wiki, but with the lang prefix
-##                            target = target[(len(code)+1):]
-##                            if target.startswith(':'):
-##                                target = target[1:]
-##                        else:
-##                            pywikibot.output(
-##                                u'NOTE: Ignoring %s which is a redirect to %s:'
-##                                % (entry.title, code))
-##                            target = None
-##                            break
-##                # if the redirect does not link to another wiki
-##                if target:
-##                    source = entry.title.replace(' ', '_')
-##                    target = target.replace(' ', '_')
-##                    # remove leading and trailing whitespace
-##                    target = target.strip('_')
-##                    # capitalize the first letter
-##                    if not pywikibot.getSite().nocapitalize:
-##                        source = source[:1].upper() + source[1:]
-##                        target = target[:1].upper() + target[1:]
-##                    if '#' in target:
-##                        target = target[:target.index('#')].rstrip("_")
-##                    if '|' in target:
-##                        pywikibot.output(
-##                            u'HINT: %s is a redirect with a pipelink.'
-##                            % entry.title)
-##                        target = target[:target.index('|')].rstrip("_")
-##                    if target: # in case preceding steps left nothing
-##                        redict[source] = target
-##        if alsoGetPageTitles:
-##            return redict, pageTitles
-##        else:
-##            return redict
-##
+    def get_redirects_from_dump(self, alsoGetPageTitles=False):
+        '''
+        Load a local XML dump file, look at all pages which have the
+        redirect flag set, and find out where they're pointing at. Return
+        a dictionary where the redirect names are the keys and the redirect
+        targets are the values.
+        '''
+        xmlFilename = self.xmlFilename
+        redict = {}
+        # open xml dump and read page titles out of it
+        dump = xmlreader.XmlDump(xmlFilename)
+        redirR = self.site.redirectRegex()
+        readPagesCount = 0
+        if alsoGetPageTitles:
+            pageTitles = set()
+        for entry in dump.parse():
+            readPagesCount += 1
+            # always print status message after 10000 pages
+            if readPagesCount % 10000 == 0:
+                pywikibot.output(u'%i pages read...' % readPagesCount)
+            if len(self.namespaces) > 0:
+                if pywikibot.Page(self.site, entry.title).namespace() \
+                        not in self.namespaces:
+                    continue
+            if alsoGetPageTitles:
+                pageTitles.add(entry.title.replace(' ', '_'))
+
+            m = redirR.match(entry.text)
+            if m:
+                target = m.group(1)
+                # There might be redirects to another wiki. Ignore these.
+                for code in self.site.family.langs.keys():
+                    if target.startswith('%s:' % code) \
+                            or target.startswith(':%s:' % code):
+                        if code == self.site.language():
+                            # link to our wiki, but with the lang prefix
+                            target = target[(len(code)+1):]
+                            if target.startswith(':'):
+                                target = target[1:]
+                        else:
+                            pywikibot.output(
+                                u'NOTE: Ignoring %s which is a redirect to %s:'
+                                % (entry.title, code))
+                            target = None
+                            break
+                # if the redirect does not link to another wiki
+                if target:
+                    source = entry.title.replace(' ', '_')
+                    target = target.replace(' ', '_')
+                    # remove leading and trailing whitespace
+                    target = target.strip('_')
+                    # capitalize the first letter
+                    if not pywikibot.getSite().nocapitalize:
+                        source = source[:1].upper() + source[1:]
+                        target = target[:1].upper() + target[1:]
+                    if '#' in target:
+                        target = target[:target.index('#')].rstrip("_")
+                    if '|' in target:
+                        pywikibot.output(
+                            u'HINT: %s is a redirect with a pipelink.'
+                            % entry.title)
+                        target = target[:target.index('|')].rstrip("_")
+                    if target: # in case preceding steps left nothing
+                        redict[source] = target
+        if alsoGetPageTitles:
+            return redict, pageTitles
+        else:
+            return redict
+
     def get_redirect_pages_via_api(self):
         """Return generator that yields Pages that are redirects.
@@ -299,23 +297,22 @@
                     count += 1
                     if count >= self.api_number:
                         break
-
+            elif self.xmlFilename:
+                redict = self.get_redirects_from_dump()
+                num = 0
+                for (key, value) in redict.iteritems():
+                    num += 1
+                    # check if the value - that is, the redirect target - is a
+                    # redirect as well
+                    if num > self.offset and value in redict:
+                        yield key
+                        pywikibot.output(u'\nChecking redirect %i of %i...'
+                                         % (num + 1, len(redict)))
         else:
             # retrieve information from double redirect special page
             pywikibot.output(u'Retrieving special page...')
             for redir_name in self.site.double_redirects():
                 yield redir_name.title()
-##        else:
-##            redict = self.get_redirects_from_dump()
-##            num = 0
-##            for (key, value) in redict.iteritems():
-##                num += 1
-##                # check if the value - that is, the redirect target - is a
-##                # redirect as well
-##                if num > self.offset and value in redict:
-##                    yield key
-##                    pywikibot.output(u'\nChecking redirect %i of %i...'
-##                                     % (num + 1, len(redict)))
 
     def get_moved_pages_redirects(self):
         '''generate redirects to recently-moved pages'''
@@ -693,10 +690,8 @@
         else:
             pywikibot.output(u'Unknown argument: %s' % arg)
 
-    if xmlFilename:
-        pywikibot.error(u"Sorry, xmlreader is not yet implemented in rewrite")
-    elif not action: # or (xmlFilename and moved_pages)
-                     # or (api and xmlFilename):
+    if not action or (xmlFilename and moved_pages) \
+       or (fullscan and xmlFilename):
         pywikibot.showHelp('redirect')
     else:
         gen = RedirectGenerator(xmlFilename, namespaces, offset, moved_pages,
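
Below is a minimal, self-contained sketch (not part of this revision) of the dump-based lookup the change re-enables. It only uses calls that appear in the diff itself (xmlreader.XmlDump, site.redirectRegex(), pywikibot.output) and deliberately skips the interwiki, section-anchor and pipelink cleanup that get_redirects_from_dump performs; the dump file name 'dump.xml' is a placeholder. With the patch applied, the equivalent run through the script itself would be along the lines of "python redirect.py double -xml:dump.xml -namespace:0".

# -*- coding: utf-8 -*-
# Sketch only: scan a local XML dump for double redirects.
# 'dump.xml' is a placeholder for any locally downloaded dump file.
import pywikibot
from pywikibot import xmlreader


def find_double_redirects(xml_filename):
    """Return titles of redirects whose target is itself a redirect."""
    site = pywikibot.getSite()
    redirR = site.redirectRegex()
    redict = {}
    for entry in xmlreader.XmlDump(xml_filename).parse():
        m = redirR.match(entry.text)
        if m:
            # Normalize titles roughly as redirect.py does (underscores,
            # no leading/trailing underscores); the interwiki, '#' and '|'
            # handling from get_redirects_from_dump is omitted here.
            source = entry.title.replace(' ', '_')
            target = m.group(1).replace(' ', '_').strip('_')
            if target:
                redict[source] = target
    # a double redirect is a redirect whose target is also a redirect
    return [source for (source, target) in redict.iteritems()
            if target in redict]


if __name__ == '__main__':
    for title in find_double_redirects('dump.xml'):
        pywikibot.output(u'Double redirect found: %s' % title)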