http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10384
Revision: 10384
Author: xqt
Date: 2012-06-20 13:27:42 +0000 (Wed, 20 Jun 2012)
Log Message:
-----------
enable xml-dumps for solving double redirects
Modified Paths:
--------------
branches/rewrite/scripts/redirect.py
Modified: branches/rewrite/scripts/redirect.py
===================================================================
--- branches/rewrite/scripts/redirect.py 2012-06-20 13:11:29 UTC (rev 10383)
+++ branches/rewrite/scripts/redirect.py 2012-06-20 13:27:42 UTC (rev 10384)
@@ -17,17 +17,25 @@
and arguments can be:
+-xml Retrieve information from a local XML dump
+                  (http://download.wikimedia.org). Argument can also be given as
+ "-xml:filename.xml". Cannot be used with -fullscan or -moves.
+
-fullscan Retrieve redirect pages from live wiki, not from a special page
-moves Use the page move log to find double-redirect candidates. Only
- works with action "double".
+ works with action "double", does not work with -xml.
+ NOTE: If neither of -xml -fullscan -moves is given, info will be
+ loaded from a special page of the live wiki.
+
-namespace:n Namespace to process. Can be given multiple times, for several
namespaces. If omitted, only the main (article) namespace is
treated.
-offset:n With -moves, the number of hours ago to start scanning moved
- pages. Otherwise, ignored.
+ pages. With -xml, the number of the redirect to restart with
+ (see progress). Otherwise, ignored.
-start:title The starting page title in each namespace. Page need not exist.
@@ -40,14 +48,6 @@
-always Don't prompt you for each replacement.
"""
-
-# XML not yet implemented: deleted help text follows
-##-xml Retrieve information from a local XML dump
-##                  (http://download.wikimedia.org). Argument can also be given as
-## "-xml:filename.xml". Cannot be used with -api or -moves.
-## If neither of -xml -api -moves is given, info will be loaded
-## from a special page of the live wiki.
-
#
# (C) Daniel Herding, 2004.
# (C) Purodha Blissenbach, 2009.
@@ -62,7 +62,7 @@
import pywikibot
from pywikibot import i18n
from pywikibot import config
-# import xmlreader
+from pywikibot import xmlreader
class RedirectGenerator:
@@ -82,77 +82,75 @@
self.api_number = number
self.api_step = step
-# note: rewrite branch does not yet support XML dumps, so this is commented out
-# until that support is added
-## def get_redirects_from_dump(self, alsoGetPageTitles=False):
-## '''
-## Load a local XML dump file, look at all pages which have the
-## redirect flag set, and find out where they're pointing at. Return
-## a dictionary where the redirect names are the keys and the redirect
-## targets are the values.
-## '''
-## xmlFilename = self.xmlFilename
-## redict = {}
-## # open xml dump and read page titles out of it
-## dump = xmlreader.XmlDump(xmlFilename)
-## redirR = self.site.redirectRegex()
-## readPagesCount = 0
-## if alsoGetPageTitles:
-## pageTitles = set()
-## for entry in dump.parse():
-## readPagesCount += 1
-## # always print status message after 10000 pages
-## if readPagesCount % 10000 == 0:
-## pywikibot.output(u'%i pages read...' % readPagesCount)
-## if len(self.namespaces) > 0:
-## if pywikibot.Page(self.site, entry.title).namespace() \
-## not in self.namespaces:
-## continue
-## if alsoGetPageTitles:
-## pageTitles.add(entry.title.replace(' ', '_'))
-##
-## m = redirR.match(entry.text)
-## if m:
-## target = m.group(1)
-## # There might be redirects to another wiki. Ignore these.
-## for code in self.site.family.langs.keys():
-## if target.startswith('%s:' % code) \
-## or target.startswith(':%s:' % code):
-## if code == self.site.language():
-## # link to our wiki, but with the lang prefix
-## target = target[(len(code)+1):]
-## if target.startswith(':'):
-## target = target[1:]
-## else:
-## pywikibot.output(
-##                                 u'NOTE: Ignoring %s which is a redirect to %s:'
-## % (entry.title, code))
-## target = None
-## break
-## # if the redirect does not link to another wiki
-## if target:
-## source = entry.title.replace(' ', '_')
-## target = target.replace(' ', '_')
-## # remove leading and trailing whitespace
-## target = target.strip('_')
-## # capitalize the first letter
-## if not pywikibot.getSite().nocapitalize:
-## source = source[:1].upper() + source[1:]
-## target = target[:1].upper() + target[1:]
-## if '#' in target:
-##                         target = target[:target.index('#')].rstrip("_")
-## if '|' in target:
-## pywikibot.output(
-## u'HINT: %s is a redirect with a pipelink.'
-## % entry.title)
-##                         target = target[:target.index('|')].rstrip("_")
-## if target: # in case preceding steps left nothing
-## redict[source] = target
-## if alsoGetPageTitles:
-## return redict, pageTitles
-## else:
-## return redict
-##
+ def get_redirects_from_dump(self, alsoGetPageTitles=False):
+ '''
+ Load a local XML dump file, look at all pages which have the
+ redirect flag set, and find out where they're pointing at. Return
+ a dictionary where the redirect names are the keys and the redirect
+ targets are the values.
+ '''
+ xmlFilename = self.xmlFilename
+ redict = {}
+ # open xml dump and read page titles out of it
+ dump = xmlreader.XmlDump(xmlFilename)
+ redirR = self.site.redirectRegex()
+ readPagesCount = 0
+ if alsoGetPageTitles:
+ pageTitles = set()
+ for entry in dump.parse():
+ readPagesCount += 1
+ # always print status message after 10000 pages
+ if readPagesCount % 10000 == 0:
+ pywikibot.output(u'%i pages read...' % readPagesCount)
+ if len(self.namespaces) > 0:
+ if pywikibot.Page(self.site, entry.title).namespace() \
+ not in self.namespaces:
+ continue
+ if alsoGetPageTitles:
+ pageTitles.add(entry.title.replace(' ', '_'))
+
+ m = redirR.match(entry.text)
+ if m:
+ target = m.group(1)
+ # There might be redirects to another wiki. Ignore these.
+ for code in self.site.family.langs.keys():
+ if target.startswith('%s:' % code) \
+ or target.startswith(':%s:' % code):
+ if code == self.site.language():
+ # link to our wiki, but with the lang prefix
+ target = target[(len(code)+1):]
+ if target.startswith(':'):
+ target = target[1:]
+ else:
+ pywikibot.output(
+ u'NOTE: Ignoring %s which is a redirect to %s:'
+ % (entry.title, code))
+ target = None
+ break
+ # if the redirect does not link to another wiki
+ if target:
+ source = entry.title.replace(' ', '_')
+ target = target.replace(' ', '_')
+ # remove leading and trailing whitespace
+ target = target.strip('_')
+ # capitalize the first letter
+ if not pywikibot.getSite().nocapitalize:
+ source = source[:1].upper() + source[1:]
+ target = target[:1].upper() + target[1:]
+ if '#' in target:
+                        target = target[:target.index('#')].rstrip("_")
+ if '|' in target:
+ pywikibot.output(
+ u'HINT: %s is a redirect with a pipelink.'
+ % entry.title)
+                        target = target[:target.index('|')].rstrip("_")
+ if target: # in case preceding steps left nothing
+ redict[source] = target
+ if alsoGetPageTitles:
+ return redict, pageTitles
+ else:
+ return redict
+
def get_redirect_pages_via_api(self):
"""Return generator that yields
Pages that are redirects.
@@ -299,23 +297,22 @@
count += 1
if count >= self.api_number:
break
-
+ elif self.xmlFilename:
+ redict = self.get_redirects_from_dump()
+ num = 0
+ for (key, value) in redict.iteritems():
+ num += 1
+ # check if the value - that is, the redirect target - is a
+ # redirect as well
+ if num > self.offset and value in redict:
+ yield key
+ pywikibot.output(u'\nChecking redirect %i of %i...'
+ % (num + 1, len(redict)))
else:
# retrieve information from double redirect special page
pywikibot.output(u'Retrieving special page...')
for redir_name in self.site.double_redirects():
yield redir_name.title()
-## else:
-## redict = self.get_redirects_from_dump()
-## num = 0
-## for (key, value) in redict.iteritems():
-## num += 1
-## # check if the value - that is, the redirect target - is a
-## # redirect as well
-## if num > self.offset and value in redict:
-## yield key
-## pywikibot.output(u'\nChecking redirect %i of %i...'
-## % (num + 1, len(redict)))
def get_moved_pages_redirects(self):
'''generate redirects to recently-moved pages'''
@@ -693,10 +690,8 @@
else:
pywikibot.output(u'Unknown argument: %s' % arg)
- if xmlFilename:
- pywikibot.error(u"Sorry, xmlreader is not yet implemented in rewrite")
- elif not action: # or (xmlFilename and moved_pages)
- # or (api and xmlFilename):
+ if not action or (xmlFilename and moved_pages) \
+ or (fullscan and xmlFilename):
pywikibot.showHelp('redirect')
else:
gen = RedirectGenerator(xmlFilename, namespaces, offset, moved_pages,