Revision: 7357
Author: alexsh
Date: 2009-10-02 20:12:56 +0000 (Fri, 02 Oct 2009)
Log Message:
-----------
redirect.py optimization: add self.site attribute to replace repeated wikipedia.getSite() calls inside methods
Modified Paths:
--------------
trunk/pywikipedia/redirect.py
Modified: trunk/pywikipedia/redirect.py
===================================================================
--- trunk/pywikipedia/redirect.py 2009-10-02 11:53:53 UTC (rev 7356)
+++ trunk/pywikipedia/redirect.py 2009-10-02 20:12:56 UTC (rev 7357)
@@ -179,6 +179,8 @@
def __init__(self, xmlFilename=None, namespaces=[], offset=-1,
use_move_log=False,
use_api=False, start=None, until=None, number=None):
+ self.site = wikipedia.getSite()
+
self.xmlFilename = xmlFilename
self.namespaces = namespaces
self.offset = offset
@@ -199,8 +201,7 @@
redict = {}
# open xml dump and read page titles out of it
dump = xmlreader.XmlDump(xmlFilename)
- site = wikipedia.getSite()
- redirR = site.redirectRegex()
+ redirR = self.site.redirectRegex()
readPagesCount = 0
if alsoGetPageTitles:
pageTitles = set()
@@ -210,7 +211,7 @@
if readPagesCount % 10000 == 0:
wikipedia.output(u'%i pages read...' % readPagesCount)
if len(self.namespaces) > 0:
- if wikipedia.Page(site, entry.title).namespace() \
+ if wikipedia.Page(self.site, entry.title).namespace() \
not in self.namespaces:
continue
if alsoGetPageTitles:
@@ -220,10 +221,10 @@
if m:
target = m.group(1)
# There might be redirects to another wiki. Ignore these.
- for code in site.family.langs.keys():
+ for code in self.site.family.langs.keys():
if target.startswith('%s:' % code) \
or target.startswith(':%s:' % code):
- if code == site.language():
+ if code == self.site.language():
# link to our wiki, but with the lang prefix
target = target[(len(code)+1):]
if target.startswith(':'):
@@ -258,7 +259,7 @@
else:
return redict
- def get_redirect_pageids_via_api(self, number = u'max', namespaces = [], site = None,
+ def get_redirect_pageids_via_api(self, number = u'max', namespaces = [],
start = None, until = None ):
"""
Generator which will yield page IDs of Pages that are redirects.
@@ -268,11 +269,9 @@
"""
# wikipedia.output(u'====> get_redirect_pageids_via_api(number=%s, #ns=%d, start=%s, until=%s)' % (number, len(namespaces), start, until))
import urllib
- if site is None:
- site = wikipedia.getSite()
if namespaces == []:
namespaces = [ 0 ]
- apiQ0 = site.api_address()
+ apiQ0 = self.site.api_address()
apiQ0 += 'action=query'
apiQ0 += '&list=allpages'
apiQ0 += '&apfilterredir=redirects'
@@ -310,19 +309,17 @@
break
def _next_redirects_via_api_commandline(self, apiQi, number = 'max', namespaces = [],
- site = None, start = None, until = None ):
+ start = None, until = None ):
"""
yields commands to the api for checking a set op page ids.
"""
# wikipedia.output(u'====> _next_redirects_via_api_commandline(apiQi=%s, number=%s, #ns=%d, start=%s, until=%s)' % (apiQi, number, len(namespaces), start, until))
- if site is None:
- site = wikipedia.getSite()
if namespaces == []:
namespaces = [ 0 ]
maxurllen = 1018 # accomodate "GET " + apiQ + CR + LF in 1024 bytes.
apiQ = ''
for pageid in self.get_redirect_pageids_via_api(number = number, namespaces = namespaces,
- site = site, start = start, until = until ):
+ start = start, until = until ):
if apiQ:
tmp = ( '%s|%s' % ( apiQ, pageid ) )
else:
@@ -334,7 +331,7 @@
if apiQ:
yield apiQ
- def get_redirects_via_api(self, number = u'max', namespaces = [], site = None, start = None,
+ def get_redirects_via_api(self, number = u'max', namespaces = [], start = None,
until = None, maxlen = 8 ):
"""
Generator which will yield a tuple of data about Pages that are redirects:
@@ -358,11 +355,9 @@
"""
# wikipedia.output(u'====> get_redirects_via_api(number=%s, #ns=%d, start=%s, until=%s, maxlen=%s)' % (number, len(namespaces), start, until, maxlen))
import urllib
- if site is None:
- site = wikipedia.getSite()
if namespaces == []:
namespaces = [ 0 ]
- apiQ1 = site.api_address()
+ apiQ1 = self.site.api_address()
apiQ1 += 'action=query'
apiQ1 += '&redirects'
apiQ1 += '&format=xml'
@@ -371,9 +366,9 @@
missingpageRe = re.compile('<page .*? title="(.*?)" missing=""')
existingpageRe = re.compile('<page pageid=".*?" .*? title="(.*?)"')
for apiQ in self._next_redirects_via_api_commandline(apiQ1, number = number,
- namespaces = namespaces, site = site, start = start, until = until ):
+ namespaces = namespaces, start = start, until = until ):
# wikipedia.output (u'===apiQ=%s' % apiQ)
- result = site.getUrl(apiQ)
+ result = self.site.getUrl(apiQ)
# wikipedia.output(u'===RESULT===\n%s\n' % result)
redirects = {}
pages = {}
@@ -408,11 +403,10 @@
def retrieve_broken_redirects(self):
if self.use_api:
- mysite = wikipedia.getSite()
count = 0
for (pagetitle, type, target, final) in self.get_redirects_via_api(
namespaces = self.namespaces,
- site = mysite, start = self.api_start,
+ start = self.api_start,
until = self.api_until, maxlen = 2):
if type == 0:
yield pagetitle
@@ -423,11 +417,10 @@
elif self.xmlFilename == None:
# retrieve information from the live wiki's maintenance page
- mysite = wikipedia.getSite()
# broken redirect maintenance page's URL
- path = mysite.broken_redirects_address(default_limit = False)
+ path = self.site.broken_redirects_address(default_limit = False)
wikipedia.output(u'Retrieving special page...')
- maintenance_txt = mysite.getUrl(path)
+ maintenance_txt = self.site.getUrl(path)
# regular expression which finds redirects which point to a
# non-existing page inside the HTML
@@ -450,11 +443,10 @@
def retrieve_double_redirects(self):
if self.use_api:
- mysite = wikipedia.getSite()
count = 0
for (pagetitle, type, target, final) in self.get_redirects_via_api(
namespaces = self.namespaces,
- site = mysite, start = self.api_start,
+ start = self.api_start,
until = self.api_until, maxlen = 2):
if type != 0 and type != 1:
yield pagetitle
@@ -468,13 +460,12 @@
for redir_page in self.get_moved_pages_redirects():
yield redir_page.title()
return
- mysite = wikipedia.getSite()
# retrieve information from the live wiki's maintenance page
# double redirect maintenance page's URL
# wikipedia.config.special_page_limit = 1000
- path = mysite.double_redirects_address(default_limit = False)
+ path = self.site.double_redirects_address(default_limit = False)
wikipedia.output(u'Retrieving special page...')
- maintenance_txt = mysite.getUrl(path)
+ maintenance_txt = self.site.getUrl(path)
# regular expression which finds redirects which point to
# another redirect inside the HTML
@@ -500,9 +491,7 @@
wiki = re.escape(wikipedia.getSite().nice_get_address(''))
# /w/index.php
index = re.escape(wikipedia.getSite().path())
- move_regex = re.compile(
- r'moved <a href.*?>(.*?)</a> to <a href=.*?>.*?</a>.*?</li>'
- )
+ move_regex = re.compile(r'moved <a href.*?>(.*?)</a> to <a href=.*?>.*?</a>.*?</li>')
def get_moved_pages_redirects(self):
'''generate redirects to recently-moved pages'''
@@ -511,19 +500,17 @@
if self.offset <= 0:
self.offset = 1
- offsetpattern = re.compile(
-r"""\(<a href="/w/index\.php\?title=Special:Log&offset=(\d+)&limit=500&type=move" title="Special:Log" rel="next">older 500</a>\)""")
+ offsetpattern = re.compile(r"""\(<a href="/w/index\.php\?title=Special:Log&offset=(\d+)&limit=500&type=move" title="Special:Log" rel="next">older 500</a>\)""")
start = datetime.datetime.utcnow() \
- datetime.timedelta(0, self.offset*3600)
# self.offset hours ago
offset_time = start.strftime("%Y%m%d%H%M%S")
- site = wikipedia.getSite()
while True:
move_url = \
- site.path() + "?title=Special:Log&limit=500&offset=%s&type=move"\
+ self.site.path() + "?title=Special:Log&limit=500&offset=%s&type=move"\
% offset_time
try:
- move_list = site.getUrl(move_url)
+ move_list = self.site.getUrl(move_url)
if wikipedia.verbose:
wikipedia.output(u"[%s]" % offset_time)
except:
@@ -534,7 +521,7 @@
if wikipedia.verbose:
wikipedia.output(u"%s moved pages" % len(g))
for moved_title in g:
- moved_page = wikipedia.Page(site, moved_title)
+ moved_page = wikipedia.Page(self.site, moved_title)
try:
if not moved_page.isRedirectPage():
continue
@@ -545,8 +532,7 @@
# moved_page is now a redirect, so any redirects pointing
# to it need to be changed
try:
- for page in moved_page.getReferences(follow_redirects=True,
- redirectsOnly=True):
+ for page in moved_page.getReferences(follow_redirects=True, redirectsOnly=True):
yield page
except wikipedia.NoPage:
# original title must have been deleted after move
@@ -558,6 +544,9 @@
class RedirectRobot:
def __init__(self, action, generator, always=False, number=None):
+
+ self.site = wikipedia.getSite()
+
self.action = action
self.generator = generator
self.always = always
@@ -578,20 +567,18 @@
return True
def delete_broken_redirects(self):
- mysite = wikipedia.getSite()
# get reason for deletion text
- reason = wikipedia.translate(mysite, reason_broken)
+ reason = wikipedia.translate(self.site, reason_broken)
for redir_name in self.generator.retrieve_broken_redirects():
- self.delete_1_broken_redirect(mysite, redir_name, reason)
+ self.delete_1_broken_redirect( redir_name, reason)
if self.exiting:
break
- def delete_1_broken_redirect(self, mysite, redir_name, reason):
- redir_page = wikipedia.Page(mysite, redir_name)
+ def delete_1_broken_redirect(self, redir_name, reason):
+ redir_page = wikipedia.Page(self.site, redir_name)
# Show the title of the page we're working on.
# Highlight the title in purple.
- wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
- % redir_page.title())
+ wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % redir_page.title())
try:
targetPage = redir_page.getRedirectTarget()
except wikipedia.IsNotRedirectPage:
@@ -625,19 +612,17 @@
wikipedia.output(u'')
def fix_double_redirects(self):
- mysite = wikipedia.getSite()
- summary = wikipedia.translate(mysite, msg_double)
+ summary = wikipedia.translate(self.site, msg_double)
for redir_name in self.generator.retrieve_double_redirects():
- self.fix_1_double_redirect(mysite, redir_name, summary)
+ self.fix_1_double_redirect(redir_name, summary)
if self.exiting:
break
- def fix_1_double_redirect(self, mysite, redir_name, summary):
- redir = wikipedia.Page(mysite, redir_name)
+ def fix_1_double_redirect(self, redir_name, summary):
+ redir = wikipedia.Page(self.site, redir_name)
# Show the title of the page we're working on.
# Highlight the title in purple.
- wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
- % redir.title())
+ wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % redir.title())
newRedir = redir
redirList = [] # bookkeeping to detect loops
while True:
@@ -684,9 +669,8 @@
wikipedia.output(
u' Links to: %s.'
% targetPage.aslink())
- if targetPage.site() != mysite:
- wikipedia.output(
- u'Warning: redirect target (%s) is on a different site.'
+ if targetPage.site() != self.site:
+ wikipedia.output(u'Warning: redirect target (%s) is on a different site.'
% (targetPage.aslink()))
if self.always:
break # skip if automatic
@@ -710,10 +694,8 @@
and targetPage.site().lang in sd_tagging_sum:
wikipedia.output(u"Tagging redirect for deletion")
# Delete the two redirects
- content = wikipedia.translate(targetPage.site().lang,
- sd_template)+"\n"+content
- summ = wikipedia.translate(targetPage.site().lang,
- sd_tagging_sum)
+ content = wikipedia.translate(targetPage.site().lang, sd_template)+"\n"+content
+ summ = wikipedia.translate(targetPage.site().lang, sd_tagging_sum)
targetPage.put(content, summ)
redir.put(content, summ)
else:
@@ -726,9 +708,9 @@
except wikipedia.BadTitle:
wikipedia.output(u"Bad Title Error")
break
- text = mysite.redirectRegex().sub(
+ text = self.site.redirectRegex().sub(
'#%s %s' %
- (mysite.redirect( True ),
+ (self.site.redirect( True ),
targetPage.aslink()),
oldText)
if text == oldText:
@@ -740,40 +722,36 @@
except wikipedia.LockedPage:
wikipedia.output(u'%s is locked.' % redir.title())
except wikipedia.SpamfilterError, error:
- wikipedia.output(
-u"Saving page [[%s]] prevented by spam filter: %s"
+ wikipedia.output(u"Saving page [[%s]] prevented by spam filter: %s"
% (redir.title(), error.url))
except wikipedia.PageNotSaved, error:
wikipedia.output(u"Saving page [[%s]] failed: %s"
% (redir.title(), error))
except wikipedia.NoUsername:
- wikipedia.output(
-u"Page [[%s]] not saved; sysop privileges required."
+ wikipedia.output(u"Page [[%s]] not saved; sysop privileges required."
% redir.title())
except wikipedia.Error, error:
- wikipedia.output(
-u"Unexpected error occurred trying to save [[%s]]: %s"
+ wikipedia.output(u"Unexpected error occurred trying to save [[%s]]: %s"
% (redir.title(), error))
break
def fix_double_or_delete_broken_redirects(self):
# TODO: part of this should be moved to generator, the rest merged into self.run()
- mysite = wikipedia.getSite()
# get reason for deletion text
- delete_reason = wikipedia.translate(mysite, reason_broken)
- double_summary = wikipedia.translate(mysite, msg_double)
+ delete_reason = wikipedia.translate(self.site, reason_broken)
+ double_summary = wikipedia.translate(self.site, msg_double)
count = 0
for (redir_name, code, target, final) in self.generator.get_redirects_via_api(
namespaces = self.generator.namespaces,
- site = mysite, start = self.generator.api_start,
+ start = self.generator.api_start,
until = self.generator.api_until, maxlen = 2):
if code == 1:
continue
elif code == 0:
- self.delete_1_broken_redirect(mysite, redir_name, delete_reason)
+ self.delete_1_broken_redirect(redir_name, delete_reason)
count += 1
else:
- self.fix_1_double_redirect(mysite, redir_name, double_summary)
+ self.fix_1_double_redirect(redir_name, double_summary)
count += 1
# print ('%s .. %s' % (count, self.number))
if self.exiting or ( self.number and count >= self.number ):
@@ -785,7 +763,7 @@
if self.action == 'double':
# get summary text
wikipedia.setAction(
- wikipedia.translate(wikipedia.getSite(), msg_double))
+ wikipedia.translate(self.site, msg_double))
self.fix_double_redirects()
elif self.action == 'broken':
self.delete_broken_redirects()