Revision: 6363
Author: russblau
Date: 2009-02-18 13:25:41 +0000 (Wed, 18 Feb 2009)
Log Message:
-----------
use framework's category redirect methods; clean up output, exception catching, etc.
Modified Paths:
--------------
trunk/pywikipedia/category_redirect.py
Modified: trunk/pywikipedia/category_redirect.py
===================================================================
--- trunk/pywikipedia/category_redirect.py 2009-02-18 12:49:54 UTC (rev 6362)
+++ trunk/pywikipedia/category_redirect.py 2009-02-18 13:25:41 UTC (rev 6363)
@@ -149,11 +149,10 @@
self.edit_request_text = wikipedia.translate(self.site.lang,
{'en': u"""\
-{{editprotected}}
The following protected pages have been detected as requiring updates to \
category links:
%s
---~~~~
+~~~~
""",
})
@@ -281,6 +280,13 @@
waited = 0
while True:
response, data = self.site.postForm(addr, querydata)
+ if response.status != 200:
+ # WARNING: if the server is down, this could
+ # cause an infinite loop
+ wikipedia.output(u"HTTP error %i received; retrying..."
+ % response.status)
+ time.sleep(5)
+ continue
if data.startswith(u"unknown_action"):
e = {'code': data[:14], 'info': data[16:]}
raise APIError(e)
@@ -289,9 +295,9 @@
except ValueError:
# if the result isn't valid JSON, there must be a server
# problem. Wait a few seconds and try again
- # TODO: warn user; if the server is down, this could
+ # WARNING: if the server is down, this could
# cause an infinite loop
- wikipedia.output("Invalid API response received; retrying...")
+ wikipedia.output(u"Invalid API response received; retrying...")
time.sleep(5)
continue
if type(result) is dict and result.has_key("error"):
@@ -313,8 +319,9 @@
return
assert type(result) is dict, \
"Unexpected result of type '%s' received." % type(result)
- assert result.has_key("query"), \
- "No 'query' response found, result keys = %s" % result.keys()
+ if "query" not in result:
+ # query returned no results
+ return
yield result['query']
if result.has_key("query-continue"):
assert len(result['query-continue'].keys()) == 1, \
@@ -388,9 +395,13 @@
if record:
cPickle.dump(record, open(datafile + ".bak", "wb"))
- # Set up regexes for later scanning
- template_list = self.redir_templates[self.site.family.name
- ][self.site.lang]
+ try:
+ template_list = self.redir_templates[self.site.family.name
+ ][self.site.lang]
+ except KeyError:
+ wikipedia.output(u"No redirect templates defined for %s"
+ % self.site.sitename())
+ return
# regex to match soft category redirects
# note that any templates containing optional "category:" are
# incorrect and will be fixed by the bot
@@ -405,21 +416,10 @@
for item in template_list),
'catns': self.site.namespace(14)},
re.I|re.X)
- # regex to match hard redirects to category pages
- catredir_regex = re.compile(
- ur'\s*#(?:%(redir)s)\s*:?\s*\[\[\s*:?%(catns)s\s*:(.*?)\]\]\s*'
- % {'redir': "|".join(redirect_magicwords),
- 'catns': self.site.namespace(14)},
- re.I)
- # regex to match all other hard redirects
- redir_regex = re.compile(ur"(?i)\s*#(?:%s)\s*:?\s*\[\[(.*?)\]\]"
- % "|".join(redirect_magicwords),
- re.I)
# check for hard-redirected categories that are not already marked
# with an appropriate template
comment = wikipedia.translate(self.site.lang, self.redir_comment)
- print comment
for result in self.query_results(list='allpages',
apnamespace='14', # Category:
apfrom='!',
@@ -427,16 +427,16 @@
aplimit='max'):
gen = (wikipedia.Page(self.site, page_item['title'])
for page_item in result['allpages'])
+ # gen yields all hard redirect pages in namespace 14
for page in pagegenerators.PreloadingGenerator(gen, 120):
- text = page.get(get_redirect=True)
- if re.search(template_regex, text):
+ if page.isCategoryRedirect():
# this is already a soft-redirect, so skip it (for now)
continue
- m = catredir_regex.match(text)
- if m:
+ target = page.getRedirectTarget()
+ if target.namespace() == 14:
# this is a hard-redirect to a category page
newtext = (u"{{%(template)s|%(cat)s}}"
- % {'cat': m.group(1),
+ % {'cat': target.titleWithoutNamespace(),
'template': template_list[0]})
try:
page.put(newtext, comment, minorEdit=True)
@@ -450,16 +450,10 @@
page.aslink(textlink=True),
e))
else:
- r = redir_regex.match(text)
- if r:
- problems.append(
- u"# %s is a hard redirect to [[:%s]]"
- % (page.aslink(textlink=True),
- r.group(1)))
- else:
- problems.append(
- u"# %s is a hard redirect; unable to extract target."
- % page.aslink(textlink=True))
+ problems.append(
+ u"# %s is a hard redirect to %s"
+ % (page.aslink(textlink=True),
+ target.aslink(textlink=True)))
wikipedia.output("Done checking hard-redirect category pages.")
@@ -496,41 +490,40 @@
for cat in pagegenerators.PreloadingGenerator(catpages, 120):
cat_title = cat.titleWithoutNamespace()
if "category redirect" in cat_title:
- self.log_text.append(u"* Ignoring [[:%s%s]]"
- % (self.catprefix, cat_title))
+ self.log_text.append(u"* Ignoring %s"
+ % cat.aslink(textlink=True))
continue
try:
text = cat.get(get_redirect=True)
except wikipedia.Error:
- self.log_text.append(u"* Could not load [[:%s%s]]; ignoring"
- % (self.catprefix, cat_title))
+ self.log_text.append(u"* Could not load %s; ignoring"
+ % cat.aslink(textlink=True))
continue
- match = template_regex.search(text)
- if match is None:
- self.log_text.append(u"* False positive: [[:%s%s]]"
- % (self.catprefix, cat_title))
+ if not cat.isCategoryRedirect():
+ self.log_text.append(u"* False positive: %s"
+ % cat.aslink(textlink=True))
continue
if cat_title not in record:
# make sure every redirect has a record entry
record[cat_title] = {today: None}
catlist.append(cat)
- destination = match.group(2)
- target = catlib.Category(self.site, self.catprefix+destination)
+ target = cat.getCategoryRedirectTarget()
+ destination = target.titleWithoutNamespace()
destmap.setdefault(target, []).append(cat)
catmap[cat] = destination
- if match.group(1):
- # category redirect target starts with "Category:" - fix it
- text = text[ :match.start(1)] + text[match.end(1): ]
- try:
- cat.put(text,
- u"Robot: fixing category redirect parameter format")
- self.log_text.append(
- u"* Removed category prefix from parameter in %s"
- % cat.aslink(textlink=True))
- except wikipedia.Error:
- self.log_text.append(
- u"* Unable to save changes to %s"
- % cat.aslink(textlink=True))
+## if match.group(1):
+## # category redirect target starts with "Category:" - fix it
+## text = text[ :match.start(1)] + text[match.end(1): ]
+## try:
+## cat.put(text,
+## u"Robot: fixing category redirect parameter format")
+## self.log_text.append(
+## u"* Removed category prefix from parameter in %s"
+## % cat.aslink(textlink=True))
+## except wikipedia.Error:
+## self.log_text.append(
+## u"* Unable to save changes to %s"
+## % cat.aslink(textlink=True))
# delete record entries for non-existent categories
for cat_name in list(record.keys()):