[Pywikipedia-l] SVN: [6363] trunk/pywikipedia/category_redirect.py

russblau at svn.wikimedia.org russblau at svn.wikimedia.org
Wed Feb 18 13:25:42 UTC 2009


Revision: 6363
Author:   russblau
Date:     2009-02-18 13:25:41 +0000 (Wed, 18 Feb 2009)

Log Message:
-----------
use framework's category redirect methods; clean up output, exception catching, etc.

Modified Paths:
--------------
    trunk/pywikipedia/category_redirect.py

Modified: trunk/pywikipedia/category_redirect.py
===================================================================
--- trunk/pywikipedia/category_redirect.py	2009-02-18 12:49:54 UTC (rev 6362)
+++ trunk/pywikipedia/category_redirect.py	2009-02-18 13:25:41 UTC (rev 6363)
@@ -149,11 +149,10 @@
 
         self.edit_request_text = wikipedia.translate(self.site.lang,
             {'en': u"""\
-{{editprotected}}
 The following protected pages have been detected as requiring updates to \
 category links:
 %s
---~~~~
+~~~~
 """,
             })
 
@@ -281,6 +280,13 @@
         waited = 0
         while True:
             response, data = self.site.postForm(addr, querydata)
+            if response.status != 200:
+                # WARNING: if the server is down, this could
+                # cause an infinite loop
+                wikipedia.output(u"HTTP error %i received; retrying..."
+                                  % response.status)
+                time.sleep(5)
+                continue
             if data.startswith(u"unknown_action"):
                 e = {'code': data[:14], 'info': data[16:]}
                 raise APIError(e)
@@ -289,9 +295,9 @@
             except ValueError:
                 # if the result isn't valid JSON, there must be a server
                 # problem.  Wait a few seconds and try again
-                # TODO: warn user; if the server is down, this could
+                # WARNING: if the server is down, this could
                 # cause an infinite loop
-                wikipedia.output("Invalid API response received; retrying...")
+                wikipedia.output(u"Invalid API response received; retrying...")
                 time.sleep(5)
                 continue
             if type(result) is dict and result.has_key("error"):
@@ -313,8 +319,9 @@
                 return
             assert type(result) is dict, \
                    "Unexpected result of type '%s' received." % type(result)
-            assert result.has_key("query"), \
-                   "No 'query' response found, result keys = %s" % result.keys()
+            if "query" not in result:
+                # query returned no results
+                return
             yield result['query']
             if result.has_key("query-continue"):
                 assert len(result['query-continue'].keys()) == 1, \
@@ -388,9 +395,13 @@
         if record:
             cPickle.dump(record, open(datafile + ".bak", "wb"))
 
-        # Set up regexes for later scanning
-        template_list = self.redir_templates[self.site.family.name
-                                             ][self.site.lang]
+        try:
+            template_list = self.redir_templates[self.site.family.name
+                                                ][self.site.lang]
+        except KeyError:
+            wikipedia.output(u"No redirect templates defined for %s"
+                              % self.site.sitename())
+            return
         # regex to match soft category redirects
         #  note that any templates containing optional "category:" are
         #  incorrect and will be fixed by the bot
@@ -405,21 +416,10 @@
                                           for item in template_list),
                      'catns': self.site.namespace(14)},
             re.I|re.X)
-        # regex to match hard redirects to category pages
-        catredir_regex = re.compile(
-            ur'\s*#(?:%(redir)s)\s*:?\s*\[\[\s*:?%(catns)s\s*:(.*?)\]\]\s*'
-                             % {'redir': "|".join(redirect_magicwords),
-                                'catns': self.site.namespace(14)},
-                            re.I)
-        # regex to match all other hard redirects
-        redir_regex = re.compile(ur"(?i)\s*#(?:%s)\s*:?\s*\[\[(.*?)\]\]"
-                                  % "|".join(redirect_magicwords),
-                                 re.I)
 
         # check for hard-redirected categories that are not already marked
         # with an appropriate template
         comment = wikipedia.translate(self.site.lang, self.redir_comment)
-        print comment
         for result in self.query_results(list='allpages',
                                          apnamespace='14', # Category:
                                          apfrom='!',
@@ -427,16 +427,16 @@
                                          aplimit='max'):
             gen = (wikipedia.Page(self.site, page_item['title'])
                    for page_item in result['allpages'])
+            # gen yields all hard redirect pages in namespace 14
             for page in pagegenerators.PreloadingGenerator(gen, 120):
-                text = page.get(get_redirect=True)
-                if re.search(template_regex, text):
+                if page.isCategoryRedirect():
                     # this is already a soft-redirect, so skip it (for now)
                     continue
-                m = catredir_regex.match(text)
-                if m:
+                target = page.getRedirectTarget()
+                if target.namespace() == 14:
                     # this is a hard-redirect to a category page
                     newtext = (u"{{%(template)s|%(cat)s}}"
-                               % {'cat': m.group(1),
+                               % {'cat': target.titleWithoutNamespace(),
                                   'template': template_list[0]})
                     try:
                         page.put(newtext, comment, minorEdit=True)
@@ -450,16 +450,10 @@
                                 page.aslink(textlink=True),
                                 e))
                 else:
-                    r = redir_regex.match(text)
-                    if r:
-                        problems.append(
-                            u"# %s is a hard redirect to [[:%s]]"
-                             % (page.aslink(textlink=True),
-                                r.group(1)))
-                    else:
-                        problems.append(
-                    u"# %s is a hard redirect; unable to extract target."
-                             % page.aslink(textlink=True))
+                    problems.append(
+                        u"# %s is a hard redirect to %s"
+                         % (page.aslink(textlink=True),
+                            target.aslink(textlink=True)))
 
         wikipedia.output("Done checking hard-redirect category pages.")
 
@@ -496,41 +490,40 @@
         for cat in pagegenerators.PreloadingGenerator(catpages, 120):
             cat_title = cat.titleWithoutNamespace()
             if "category redirect" in cat_title:
-                self.log_text.append(u"* Ignoring [[:%s%s]]"
-                                      % (self.catprefix, cat_title))
+                self.log_text.append(u"* Ignoring %s"
+                                      % cat.aslink(textlink=True))
                 continue
             try:
                 text = cat.get(get_redirect=True)
             except wikipedia.Error:
-                self.log_text.append(u"* Could not load [[:%s%s]]; ignoring"
-                                      % (self.catprefix, cat_title))
+                self.log_text.append(u"* Could not load %s; ignoring"
+                                      % cat.aslink(textlink=True))
                 continue
-            match = template_regex.search(text)
-            if match is None:
-                self.log_text.append(u"* False positive: [[:%s%s]]"
-                                      % (self.catprefix, cat_title))
+            if not cat.isCategoryRedirect():
+                self.log_text.append(u"* False positive: %s"
+                                      % cat.aslink(textlink=True))
                 continue
             if cat_title not in record:
                 # make sure every redirect has a record entry
                 record[cat_title] = {today: None}
             catlist.append(cat)
-            destination = match.group(2)
-            target = catlib.Category(self.site, self.catprefix+destination)
+            target = cat.getCategoryRedirectTarget()
+            destination = target.titleWithoutNamespace()
             destmap.setdefault(target, []).append(cat)
             catmap[cat] = destination
-            if match.group(1):
-                # category redirect target starts with "Category:" - fix it
-                text = text[ :match.start(1)] + text[match.end(1): ]
-                try:
-                    cat.put(text,
-                            u"Robot: fixing category redirect parameter format")
-                    self.log_text.append(
-                        u"* Removed category prefix from parameter in %s"
-                         % cat.aslink(textlink=True))
-                except wikipedia.Error:
-                    self.log_text.append(
-                        u"* Unable to save changes to %s"
-                         % cat.aslink(textlink=True))
+##            if match.group(1):
+##                # category redirect target starts with "Category:" - fix it
+##                text = text[ :match.start(1)] + text[match.end(1): ]
+##                try:
+##                    cat.put(text,
+##                            u"Robot: fixing category redirect parameter format")
+##                    self.log_text.append(
+##                        u"* Removed category prefix from parameter in %s"
+##                         % cat.aslink(textlink=True))
+##                except wikipedia.Error:
+##                    self.log_text.append(
+##                        u"* Unable to save changes to %s"
+##                         % cat.aslink(textlink=True))
 
         # delete record entries for non-existent categories
         for cat_name in list(record.keys()):





More information about the Pywikipedia-l mailing list