SVN: [8530] trunk/pywikipedia/imagerecat.py - Pywikipedia-svn

12 Sep 2010

Revision: 8530
Author:   xqt
Date:     2010-09-12 12:19:44 +0000 (Sun, 12 Sep 2010)
Log Message:
-----------
import wikipedia as pywikibot for merging to rewrite branch;
use wikipedia lib for categoryRedirects; some speedup fixes
Modified Paths:
--------------
    trunk/pywikipedia/imagerecat.py
Modified: trunk/pywikipedia/imagerecat.py
===================================================================

--- trunk/pywikipedia/imagerecat.py	2010-09-12 11:41:43 UTC (rev 8529)
+++ trunk/pywikipedia/imagerecat.py	2010-09-12 12:19:44 UTC (rev 8530)
@@ -3,13 +3,16 @@
 Program to (re)categorize images at commons.
The program uses commonshelper for category suggestions.
-It takes the suggestions and the current categories. Put the categories through some filters and adds the result.
+It takes the suggestions and the current categories. Put the categories through
+some filters and adds the result.
The following command line parameters are supported:
--onlyfilter     Don't use Commonsense to get categories, just filter the current categories
+-onlyfilter     Don't use Commonsense to get categories, just filter the current
+                categories
--onlyuncat      Only work on uncategorized images. Will prevent the bot from working on an image multiple times.
+-onlyuncat      Only work on uncategorized images. Will prevent the bot from
+                working on an image multiple times.
-hint           Give Commonsense a hint.
                 For example -hint:li.wikipedia.org
@@ -32,9 +35,11 @@
 import urllib, httplib, urllib2
 import catlib
 import time
-import wikipedia, config
-import pagegenerators, StringIO
 import socket
+import StringIO
+import wikipedia as pywikibot
+import config
+import pagegenerators
category_blacklist = []
 countries = []
@@ -49,29 +54,33 @@
     global category_blacklist
     global countries
-    blacklistPage = wikipedia.Page(wikipedia.getSite(u'commons', u'commons'), u'User:Multichill/Category_blacklist')
+    blacklistPage = pywikibot.Page(pywikibot.getSite(u'commons', u'commons'),
+                                   u'User:Multichill/Category_blacklist')
     for cat in blacklistPage.linkedPages():
         category_blacklist.append(cat.titleWithoutNamespace())
-    countryPage = wikipedia.Page(wikipedia.getSite(u'commons', u'commons'), u'User:Multichill/Countries')
+    countryPage = pywikibot.Page(pywikibot.getSite(u'commons', u'commons'),
+                                 u'User:Multichill/Countries')
     for country in countryPage.linkedPages():
         countries.append(country.titleWithoutNamespace())
     return
def categorizeImages(generator, onlyFilter, onlyUncat):
+    ''' Loop over all images in generator and try to categorize them. Get
+    category suggestions from CommonSense.
+
     '''
-    Loop over all images in generator and try to categorize them. Get category suggestions from CommonSense.
-    '''
     for page in generator:
-        if page.exists() and (page.namespace() == 6) and (not page.isRedirectPage()):
-            imagepage = wikipedia.ImagePage(page.site(), page.title())
-            wikipedia.output(u'Working on ' + imagepage.title())
+        if page.exists() and (page.namespace() == 6) and \
+           (not page.isRedirectPage()):
+            imagepage = pywikibot.ImagePage(page.site(), page.title())
+            pywikibot.output(u'Working on ' + imagepage.title())
-            if(onlyUncat and not(u'Uncategorized' in imagepage.templates())):
-                wikipedia.output(u'No Uncategorized template found')
+            if (onlyUncat and not(u'Uncategorized' in imagepage.templates())):
+                pywikibot.output(u'No Uncategorized template found')
             else:                
                 currentCats = getCurrentCats(imagepage)
-                if(onlyFilter):
+                if onlyFilter:
                     commonshelperCats = []
                     usage = []
                     galleries = []
@@ -81,25 +90,22 @@
if (len(newcats) > 0 and not(set(currentCats)==set(newcats))):
                     for cat in newcats:
-                        wikipedia.output(u' Found new cat: ' + cat);
-                    saveImagePage(imagepage, newcats, usage, galleries, onlyFilter)
+                        pywikibot.output(u' Found new cat: ' + cat);
+                    saveImagePage(imagepage, newcats, usage, galleries,
+                                  onlyFilter)
-
-
 def getCurrentCats(imagepage):
-    '''
-    Get the categories currently on the image
-    '''
+    ''' Get the categories currently on the image '''
     result = []
     for cat in imagepage.categories():
         result.append(cat.titleWithoutNamespace())
     return list(set(result))
-
 def getCommonshelperCats(imagepage):
+    ''' Get category suggestions from CommonSense. Parse them and return a list
+    of suggestions.
+
     '''
-    Get category suggestions from CommonSense. Parse them and return a list of suggestions.
-    '''
     commonshelperCats = []
     usage = []
     galleries = []
@@ -116,82 +122,76 @@
     else:
         #Cant handle other sites atm
         return ([], [], [])
-    
+
     commonsenseRe = re.compile('^#COMMONSENSE(.*)#USAGE(\s)+((?P<usagenum>(\d)+))\s(?P<usage>(.*))\s#KEYWORDS(\s)+((?P<keywords>(\d)+))(.*)#CATEGORIES(\s)+((?P<catnum>(\d)+))\s(?P<cats>(.*))\s#GALLERIES(\s)+((?P<galnum>(\d)+))\s(?P<gals>(.*))\s(.*)#EOF$', re.MULTILINE + re.DOTALL)
gotInfo = False
     matches = None
     maxtries = 10
     tries = 0
-    
     while(not gotInfo):
         try:
             if ( tries < maxtries ):
                 tries = tries + 1
-                commonsHelperPage = urllib.urlopen("http://toolserver.org/~daniel/WikiSense/CommonSense.php?%s" % parameters)
-                matches = commonsenseRe.search(commonsHelperPage.read().decode('utf-8'))
+                commonsHelperPage = urllib.urlopen(
+                    "http://toolserver.org/~daniel/WikiSense/CommonSense.php?%s" % parameters)
+                matches = commonsenseRe.search(
+                    commonsHelperPage.read().decode('utf-8'))
                 gotInfo = True
             else:
                 break
         except IOError:
-            wikipedia.output(u'Got an IOError, let\'s try again')
+            pywikibot.output(u'Got an IOError, let\'s try again')
         except socket.timeout:
-            wikipedia.output(u'Got a timeout, let\'s try again')
+            pywikibot.output(u'Got a timeout, let\'s try again')
if (matches and gotInfo):
-        if(matches.group('usagenum') > 0):
+        if (matches.group('usagenum') > 0):
             used = matches.group('usage').splitlines()
             for use in used:
                 usage= usage + getUsage(use)
-                #wikipedia.output(use)
-        if(matches.group('catnum') > 0):
+                #pywikibot.output(use)
+        if (matches.group('catnum') > 0):
             cats = matches.group('cats').splitlines()
             for cat in cats:
                 commonshelperCats.append(cat.replace('_',' '))
-                wikipedia.output(u'category : ' + cat)
-        if(matches.group('galnum') > 0):
+                pywikibot.output(u'category : ' + cat)
+        if (matches.group('galnum') > 0):
             gals = matches.group('gals').splitlines()
             for gal in gals:
                 galleries.append(gal.replace('_',' '))
-                wikipedia.output(u'gallery : ' + gal)
+                pywikibot.output(u'gallery : ' + gal)
     commonshelperCats = list(set(commonshelperCats))
     galleries = list(set(galleries))
     for (lang, project, article) in usage:
-        wikipedia.output(lang + project + article)
-        
+        pywikibot.output(lang + project + article)
     return (commonshelperCats, usage, galleries)
def getUsage(use):
-    '''
-    Parse the Commonsense output to get the usage
-    '''
+    ''' Parse the Commonsense output to get the usage '''
     result = []
     lang = ''
     project = ''
     article = ''
-    usageRe = re.compile('^(?P<lang>([\w]+)).(?P<project>([\w]+)).org:(?P<articles>\s(.*))')
+    usageRe = re.compile(
+        '^(?P<lang>([\w]+)).(?P<project>([\w]+)).org:(?P<articles>\s(.*))')
     matches = usageRe.search(use)
     if matches:
-        if(matches.group('lang')):
+        if (matches.group('lang')):
             lang = matches.group('lang')
-            #wikipedia.output(lang)
-        if(matches.group('project')):
+            #pywikibot.output(lang)
+        if (matches.group('project')):
             project = matches.group('project')
-            #wikipedia.output(project)
-        if(matches.group('articles')):
+            #pywikibot.output(project)
+        if (matches.group('articles')):
             articles = matches.group('articles')
-            #wikipedia.output(articles)
+            #pywikibot.output(articles)
     for article in articles.split():
         result.append((lang, project, article))
-
     return result
-    
-
def applyAllFilters(categories):
-    '''
-    Apply all filters on categories.
-    '''
+    ''' Apply all filters on categories. '''
     result = []
     result = filterBlacklist(categories)
     result = filterDisambiguation(result)
@@ -200,11 +200,8 @@
     result = filterParents(result)
     return result
-
 def filterBlacklist(categories):
-    '''
-    Filter out categories which are on the blacklist.
-    '''
+    ''' Filter out categories which are on the blacklist. '''
     result = []
     for cat in categories:
         cat = cat.replace('_', ' ')
@@ -212,40 +209,36 @@
             result.append(cat)
     return list(set(result))
-
 def filterDisambiguation(categories):
-    '''
-    Filter out disambiguation categories.
-    '''
+    ''' Filter out disambiguation categories. '''
     result = []
     for cat in categories:
-        if(not wikipedia.Page(wikipedia.getSite(u'commons', u'commons'), u'Category:' + cat).isDisambig()):
+        if (not pywikibot.Page(pywikibot.getSite(u'commons', u'commons'),
+                               cat, defaultNamespace=14).isDisambig()):
             result.append(cat)
     return result
def followRedirects(categories):
-    '''
-    If a category is a redirect, replace the category with the target.
-    '''
+    ''' If a category is a redirect, replace the category with the target. '''
     result = []
     for cat in categories:
-        categoryPage = wikipedia.Page(wikipedia.getSite(u'commons', u'commons'), u'Category:' + cat)
-        if u'Category redirect' in categoryPage.templates() or u'Seecat' in categoryPage.templates():
-            for template in categoryPage.templatesWithParams():
-                if ((template[0]==u'Category redirect' or template[0]==u'Seecat') and (len(template[1]) > 0)):
-                    result.append(template[1][0])
+        categoryPage = pywikibot.Page(pywikibot.getSite(u'commons', u'commons'),
+                                      cat, defaultNamespace=14)
+        if categoryPage.isCategoryRedirect():
+            result.append(getCategoryRedirectTarget(),
+                          categoryPage.titleWithoutNamespace())
         else:
             result.append(cat)
     return result
-
 def filterCountries(categories):
+    ''' Try to filter out ...by country categories.
+    First make a list of any ...by country categories and try to find some
+    countries. If a by country category has a subcategoy containing one of the
+    countries found, add it. The ...by country categories remain in the set and
+    should be filtered out by filterParents.
+
     '''
-    Try to filter out ...by country categories.
-    First make a list of any ...by country categories and try to find some countries.
-    If a by country category has a subcategoy containing one of the countries found, add it.
-    The ...by country categories remain in the set and should be filtered out by filterParents.
-    '''
     result = categories
     listByCountry = []
     listCountries = []
@@ -259,76 +252,70 @@
             for country in countries:
                 if country in cat:
                     listCountries.append(country)
-
     if(len(listByCountry) > 0):
         for bc in listByCountry:
-            category = catlib.Category(wikipedia.getSite(u'commons', u'commons'), u'Category:' + bc)
+            category = catlib.Category(
+                pywikibot.getSite(u'commons', u'commons'), u'Category:' + bc)
             for subcategory in category.subcategories():
                 for country in listCountries:
                     if (subcategory.titleWithoutNamespace().endswith(country)):
                         result.append(subcategory.titleWithoutNamespace())
-
     return list(set(result))
def filterParents(categories):
+    ''' Remove all parent categories from the set to prevent overcategorization.
+
     '''
-    Remove all parent categories from the set to prevent overcategorization.
-    '''
     result = []
     toFilter = u''
-
     for cat in categories:
         cat = cat.replace('_',' ')
         toFilter = toFilter + "[[Category:" + cat + "]]\n"
-    #try:
-    parameters = urllib.urlencode({'source' : toFilter.encode('utf-8'), 'bot' : '1'})
+    parameters = urllib.urlencode({'source' : toFilter.encode('utf-8'),
+                                   'bot' : '1'})
     filterCategoriesRe = re.compile('\[\[Category:([^\]]*)\]\]')
     try:
-        filterCategoriesPage = urllib.urlopen("http://toolserver.org/~multichill/filtercats.php?%s" % parameters)
-        result = filterCategoriesRe.findall(filterCategoriesPage.read().decode('utf-8'))
+        filterCategoriesPage = urllib.urlopen(
+            "http://toolserver.org/~multichill/filtercats.php?%s" % parameters)
+        result = filterCategoriesRe.findall(
+            filterCategoriesPage.read().decode('utf-8'))
     except IOError:
         #Something is wrong, forget about this filter and just return the input
         return categories
-        
+
     if not result:
         #Is empty, dont want to remove all categories
         return categories
     return result
-
 def saveImagePage(imagepage, newcats, usage, galleries, onlyFilter):
-    '''
-    Remove the old categories and add the new categories to the image.
-    '''
-    newtext = wikipedia.removeCategoryLinks(imagepage.get(), imagepage.site())    
-
+    ''' Remove the old categories and add the new categories to the image. '''
+    newtext = pywikibot.removeCategoryLinks(imagepage.get(), imagepage.site())    
     if not(onlyFilter):
         newtext = removeTemplates(newtext)
-        newtext = newtext + getCheckCategoriesTemplate(usage, galleries, len(newcats))
-
+        newtext = newtext + getCheckCategoriesTemplate(usage, galleries,
+                                                       len(newcats))
     newtext = newtext + u'\n'
-    
     for category in newcats:
         newtext = newtext + u'[[Category:' + category + u']]\n'
-
     if(onlyFilter):
         comment = u'Filtering categories'
     else:
         comment = u'Image is categorized by a bot using data from [[Commons:Tools#CommonSense|CommonSense]]'
-
-    wikipedia.showDiff(imagepage.get(), newtext)
+    pywikibot.showDiff(imagepage.get(), newtext)
     imagepage.put(newtext, comment)
     return
-
 def removeTemplates(oldtext = u''):
     '''
     Remove {{Uncategorized}} and {{Check categories}} templates
     '''
     result = u''
-    result = re.sub(u'{{\s*([Uu]ncat(egori[sz]ed( image)?)?|[Nn]ocat|[Nn]eedscategory)[^}]*}}', u'', oldtext)
-    result = re.sub(u'<!-- Remove this line once you have added categories -->', u'', result)
+    result = re.sub(
+        u'{{\s*([Uu]ncat(egori[sz]ed( image)?)?|[Nn]ocat|[Nn]eedscategory)[^}]*}}', u'', oldtext)
+    result = re.sub(u'<!-- Remove this line once you have added categories -->',
+                    u'', result)
     result = re.sub(u'{{\s*[Cc]heck categories[^}]*}}', u'', result)
     return result
@@ -337,25 +324,21 @@
     Build the check categories template with all parameters
     '''
     result = u'{{Check categories|year={{subst:CURRENTYEAR}}|month={{subst:CURRENTMONTHNAME}}|day={{subst:CURRENTDAY}}\n'
-
     usageCounter = 1
     for (lang, project, article) in usage:
-        result = result + u'|lang' + str(usageCounter) + u'=' + lang
-        result = result + u'|wiki' + str(usageCounter) + u'=' + project
-        result = result + u'|article' + str(usageCounter) + u'=' + article
-        result = result + u'\n'
+        result += u'|lang%d=' % (usageCounter, lang)
+        result += u'|wiki%d=' % (usageCounter, project)
+        result += u'|article%d=' % (usageCounter, article)
+        result += u'\n'
         usageCounter = usageCounter + 1
-    
     galleryCounter = 1
     for gallery in galleries:
-        result = result + u'|gallery' + str(galleryCounter) + u'=' + gallery.replace('_', ' ') + u'\n'
+        result += u'|gallery%d=%s' % (galleryCounter, gallery.replace('_', ' ')) + u'\n'
         galleryCounter = galleryCounter + 1
-    
-    result = result + u'|ncats='+ str(ncats) + u'\n'
-    result = result + u'}}\n'
+    result += u'|ncats=%d\n' % ncats
+    result += u'}}\n'
     return result
-
 def main(args):
     '''
     Main loop. Get a generator and options. Work on all images in the generator.
@@ -368,9 +351,9 @@
     global search_wikis
     global hint_wiki
-    site = wikipedia.getSite(u'commons', u'commons')
-    wikipedia.setSite(site)
-    for arg in wikipedia.handleArgs():
+    site = pywikibot.getSite(u'commons', u'commons')
+    pywikibot.setSite(site)
+    for arg in pywikibot.handleArgs():
         if arg == '-onlyfilter':
             onlyFilter = True
         elif arg == '-onlyuncat':
@@ -384,15 +367,15 @@
generator = genFactory.getCombinedGenerator()
     if not generator:
-        generator = pagegenerators.CategorizedPageGenerator(catlib.Category(site, u'Category:Media needing categories'), recurse=True)
-
+        generator = pagegenerators.CategorizedPageGenerator(
+            catlib.Category(site, u'Category:Media needing categories'),
+            recurse=True)
     initLists()
     categorizeImages(generator, onlyFilter, onlyUncat)
+    pywikibot.output(u'All done')
-    wikipedia.output(u'All done')
-
 if __name__ == "__main__":
     try:
         main(sys.argv[1:])
     finally:
-        wikipedia.stopme()
+        pywikibot.stopme()