Revision: 5816
Author: multichill
Date: 2008-08-19 19:20:14 +0000 (Tue, 19 Aug 2008)
Log Message:
-----------
Rewrite, removed all the threading. Made some initial filters.
Modified Paths:
--------------
trunk/pywikipedia/imagerecat.py
Modified: trunk/pywikipedia/imagerecat.py
===================================================================
--- trunk/pywikipedia/imagerecat.py 2008-08-19 12:05:23 UTC (rev 5815)
+++ trunk/pywikipedia/imagerecat.py 2008-08-19 19:20:14 UTC (rev 5816)
@@ -2,29 +2,9 @@
"""
Program to (re)categorize images at commons.
-The program uses commonshelper for category suggestions. The program consists of three
parts.
+The program uses commonshelper for category suggestions.
+It takes the suggestions and the current categories, puts the categories through some
filters, and adds the result.
-1. prefetchThread - Fetches all the information
-2. userThread - Gets input from the user
-3. putThread - modifies the images
-
-You need to install the Python Imaging Library
http://www.pythonware.com/products/pil/ to
get this program working
-
-The program is far from finished. The framework is there, but still a lot has to be
implemented:
-1. The prefetch thread
- * Mostly finished.
- * Should add some error handling to cope with a slow toolserver
- * Should check if images with special chars work alright
- * Parameter to dont use commonshelper?
-2. The user thread
- * Tkinter layout is awful atm
- * Tkinter have to implement most of the interaction
- * Tkinter category webbrowser link
- * Tkinter something with category auto completion (like the javascript in the search
box)
-3. The put thread
- * Nothing much to put atm
- * Should remove the Uncategorized template (+ redirects)
- * Should check if something is actually changed (set operations?)
"""
#
# (C) Multichill 2008
@@ -34,183 +14,135 @@
#
import os, sys, re, codecs
import urllib, httplib, urllib2
-import catlib, thread
-import time, threading
+import catlib
+import time
import wikipedia, config
-import pagegenerators, add_text, Queue, StringIO
+import pagegenerators, StringIO
import socket
-exitProgram = False
-#autonomous = False
-
category_blacklist = [u'Hidden categories']
-class prefetchThread (threading.Thread):
+def categorizeImages(generator):
+ for page in generator:
+ if page.exists() and (page.namespace() == 6) and (not page.isRedirectPage()):
+ imagepage = wikipedia.ImagePage(page.site(), page.title())
+ #imagepage.get()
+ wikipedia.output(u'Working on ' + imagepage.title());
+ currentCats = getCurrentCats(imagepage)
+ commonshelperCats = getCommonshelperCats(imagepage)
+ newcats = filterBlacklist(commonshelperCats+currentCats)
+ #newcats = filterDisambiguation(newcats)
+ #newcats = filterRedirects(newcats)
+ #newcats = filterCountries(newcats)
+ newcats = filterParents(newcats)
+ if len(newcats) > 0:
+ for cat in newcats:
+ wikipedia.output(u' Found new cat: ' + cat);
+ saveImagePage(imagepage, newcats)
+
+
+def getCurrentCats(imagepage):
'''
- Class to fetch al the info for the user. This thread gets the imagepage, the
commonshelper suggestions and the image.
- The thread puts this item in a queue. When there are no more pages left the thread
puts a None object in the queue and exits.
+ Get the categories currently on the image
'''
- def __init__ (self, generator, prefetchToPutQueue):
- '''
- Get the thread ready
- '''
- self.generator = generator
- self.prefetchToPutQueue = prefetchToPutQueue
- self.currentCats = []
- self.commonshelperCats = []
- self.image = None
- self.imagepage = None
- self.pregenerator = pagegenerators.PreloadingGenerator(self.generator)
- threading.Thread.__init__ ( self )
-
- def run(self):
- global exitProgram
- #global autonomous
- for page in self.pregenerator:
- if exitProgram:
- break;
- if page.exists() and (page.namespace() == 6) and (not page.isRedirectPage())
:
- self.imagepage = wikipedia.ImagePage(page.site(), page.title())
- self.imagepage.get()
- wikipedia.output(u'Working on ' + self.imagepage.title());
- self.currentCats = self.getCurrentCats(self.imagepage)
- self.commonshelperCats = self.filterCats(self.currentCats,
self.getCommonshelperCats(self.imagepage))
-
- #if not autonomous:
- # self.image = self.getImage(self.imagepage)
- #self.prefetchToUserQueue.put((self.imagepage, self.currentCats,
self.commonshelperCats, self.image))
+ result = []
+ for cat in imagepage.categories():
+ result.append(cat.titleWithoutNamespace())
+ return list(set(result))
- if len(self.commonshelperCats) > 0:
- for cat in self.commonshelperCats:
- wikipedia.output(u' Found new cat: ' + cat);
- self.prefetchToPutQueue.put((self.imagepage,
self.commonshelperCats))
- self.prefetchToPutQueue.put(None)
- return
-
- def getCurrentCats(self, imagepage):
- '''
- Get the categories currently on the image
- '''
- result = []
- for cat in imagepage.categories():
- result.append(cat.titleWithoutNamespace())
- return result
-
- def getCommonshelperCats(self, imagepage):
- '''
- Get category suggestions from commonshelper. Parse them and return a list of
suggestions.
- '''
- parameters = urllib.urlencode({'i' :
imagepage.titleWithoutNamespace().encode('utf-8'), 'r' : 'on',
'go-clean' : 'Find+Categories', 'cl' : 'li'})
- commonsenseRe =
re.compile('^#COMMONSENSE(.*)#USAGE(\s)+\((?P<usage>(\d)+)\)(.*)#KEYWORDS(\s)+\((?P<keywords>(\d)+)\)(.*)#CATEGORIES(\s)+\((?P<catnum>(\d)+)\)\s(?P<cats>(.*))\s#GALLERIES(\s)+\((?P<galnum>(\d)+)\)(.*)#EOF$',
re.MULTILINE + re.DOTALL)
+def getCommonshelperCats(imagepage):
+ '''
+ Get category suggestions from commonshelper. Parse them and return a list of
suggestions.
+ '''
+ result = []
+ parameters = urllib.urlencode({'i' :
imagepage.titleWithoutNamespace().encode('utf-8'), 'r' : 'on',
'go-clean' : 'Find+Categories', 'cl' : 'li'})
+ commonsenseRe =
re.compile('^#COMMONSENSE(.*)#USAGE(\s)+\((?P<usage>(\d)+)\)(.*)#KEYWORDS(\s)+\((?P<keywords>(\d)+)\)(.*)#CATEGORIES(\s)+\((?P<catnum>(\d)+)\)\s(?P<cats>(.*))\s#GALLERIES(\s)+\((?P<galnum>(\d)+)\)(.*)#EOF$',
re.MULTILINE + re.DOTALL)
- gotInfo = False;
+ gotInfo = False;
- while(not gotInfo):
- try:
- commonsHelperPage =
urllib.urlopen("http://toolserver.org/~daniel/WikiSense/CommonSense.ph… %
parameters)
- matches =
commonsenseRe.search(commonsHelperPage.read().decode('utf-8'))
- gotInfo = True;
- except IOError:
- wikipedia.output(u'Got an IOError, let\'s try again')
- except socket.timeout:
- wikipedia.output(u'Got a timeout, let\'s try again')
+ while(not gotInfo):
+ try:
+ commonsHelperPage =
urllib.urlopen("http://toolserver.org/~daniel/WikiSense/CommonSense.ph… %
parameters)
+ matches =
commonsenseRe.search(commonsHelperPage.read().decode('utf-8'))
+ gotInfo = True
+ except IOError:
+ wikipedia.output(u'Got an IOError, let\'s try again')
+ except socket.timeout:
+ wikipedia.output(u'Got a timeout, let\'s try again')
- if matches:
- if(matches.group('catnum') > 0):
- return matches.group('cats').splitlines()
- else:
- return []
-
- def filterCats(self, currentCats, commonshelperCats):
- '''
- Remove the current categories from the suggestions and remove blacklisted cats.
- '''
- result = []
- toFilter = ""
+ if matches:
+ if(matches.group('catnum') > 0):
+ categories = matches.group('cats').splitlines()
+ for cat in categories:
+ result.append(cat.replace('_',' '))
+
+ return list(set(result))
- for cat in currentCats:
- cat = cat.replace('_',' ')
- toFilter = toFilter + "[[Category:" + cat + "]]\n"
- for cat in commonshelperCats:
- cat = cat.replace('_',' ')
- toFilter = toFilter + "[[Category:" + cat + "]]\n"
- parameters = urllib.urlencode({'source' :
toFilter.encode('utf-8'), 'bot' : '1'})
- filterCategoriesPage =
urllib.urlopen("http://toolserver.org/~multichill/filtercats.php?%s&qu… %
parameters)
- #print filterCategoriesPage.read().decode('utf-8')
- filterCategoriesRe = re.compile('\[\[Category:([^\]]*)\]\]')
- result =
filterCategoriesRe.findall(filterCategoriesPage.read().decode('utf-8'))
- #print matches
- '''
- if matches:
- print "Found matches"
- if(matches.group('cats') > 0):
- print matches.group('cats').splitlines()
- '''
- '''
-
- #currentCatsSet = set(currentCats)
- for cat in commonshelperCats:
- cat = cat.replace('_',' ')
- if (cat not in currentCatsSet) and (cat not in category_blacklist):
- result.append(cat)
- '''
- return list(set(result))
-
- def getImage(self, imagepage):
- '''
- Get the image from the wiki
- '''
- url = imagepage.fileUrl()
- uo = wikipedia.MyURLopener()
-
- file = uo.open(url)
- if 'text/html' in file.info().getheader('Content-Type'):
- wikipedia.output(u'Couldn\'t download the image: the requested URL
was not found on this server.')
- return
-
- image = file.read()
- file.close()
-
- return image
+def filterBlacklist(categories):
+ result = []
+ for cat in categories:
+ if (cat not in category_blacklist):
+ result.append(cat)
+ return list(set(result))
-class putThread (threading.Thread):
+
+def filterDisambiguation(categories):
+ result = []
+ return result
+
+
+def filterRedirects(categories):
+ result = []
+ return result
+
+
+def filterCountries(categories):
+ result = []
+ return result
+
+
+def filterParents(categories):
'''
- class to do the actual changing of images
+    Remove parent categories by filtering the list through the toolserver filtercats tool.
'''
- def __init__ (self, userToPutQueue):
- self.userToPutQueue = userToPutQueue
- self.item = None
- self.imagepage = None
- self.newcats = []
- self.newtext = u''
- threading.Thread.__init__ ( self )
-
- def run(self):
+ result = []
+ toFilter = u''
- while True:
- self.item = self.userToPutQueue.get()
- if self.item is None:
- break
- else:
- (self.imagepage, self.newcats)=self.item
- self.newtext = wikipedia.removeCategoryLinks(self.imagepage.get(),
self.imagepage.site())
- self.newtext = self.removeUncat(self.newtext) +
u'{{subst:chc}}\n'
- for category in self.newcats:
- self.newtext = self.newtext + u'[[Category:' + category +
u']]\n'
-
- wikipedia.showDiff(self.imagepage.get(), self.newtext)
- #Should change this for not autonomous operation.
- self.imagepage.put(self.newtext, u'Image is categorized by a bot
using data from [[Commons:Tools#CommonSense|CommonSense]]')
- return
- def removeUncat(self, oldtext = u''):
- result = u''
- result = re.sub(u'\{\{\s*([Uu]ncat(egori[sz]ed(
image)?)?|[Nn]ocat|[Nn]eedscategory)[^}]*\}\}', u'', oldtext)
- result = re.sub(u'<!-- Remove this line once you have added categories
-->', u'', result)
- #wikipedia.showDiff(oldtext, result)
- return result
+ for cat in categories:
+ cat = cat.replace('_',' ')
+ toFilter = toFilter + "[[Category:" + cat + "]]\n"
+ #try:
+ parameters = urllib.urlencode({'source' : toFilter.encode('utf-8'),
'bot' : '1'})
+ filterCategoriesPage =
urllib.urlopen("http://toolserver.org/~multichill/filtercats.php?%s&qu… %
parameters)
+ #print filterCategoriesPage.read().decode('utf-8')
+ filterCategoriesRe = re.compile('\[\[Category:([^\]]*)\]\]')
+ result =
filterCategoriesRe.findall(filterCategoriesPage.read().decode('utf-8'))
+ #except:
+
+ return result
+
+
+def saveImagePage(imagepage, newcats):
+ newtext = wikipedia.removeCategoryLinks(imagepage.get(), imagepage.site())
+ newtext = removeTemplates(newtext) + u'{{subst:chc}}\n'
+ for category in newcats:
+ newtext = newtext + u'[[Category:' + category + u']]\n'
+ wikipedia.showDiff(imagepage.get(), newtext)
+ #imagepage.put(newtext, u'Image is categorized by a bot using data from
[[Commons:Tools#CommonSense|CommonSense]]')
+ return
+
+
+def removeTemplates(oldtext = u''):
+ result = u''
+ result = re.sub(u'\{\{\s*([Uu]ncat(egori[sz]ed(
image)?)?|[Nn]ocat|[Nn]eedscategory)[^}]*\}\}', u'', oldtext)
+ result = re.sub(u'<!-- Remove this line once you have added categories
-->', u'', result)
+ result = re.sub(u'\{\{\s*[Cc]heck categories[^}]*\}\}', u'',
oldtext)
+ return result
+
+
def main(args):
'''
Main loop. Get a generator. Set up the 3 threads and the 2 queue's and fire
everything up.
@@ -218,7 +150,7 @@
generator = None;
genFactory = pagegenerators.GeneratorFactory()
- #global autonomous
+
site = wikipedia.getSite(u'commons', u'commons')
wikipedia.setSite(site)
for arg in wikipedia.handleArgs():
@@ -227,32 +159,15 @@
generator = [wikipedia.Page(site, wikipedia.input(u'What page do you
want to use?'))]
else:
generator = [wikipedia.Page(site, arg[6:])]
- elif arg == '-autonomous':
- autonomous = True
else:
generator = genFactory.handleArg(arg)
if not generator:
- generator = pagegenerators.CategorizedPageGenerator(catlib.Category(site,
u'Category:Media needing categories'))
- #raise add_text.NoEnoughData('You have to specify the generator you want to
use for the script!')
+ generator = pagegenerators.CategorizedPageGenerator(catlib.Category(site,
u'Category:Media needing categories'), recurse=True)
+ categorizeImages(generator)
+
+ wikipedia.output(u'All done')
- prefetchToPutQueue=Queue.Queue()
-
- # Start the prefetch thread
- prefetchThread(generator, prefetchToPutQueue).start()
-
- # Start the user thread
- # userThread(prefetchToUserQueue, userToPutQueue).start()
-
- # Start the put thread
- putThread(prefetchToPutQueue).start()
-
- # Wait for all threads to finish
- for openthread in threading.enumerate():
- if openthread != threading.currentThread():
- openthread.join()
- wikipedia.output(u'All threads are done')
-
if __name__ == "__main__":
try:
main(sys.argv[1:])