SVN: [5798] trunk/pywikipedia/imagerecat.py - Pywikipedia-l

15 Aug 2008

Revision: 5798
Author:   multichill
Date:     2008-08-15 13:51:19 +0000 (Fri, 15 Aug 2008)
Log Message:
-----------
Building this program in a different way. Only going to run autonomous. Still have to remove the threading part, it's now kind of useless.
Modified Paths:
--------------
    trunk/pywikipedia/imagerecat.py
Modified: trunk/pywikipedia/imagerecat.py
===================================================================

--- trunk/pywikipedia/imagerecat.py	2008-08-15 02:35:56 UTC (rev 5797)
+++ trunk/pywikipedia/imagerecat.py	2008-08-15 13:51:19 UTC (rev 5798)
@@ -32,29 +32,29 @@
 # Distributed under the terms of the MIT license.
 #
 #
-
-from Tkinter import *
-from PIL import Image, ImageTk
 import os, sys, re, codecs
 import urllib, httplib, urllib2
-import catlib, thread, webbrowser
+import catlib, thread
 import time, threading
 import wikipedia, config
 import pagegenerators, add_text, Queue, StringIO
-exitProgram = 0
+exitProgram = False
+#autonomous = False
+category_blacklist = [u'Hidden categories']
+
 class prefetchThread (threading.Thread):
     '''
     Class to fetch al the info for the user. This thread gets the imagepage, the commonshelper suggestions and the image.
     The thread puts this item in a queue. When there are no more pages left the thread puts a None object in the queue and exits.
     '''
-    def __init__ (self, generator, prefetchToUserQueue):
+    def __init__ (self, generator, prefetchToPutQueue):
         '''
         Get the thread ready
         '''
         self.generator = generator
-        self.prefetchToUserQueue = prefetchToUserQueue
+        self.prefetchToPutQueue = prefetchToPutQueue
         self.currentCats = []
         self.commonshelperCats = []
         self.image = None
@@ -63,19 +63,28 @@
         threading.Thread.__init__ ( self )
def run(self):
-
-        global exitProgram        
+        global exitProgram
+        #global autonomous
         for page in self.pregenerator:
-            if exitProgram != 0:
+            if exitProgram:
                 break;            
             if page.exists() and (page.namespace() == 6) and (not page.isRedirectPage()) :
                 self.imagepage = wikipedia.ImagePage(page.site(), page.title())
                 self.imagepage.get()
+                wikipedia.output(u'Working on ' + self.imagepage.title());
                 self.currentCats = self.getCurrentCats(self.imagepage)
-                self.commonshelperCats = self.filterCommonsHelperCats(self.currentCats, self.getCommonshelperCats(self.imagepage))
-                self.image = self.getImage(self.imagepage)
-                self.prefetchToUserQueue.put((self.imagepage, self.currentCats, self.commonshelperCats, self.image))
-        self.prefetchToUserQueue.put(None)
+                self.commonshelperCats = self.filterCats(self.currentCats, self.getCommonshelperCats(self.imagepage))
+                                    
+                #if not autonomous:
+                #    self.image = self.getImage(self.imagepage)
+                #self.prefetchToUserQueue.put((self.imagepage, self.currentCats, self.commonshelperCats, self.image))
+
+                if len(self.commonshelperCats) > 0:
+                    for cat in self.commonshelperCats:
+                        wikipedia.output(u' Found new cat: ' + cat);                       
+                    self.prefetchToPutQueue.put((self.imagepage, self.commonshelperCats))
+
+        self.prefetchToPutQueue.put(None)
         return
def getCurrentCats(self, imagepage):
@@ -91,29 +100,61 @@
         '''
         Get category suggestions from commonshelper. Parse them and return a list of suggestions.        
         '''
-        parameters = urllib.urlencode({'i' : imagepage.titleWithoutNamespace(), 'r' : 'on', 'go-clean' : 'Find+Categories'})        
-        commonsHelperPage = urllib.urlopen("http://toolserver.org/~daniel/WikiSense/CommonSense.php?%s" % parameters)        
-        
+        parameters = urllib.urlencode({'i' : imagepage.titleWithoutNamespace().encode('utf-8'), 'r' : 'on', 'go-clean' : 'Find+Categories', 'cl' : 'li'})
         commonsenseRe = re.compile('^#COMMONSENSE(.*)#USAGE(\s)+((?P<usage>(\d)+))(.*)#KEYWORDS(\s)+((?P<keywords>(\d)+))(.*)#CATEGORIES(\s)+((?P<catnum>(\d)+))\s(?P<cats>(.*))\s#GALLERIES(\s)+((?P<galnum>(\d)+))(.*)#EOF$', re.MULTILINE + re.DOTALL)
-        matches = commonsenseRe.search(commonsHelperPage.read())
+        gotInfo = False;
+
+        while(not gotInfo):
+            try:
+                commonsHelperPage = urllib.urlopen("http://toolserver.org/~daniel/WikiSense/CommonSense.php?%s" % parameters)
+                matches = commonsenseRe.search(commonsHelperPage.read().decode('utf-8'))
+                gotInfo = True;
+            except IOError:
+                wikipedia.output(u'Got an IOError, let's try again')
+            except socket.timeout:
+                wikipedia.output(u'Got a timeout, let's try again')                
+
         if matches:
             if(matches.group('catnum') > 0):
                 return matches.group('cats').splitlines()
         else:            
             return []
-    def filterCommonsHelperCats(self, currentCats, commonshelperCats):
+    def filterCats(self, currentCats, commonshelperCats):
         '''
-        Remove the current categories from the suggestions.
+        Remove the current categories from the suggestions and remove blacklisted cats.
         '''
         result = []
-        currentCatsSet = set(currentCats)
+        toFilter = ""
+
+        for cat in currentCats:
+            cat = cat.replace('_',' ')
+            toFilter = toFilter + "[[Category:" + cat + "]]\n"
         for cat in commonshelperCats:
             cat = cat.replace('_',' ')
-            if cat not in currentCatsSet:
+            toFilter = toFilter + "[[Category:" + cat + "]]\n"
+        parameters = urllib.urlencode({'source' : toFilter.encode('utf-8'), 'bot' : '1'})
+        filterCategoriesPage = urllib.urlopen("http://toolserver.org/~multichill/filtercats.php?%s" % parameters)
+        #print filterCategoriesPage.read().decode('utf-8')
+        filterCategoriesRe = re.compile('[[Category:([^]]*)]]')
+        result = filterCategoriesRe.findall(filterCategoriesPage.read().decode('utf-8'))
+        #print matches
+        '''
+            if matches:
+                print "Found matches"
+                if(matches.group('cats') > 0):
+                    print matches.group('cats').splitlines()
+                    '''
+        '''
+                
+        #currentCatsSet = set(currentCats)
+        for cat in commonshelperCats:
+            cat = cat.replace('_',' ')
+            if (cat not in currentCatsSet) and (cat not in category_blacklist):
                 result.append(cat)
-        return result
+        '''
+        return list(set(result))
def getImage(self, imagepage):
         '''
@@ -132,176 +173,51 @@
         file.close()
return image                           
-    
-class userThread (threading.Thread):    
-    def __init__ (self, prefetchToUserQueue, userToPutQueue):
-        self.prefetchToUserQueue = prefetchToUserQueue
-        self.userToPutQueue = userToPutQueue
-        self.item = None
-        self.imagepage = None
-        self.image = None
-        self.currentCats = []
-        self.commonshelperCats = []
-        self.newCats = []        
-        self.skip = 0
-        
-        threading.Thread.__init__ ( self )
-        
-    def run(self):
-        
-        global exitProgram
-        while exitProgram == 0:
-            self.item = self.prefetchToUserQueue.get()           
-            if self.item is None:                                
-                break
-            else:
-                (self.imagepage, self.currentCats, self.commonshelperCats, self.image) = self.item
-                (self.skip, exitProgram, self.newCats) = Tkdialog(self.imagepage.titleWithoutNamespace(), self.image, self.imagepage.get(), self.currentCats, self.commonshelperCats, self.imagepage.permalink()).run()
-                if not self.skip:
-                    self.userToPutQueue.put((self.imagepage, self.newCats))
-        self.userToPutQueue.put(None)
-        return
-
 class putThread (threading.Thread):
     '''
     class to do the actual changing of images
     '''
     def __init__ (self, userToPutQueue):        
         self.userToPutQueue = userToPutQueue
+        self.item = None
+        self.imagepage = None
+        self.newcats = []
+        self.newtext = u''
         threading.Thread.__init__ ( self )
def run(self):        
-        item = None
-        imagepage = None
-        newtext = u''
+
         while True:
-            item = self.userToPutQueue.get()            
-            if item is None:                
+            self.item = self.userToPutQueue.get()            
+            if self.item is None:                
                 break
             else:
-                (imagepage, newtext)=item
-                #wikipedia.showDiff(imagepage.get(), newtext)
-                #imagepage.put(newtext, u'Recat by bot')
+                (self.imagepage, self.newcats)=self.item
+                self.newtext = wikipedia.removeCategoryLinks(self.imagepage.get(), self.imagepage.site())
+                self.newtext = self.removeUncat(self.newtext) + u'{{subst:chc}}\n'
+                for category in self.newcats:
+                    self.newtext = self.newtext + u'[[Category:' + category + u']]\n'
+                    
+                wikipedia.showDiff(self.imagepage.get(), self.newtext)
+                #Should change this for not autonomous operation.
+                #self.imagepage.put(self.newtext, u'Image is categorized by a bot using data from [[Commons:Tools#CommonSense|CommonSense]]')
         return
-
-class Tkdialog:
-    '''
-    The Tk dialog presented to the user. The user can add and remove categories. View the images in a webbrowser, skip the image, apply the changes or exit.
-    '''
-    def __init__(self, image_title = u'', image = None, pagetext=u'', currentCats = [], commonsHelperCats = [], url= ''):
-        self.newCats = currentCats
-        self.url = url
-        self.skip = 0
-        self.exit = 0
-        self.root=Tk()
-        self.root.title(image_title)
-        w = 1600 #image1.width()
-        h = 900 #image1.height()
-        x = 50
-        y = 50
-        self.root.geometry("%dx%d+%d+%d" % (w, h, x, y))
-        self.root.rowconfigure( 0, weight = 1 )
-        self.root.columnconfigure( 0, weight = 1 )
-
-        image1 = self.getImage(image, 800, 600)                 
-               
-        panel1 = Label(self.root, image=image1)
-        panel1.grid(row=0, column=2, rowspan=11, columnspan=11)
-        panel1.image = image1                
-
-        self.cb = []
-        self.cbstate = []
-        self.entry = []
-        for i in range(0, 10):
-            self.cbstate.append(IntVar())
-            self.cb.append(Checkbutton (self.root, variable=self.cbstate[i]))
-            self.entry.append(Entry (self.root, width=50))            
-            self.cb[i].grid(row=i, column=0)
-            self.entry[i].grid(row=i, column=1)
-
-        catindex = 0
-
-        for cat in currentCats:
-            self.entry[catindex].delete(0, END)            
-            self.entry[catindex].insert(0, cat)
-            self.entry[catindex].config(background="green")
-            self.cb[catindex].select()
-            catindex = catindex + 1            
+    def removeUncat(self, oldtext = u''):
+        result = u''
+        result = re.sub(u'{{\s*([Uu]ncat(egori[sz]ed( image)?)?|[Nn]ocat|[Nn]eedscategory)[^}]*}}', u'', oldtext)        
+        result = re.sub(u'<!-- Remove this line once you have added categories -->', u'', result)
+        #wikipedia.showDiff(oldtext, result)
+        return result
-        for cat in commonsHelperCats:
-            self.entry[catindex].delete(0, END)            
-            self.entry[catindex].insert(0, cat)
-            self.entry[catindex].config(background="yellow")
-            self.cb[catindex].deselect()
-            catindex = catindex + 1
-
-        textarea=Text(self.root)
-        scrollbar=Scrollbar(self.root, orient=VERTICAL)
-        textarea.insert(END, pagetext.encode('utf-8'))
-        textarea.config(state=DISABLED, height=12, width=80, padx=0, pady=0, wrap=WORD, yscrollcommand=scrollbar.set)
-
-        scrollbar.config(command=textarea.yview)
-
-        browserButton=Button(self.root, text='View in browser', command=self.openInBrowser)
-        skipButton=Button(self.root, text="Skip", command=self.skipFile)
-        okButton=Button(self.root, text="OK", command=self.okFile)
-        exitButton=Button(self.root, text="EXIT", command=self.exitProgram)      
-        
-        textarea.grid(row=12, column=4, columnspan=10)
-        scrollbar.grid(row=12, column=3)
-        
-        okButton.grid(row=20, column=0, rowspan=2)
-        skipButton.grid(row=20, column=1, rowspan=2)
-        browserButton.grid(row=20, column=2, rowspan=2)
-        exitButton.grid(row=20, column=3, rowspan=2)
-                
-    def getImage(self, image, width, height):        
-        output = StringIO.StringIO(image)
-        image2 = Image.open(output)
-        image2.thumbnail((width, height))
-        imageTk = ImageTk.PhotoImage(image2)
-        return imageTk
-
-    def okFile(self):
-        '''
-        The user pressed the OK button.
-        '''
-        #Read what the user has entered
-        self.root.destroy()
-        
-    def skipFile(self):
-        '''
-        The user pressed the Skip button.
-        '''
-        self.skip=1
-        self.root.destroy()
-        
-    def openInBrowser(self):
-        '''
-        The user pressed the View in browser button.
-        '''
-        webbrowser.open(self.url)
-
-    def exitProgram(self):
-        '''
-        Exit the program
-        '''
-        self.skip=1
-        self.exit=1
-        self.root.destroy()
-        
-    def run (self):
-        self.root.mainloop()
-        return (self.skip, self.exit, self.newCats)
-    
 def main(args):
     '''
     Main loop. Get a generator. Set up the 3 threads and the 2 queue's and fire everything up.
     '''
     generator = None;
+    
     genFactory = pagegenerators.GeneratorFactory()
-
+    #global autonomous
     site = wikipedia.getSite(u'commons', u'commons')
     wikipedia.setSite(site)
     for arg in wikipedia.handleArgs():
@@ -310,25 +226,25 @@
                 generator = [wikipedia.Page(site, wikipedia.input(u'What page do you want to use?'))]
             else:
                 generator = [wikipedia.Page(site, arg[6:])]
-        elif arg == '-always':
-            always = True
+        elif arg == '-autonomous':
+            autonomous = True
         else:
             generator = genFactory.handleArg(arg)
     if not generator:
         generator = pagegenerators.CategorizedPageGenerator(catlib.Category(site, u'Category:Media needing categories'))
         #raise add_text.NoEnoughData('You have to specify the generator you want to use for the script!')
-    prefetchToUserQueue=Queue.Queue()    
-    userToPutQueue=Queue.Queue()
+
+    prefetchToPutQueue=Queue.Queue()
# Start the prefetch thread
-    prefetchThread(generator, prefetchToUserQueue).start()
+    prefetchThread(generator, prefetchToPutQueue).start()
# Start the user thread
-    userThread(prefetchToUserQueue, userToPutQueue).start()
+    # userThread(prefetchToUserQueue, userToPutQueue).start()
# Start the put thread
-    putThread(userToPutQueue).start()
+    putThread(prefetchToPutQueue).start()
# Wait for all threads to finish    
     for openthread in threading.enumerate():