Revision: 5816 Author: multichill Date: 2008-08-19 19:20:14 +0000 (Tue, 19 Aug 2008)
Log Message: ----------- Rewrite, removed all the threading. Made some initial filters.
Modified Paths: -------------- trunk/pywikipedia/imagerecat.py
Modified: trunk/pywikipedia/imagerecat.py =================================================================== --- trunk/pywikipedia/imagerecat.py 2008-08-19 12:05:23 UTC (rev 5815) +++ trunk/pywikipedia/imagerecat.py 2008-08-19 19:20:14 UTC (rev 5816) @@ -2,29 +2,9 @@ """ Program to (re)categorize images at commons.
-The program uses commonshelper for category suggestions. The program consists of three parts. +The program uses commonshelper for category suggestions. +It takes the suggestions and the current categories. Put the categories through some filters and add the result
-1. prefetchThread - Fetches all the information -2. userThread - Gets input from the user -3. putThread - modifies the images - -You need to install the Python Imaging Library http://www.pythonware.com/products/pil/ to get this program working - -The program is far from finished. The framework is there, but still a lot has to be implemented: -1. The prefetch thread - * Mostly finished. - * Should add some error handling to cope with a slow toolserver - * Should check if images with special chars work alright - * Parameter to dont use commonshelper? -2. The user thread - * Tkinter layout is awful atm - * Tkinter have to implement most of the interaction - * Tkinter category webbrowser link - * Tkinter something with category auto completion (like the javascript in the search box) -3. The put thread - * Nothing much to put atm - * Should remove the Uncategorized template (+ redirects) - * Should check if something is actually changed (set operations?) """ # # (C) Multichill 2008 @@ -34,183 +14,135 @@ # import os, sys, re, codecs import urllib, httplib, urllib2 -import catlib, thread -import time, threading +import catlib +import time import wikipedia, config -import pagegenerators, add_text, Queue, StringIO +import pagegenerators, StringIO import socket
-exitProgram = False -#autonomous = False - category_blacklist = [u'Hidden categories']
-class prefetchThread (threading.Thread): +def categorizeImages(generator): + for page in generator: + if page.exists() and (page.namespace() == 6) and (not page.isRedirectPage()): + imagepage = wikipedia.ImagePage(page.site(), page.title()) + #imagepage.get() + wikipedia.output(u'Working on ' + imagepage.title()); + currentCats = getCurrentCats(imagepage) + commonshelperCats = getCommonshelperCats(imagepage) + newcats = filterBlacklist(commonshelperCats+currentCats) + #newcats = filterDisambiguation(newcats) + #newcats = filterRedirects(newcats) + #newcats = filterCountries(newcats) + newcats = filterParents(newcats) + if len(newcats) > 0: + for cat in newcats: + wikipedia.output(u' Found new cat: ' + cat); + saveImagePage(imagepage, newcats) + + +def getCurrentCats(imagepage): ''' - Class to fetch al the info for the user. This thread gets the imagepage, the commonshelper suggestions and the image. - The thread puts this item in a queue. When there are no more pages left the thread puts a None object in the queue and exits. 
+ Get the categories currently on the image ''' - def __init__ (self, generator, prefetchToPutQueue): - ''' - Get the thread ready - ''' - self.generator = generator - self.prefetchToPutQueue = prefetchToPutQueue - self.currentCats = [] - self.commonshelperCats = [] - self.image = None - self.imagepage = None - self.pregenerator = pagegenerators.PreloadingGenerator(self.generator) - threading.Thread.__init__ ( self ) - - def run(self): - global exitProgram - #global autonomous - for page in self.pregenerator: - if exitProgram: - break; - if page.exists() and (page.namespace() == 6) and (not page.isRedirectPage()) : - self.imagepage = wikipedia.ImagePage(page.site(), page.title()) - self.imagepage.get() - wikipedia.output(u'Working on ' + self.imagepage.title()); - self.currentCats = self.getCurrentCats(self.imagepage) - self.commonshelperCats = self.filterCats(self.currentCats, self.getCommonshelperCats(self.imagepage)) - - #if not autonomous: - # self.image = self.getImage(self.imagepage) - #self.prefetchToUserQueue.put((self.imagepage, self.currentCats, self.commonshelperCats, self.image)) + result = [] + for cat in imagepage.categories(): + result.append(cat.titleWithoutNamespace()) + return list(set(result))
- if len(self.commonshelperCats) > 0: - for cat in self.commonshelperCats: - wikipedia.output(u' Found new cat: ' + cat); - self.prefetchToPutQueue.put((self.imagepage, self.commonshelperCats))
- self.prefetchToPutQueue.put(None) - return - - def getCurrentCats(self, imagepage): - ''' - Get the categories currently on the image - ''' - result = [] - for cat in imagepage.categories(): - result.append(cat.titleWithoutNamespace()) - return result - - def getCommonshelperCats(self, imagepage): - ''' - Get category suggestions from commonshelper. Parse them and return a list of suggestions. - ''' - parameters = urllib.urlencode({'i' : imagepage.titleWithoutNamespace().encode('utf-8'), 'r' : 'on', 'go-clean' : 'Find+Categories', 'cl' : 'li'}) - commonsenseRe = re.compile('^#COMMONSENSE(.*)#USAGE(\s)+((?P<usage>(\d)+))(.*)#KEYWORDS(\s)+((?P<keywords>(\d)+))(.*)#CATEGORIES(\s)+((?P<catnum>(\d)+))\s(?P<cats>(.*))\s#GALLERIES(\s)+((?P<galnum>(\d)+))(.*)#EOF$', re.MULTILINE + re.DOTALL) +def getCommonshelperCats(imagepage): + ''' + Get category suggestions from commonshelper. Parse them and return a list of suggestions. + ''' + result = [] + parameters = urllib.urlencode({'i' : imagepage.titleWithoutNamespace().encode('utf-8'), 'r' : 'on', 'go-clean' : 'Find+Categories', 'cl' : 'li'}) + commonsenseRe = re.compile('^#COMMONSENSE(.*)#USAGE(\s)+((?P<usage>(\d)+))(.*)#KEYWORDS(\s)+((?P<keywords>(\d)+))(.*)#CATEGORIES(\s)+((?P<catnum>(\d)+))\s(?P<cats>(.*))\s#GALLERIES(\s)+((?P<galnum>(\d)+))(.*)#EOF$', re.MULTILINE + re.DOTALL)
- gotInfo = False; + gotInfo = False;
- while(not gotInfo): - try: - commonsHelperPage = urllib.urlopen("http://toolserver.org/~daniel/WikiSense/CommonSense.php?%s" % parameters) - matches = commonsenseRe.search(commonsHelperPage.read().decode('utf-8')) - gotInfo = True; - except IOError: - wikipedia.output(u'Got an IOError, let\'s try again') - except socket.timeout: - wikipedia.output(u'Got a timeout, let\'s try again') + while(not gotInfo): + try: + commonsHelperPage = urllib.urlopen("http://toolserver.org/~daniel/WikiSense/CommonSense.php?%s" % parameters) + matches = commonsenseRe.search(commonsHelperPage.read().decode('utf-8')) + gotInfo = True + except IOError: + wikipedia.output(u'Got an IOError, let\'s try again') + except socket.timeout: + wikipedia.output(u'Got a timeout, let\'s try again')
- if matches: - if(matches.group('catnum') > 0): - return matches.group('cats').splitlines() - else: - return [] - - def filterCats(self, currentCats, commonshelperCats): - ''' - Remove the current categories from the suggestions and remove blacklisted cats. - ''' - result = [] - toFilter = "" + if matches: + if(matches.group('catnum') > 0): + categories = matches.group('cats').splitlines() + for cat in categories: + result.append(cat.replace('_',' ')) + + return list(set(result))
- for cat in currentCats: - cat = cat.replace('_',' ') - toFilter = toFilter + "[[Category:" + cat + "]]\n" - for cat in commonshelperCats: - cat = cat.replace('_',' ') - toFilter = toFilter + "[[Category:" + cat + "]]\n" - parameters = urllib.urlencode({'source' : toFilter.encode('utf-8'), 'bot' : '1'}) - filterCategoriesPage = urllib.urlopen("http://toolserver.org/~multichill/filtercats.php?%s" % parameters) - #print filterCategoriesPage.read().decode('utf-8') - filterCategoriesRe = re.compile('\[\[Category:([^\]]*)\]\]') - result = filterCategoriesRe.findall(filterCategoriesPage.read().decode('utf-8')) - #print matches - ''' - if matches: - print "Found matches" - if(matches.group('cats') > 0): - print matches.group('cats').splitlines() - ''' - ''' - - #currentCatsSet = set(currentCats) - for cat in commonshelperCats: - cat = cat.replace('_',' ') - if (cat not in currentCatsSet) and (cat not in category_blacklist): - result.append(cat) - ''' - return list(set(result)) - - def getImage(self, imagepage): - ''' - Get the image from the wiki - ''' - url = imagepage.fileUrl() - uo = wikipedia.MyURLopener() - - file = uo.open(url)
- if 'text/html' in file.info().getheader('Content-Type'): - wikipedia.output(u'Couldn\'t download the image: the requested URL was not found on this server.') - return - - image = file.read() - file.close() - - return image +def filterBlacklist(categories): + result = [] + for cat in categories: + if (cat not in category_blacklist): + result.append(cat) + return list(set(result))
-class putThread (threading.Thread): + +def filterDisambiguation(categories): + result = [] + return result + + +def filterRedirects(categories): + result = [] + return result + + +def filterCountries(categories): + result = [] + return result + + +def filterParents(categories): ''' - class to do the actual changing of images + Remove the current categories from the suggestions and remove blacklisted cats. ''' - def __init__ (self, userToPutQueue): - self.userToPutQueue = userToPutQueue - self.item = None - self.imagepage = None - self.newcats = [] - self.newtext = u'' - threading.Thread.__init__ ( self ) - - def run(self): + result = [] + toFilter = u''
- while True: - self.item = self.userToPutQueue.get() - if self.item is None: - break - else: - (self.imagepage, self.newcats)=self.item - self.newtext = wikipedia.removeCategoryLinks(self.imagepage.get(), self.imagepage.site()) - self.newtext = self.removeUncat(self.newtext) + u'{{subst:chc}}\n' - for category in self.newcats: - self.newtext = self.newtext + u'[[Category:' + category + u']]\n' - - wikipedia.showDiff(self.imagepage.get(), self.newtext) - #Should change this for not autonomous operation. - self.imagepage.put(self.newtext, u'Image is categorized by a bot using data from [[Commons:Tools#CommonSense|CommonSense]]') - return - def removeUncat(self, oldtext = u''): - result = u'' - result = re.sub(u'{{\s*([Uu]ncat(egori[sz]ed( image)?)?|[Nn]ocat|[Nn]eedscategory)[^}]*}}', u'', oldtext) - result = re.sub(u'<!-- Remove this line once you have added categories -->', u'', result) - #wikipedia.showDiff(oldtext, result) - return result + for cat in categories: + cat = cat.replace('_',' ') + toFilter = toFilter + "[[Category:" + cat + "]]\n" + #try: + parameters = urllib.urlencode({'source' : toFilter.encode('utf-8'), 'bot' : '1'}) + filterCategoriesPage = urllib.urlopen("http://toolserver.org/~multichill/filtercats.php?%s" % parameters) + #print filterCategoriesPage.read().decode('utf-8') + filterCategoriesRe = re.compile('\[\[Category:([^\]]*)\]\]') + result = filterCategoriesRe.findall(filterCategoriesPage.read().decode('utf-8')) + #except: + + return result + + +def saveImagePage(imagepage, newcats): + newtext = wikipedia.removeCategoryLinks(imagepage.get(), imagepage.site()) + newtext = removeTemplates(newtext) + u'{{subst:chc}}\n' + for category in newcats: + newtext = newtext + u'[[Category:' + category + u']]\n'
+ wikipedia.showDiff(imagepage.get(), newtext) + #imagepage.put(newtext, u'Image is categorized by a bot using data from [[Commons:Tools#CommonSense|CommonSense]]') + return + + +def removeTemplates(oldtext = u''): + result = u'' + result = re.sub(u'{{\s*([Uu]ncat(egori[sz]ed( image)?)?|[Nn]ocat|[Nn]eedscategory)[^}]*}}', u'', oldtext) + result = re.sub(u'<!-- Remove this line once you have added categories -->', u'', result) + result = re.sub(u'{{\s*[Cc]heck categories[^}]*}}', u'', oldtext) + return result + + def main(args): ''' Main loop. Get a generator. Set up the 3 threads and the 2 queue's and fire everything up. @@ -218,7 +150,7 @@ generator = None;
genFactory = pagegenerators.GeneratorFactory() - #global autonomous + site = wikipedia.getSite(u'commons', u'commons') wikipedia.setSite(site) for arg in wikipedia.handleArgs(): @@ -227,32 +159,15 @@ generator = [wikipedia.Page(site, wikipedia.input(u'What page do you want to use?'))] else: generator = [wikipedia.Page(site, arg[6:])] - elif arg == '-autonomous': - autonomous = True else: generator = genFactory.handleArg(arg) if not generator: - generator = pagegenerators.CategorizedPageGenerator(catlib.Category(site, u'Category:Media needing categories')) - #raise add_text.NoEnoughData('You have to specify the generator you want to use for the script!') + generator = pagegenerators.CategorizedPageGenerator(catlib.Category(site, u'Category:Media needing categories'), recurse=True)
+ categorizeImages(generator) + + wikipedia.output(u'All done')
- prefetchToPutQueue=Queue.Queue() - - # Start the prefetch thread - prefetchThread(generator, prefetchToPutQueue).start() - - # Start the user thread - # userThread(prefetchToUserQueue, userToPutQueue).start() - - # Start the put thread - putThread(prefetchToPutQueue).start() - - # Wait for all threads to finish - for openthread in threading.enumerate(): - if openthread != threading.currentThread(): - openthread.join() - wikipedia.output(u'All threads are done') - if __name__ == "__main__": try: main(sys.argv[1:])