Revision: 5798
Author: multichill
Date: 2008-08-15 13:51:19 +0000 (Fri, 15 Aug 2008)
Log Message:
-----------
Building this program in a different way. It is only going to run autonomously. Still have to
remove the threading part; it's now kind of useless.
Modified Paths:
--------------
trunk/pywikipedia/imagerecat.py
Modified: trunk/pywikipedia/imagerecat.py
===================================================================
--- trunk/pywikipedia/imagerecat.py 2008-08-15 02:35:56 UTC (rev 5797)
+++ trunk/pywikipedia/imagerecat.py 2008-08-15 13:51:19 UTC (rev 5798)
@@ -32,29 +32,29 @@
# Distributed under the terms of the MIT license.
#
#
-
-from Tkinter import *
-from PIL import Image, ImageTk
import os, sys, re, codecs
import urllib, httplib, urllib2
-import catlib, thread, webbrowser
+import catlib, thread
import time, threading
import wikipedia, config
import pagegenerators, add_text, Queue, StringIO
-exitProgram = 0
+exitProgram = False
+#autonomous = False
+category_blacklist = [u'Hidden categories']
+
class prefetchThread (threading.Thread):
'''
Class to fetch al the info for the user. This thread gets the imagepage, the
commonshelper suggestions and the image.
The thread puts this item in a queue. When there are no more pages left the thread
puts a None object in the queue and exits.
'''
- def __init__ (self, generator, prefetchToUserQueue):
+ def __init__ (self, generator, prefetchToPutQueue):
'''
Get the thread ready
'''
self.generator = generator
- self.prefetchToUserQueue = prefetchToUserQueue
+ self.prefetchToPutQueue = prefetchToPutQueue
self.currentCats = []
self.commonshelperCats = []
self.image = None
@@ -63,19 +63,28 @@
threading.Thread.__init__ ( self )
def run(self):
-
- global exitProgram
+ global exitProgram
+ #global autonomous
for page in self.pregenerator:
- if exitProgram != 0:
+ if exitProgram:
break;
if page.exists() and (page.namespace() == 6) and (not page.isRedirectPage())
:
self.imagepage = wikipedia.ImagePage(page.site(), page.title())
self.imagepage.get()
+ wikipedia.output(u'Working on ' + self.imagepage.title());
self.currentCats = self.getCurrentCats(self.imagepage)
- self.commonshelperCats = self.filterCommonsHelperCats(self.currentCats,
self.getCommonshelperCats(self.imagepage))
- self.image = self.getImage(self.imagepage)
- self.prefetchToUserQueue.put((self.imagepage, self.currentCats,
self.commonshelperCats, self.image))
- self.prefetchToUserQueue.put(None)
+ self.commonshelperCats = self.filterCats(self.currentCats,
self.getCommonshelperCats(self.imagepage))
+
+ #if not autonomous:
+ # self.image = self.getImage(self.imagepage)
+ #self.prefetchToUserQueue.put((self.imagepage, self.currentCats,
self.commonshelperCats, self.image))
+
+ if len(self.commonshelperCats) > 0:
+ for cat in self.commonshelperCats:
+ wikipedia.output(u' Found new cat: ' + cat);
+ self.prefetchToPutQueue.put((self.imagepage,
self.commonshelperCats))
+
+ self.prefetchToPutQueue.put(None)
return
def getCurrentCats(self, imagepage):
@@ -91,29 +100,61 @@
'''
Get category suggestions from commonshelper. Parse them and return a list of
suggestions.
'''
- parameters = urllib.urlencode({'i' : imagepage.titleWithoutNamespace(),
'r' : 'on', 'go-clean' : 'Find+Categories'})
- commonsHelperPage =
urllib.urlopen("http://toolserver.org/~daniel/WikiSense/CommonSense.ph… %
parameters)
-
+ parameters = urllib.urlencode({'i' :
imagepage.titleWithoutNamespace().encode('utf-8'), 'r' : 'on',
'go-clean' : 'Find+Categories', 'cl' : 'li'})
commonsenseRe =
re.compile('^#COMMONSENSE(.*)#USAGE(\s)+\((?P<usage>(\d)+)\)(.*)#KEYWORDS(\s)+\((?P<keywords>(\d)+)\)(.*)#CATEGORIES(\s)+\((?P<catnum>(\d)+)\)\s(?P<cats>(.*))\s#GALLERIES(\s)+\((?P<galnum>(\d)+)\)(.*)#EOF$',
re.MULTILINE + re.DOTALL)
- matches = commonsenseRe.search(commonsHelperPage.read())
+ gotInfo = False;
+
+ while(not gotInfo):
+ try:
+ commonsHelperPage =
urllib.urlopen("http://toolserver.org/~daniel/WikiSense/CommonSense.ph… %
parameters)
+ matches =
commonsenseRe.search(commonsHelperPage.read().decode('utf-8'))
+ gotInfo = True;
+ except IOError:
+ wikipedia.output(u'Got an IOError, let\'s try again')
+ except socket.timeout:
+ wikipedia.output(u'Got a timeout, let\'s try again')
+
if matches:
if(matches.group('catnum') > 0):
return matches.group('cats').splitlines()
else:
return []
- def filterCommonsHelperCats(self, currentCats, commonshelperCats):
+ def filterCats(self, currentCats, commonshelperCats):
'''
- Remove the current categories from the suggestions.
+ Remove the current categories from the suggestions and remove blacklisted cats.
'''
result = []
- currentCatsSet = set(currentCats)
+ toFilter = ""
+
+ for cat in currentCats:
+ cat = cat.replace('_',' ')
+ toFilter = toFilter + "[[Category:" + cat + "]]\n"
for cat in commonshelperCats:
cat = cat.replace('_',' ')
- if cat not in currentCatsSet:
+ toFilter = toFilter + "[[Category:" + cat + "]]\n"
+ parameters = urllib.urlencode({'source' :
toFilter.encode('utf-8'), 'bot' : '1'})
+ filterCategoriesPage =
urllib.urlopen("http://toolserver.org/~multichill/filtercats.php?%s&qu… %
parameters)
+ #print filterCategoriesPage.read().decode('utf-8')
+ filterCategoriesRe = re.compile('\[\[Category:([^\]]*)\]\]')
+ result =
filterCategoriesRe.findall(filterCategoriesPage.read().decode('utf-8'))
+ #print matches
+ '''
+ if matches:
+ print "Found matches"
+ if(matches.group('cats') > 0):
+ print matches.group('cats').splitlines()
+ '''
+ '''
+
+ #currentCatsSet = set(currentCats)
+ for cat in commonshelperCats:
+ cat = cat.replace('_',' ')
+ if (cat not in currentCatsSet) and (cat not in category_blacklist):
result.append(cat)
- return result
+ '''
+ return list(set(result))
def getImage(self, imagepage):
'''
@@ -132,176 +173,51 @@
file.close()
return image
-
-class userThread (threading.Thread):
- def __init__ (self, prefetchToUserQueue, userToPutQueue):
- self.prefetchToUserQueue = prefetchToUserQueue
- self.userToPutQueue = userToPutQueue
- self.item = None
- self.imagepage = None
- self.image = None
- self.currentCats = []
- self.commonshelperCats = []
- self.newCats = []
- self.skip = 0
-
- threading.Thread.__init__ ( self )
-
- def run(self):
-
- global exitProgram
- while exitProgram == 0:
- self.item = self.prefetchToUserQueue.get()
- if self.item is None:
- break
- else:
- (self.imagepage, self.currentCats, self.commonshelperCats, self.image) =
self.item
- (self.skip, exitProgram, self.newCats) =
Tkdialog(self.imagepage.titleWithoutNamespace(), self.image, self.imagepage.get(),
self.currentCats, self.commonshelperCats, self.imagepage.permalink()).run()
- if not self.skip:
- self.userToPutQueue.put((self.imagepage, self.newCats))
- self.userToPutQueue.put(None)
- return
-
class putThread (threading.Thread):
'''
class to do the actual changing of images
'''
def __init__ (self, userToPutQueue):
self.userToPutQueue = userToPutQueue
+ self.item = None
+ self.imagepage = None
+ self.newcats = []
+ self.newtext = u''
threading.Thread.__init__ ( self )
def run(self):
- item = None
- imagepage = None
- newtext = u''
+
while True:
- item = self.userToPutQueue.get()
- if item is None:
+ self.item = self.userToPutQueue.get()
+ if self.item is None:
break
else:
- (imagepage, newtext)=item
- #wikipedia.showDiff(imagepage.get(), newtext)
- #imagepage.put(newtext, u'Recat by bot')
+ (self.imagepage, self.newcats)=self.item
+ self.newtext = wikipedia.removeCategoryLinks(self.imagepage.get(),
self.imagepage.site())
+ self.newtext = self.removeUncat(self.newtext) +
u'{{subst:chc}}\n'
+ for category in self.newcats:
+ self.newtext = self.newtext + u'[[Category:' + category +
u']]\n'
+
+ wikipedia.showDiff(self.imagepage.get(), self.newtext)
+ #Should change this for not autonomous operation.
+ #self.imagepage.put(self.newtext, u'Image is categorized by a bot
using data from [[Commons:Tools#CommonSense|CommonSense]]')
return
-
-class Tkdialog:
- '''
- The Tk dialog presented to the user. The user can add and remove categories. View the
images in a webbrowser, skip the image, apply the changes or exit.
- '''
- def __init__(self, image_title = u'', image = None, pagetext=u'',
currentCats = [], commonsHelperCats = [], url= ''):
- self.newCats = currentCats
- self.url = url
- self.skip = 0
- self.exit = 0
- self.root=Tk()
- self.root.title(image_title)
- w = 1600 #image1.width()
- h = 900 #image1.height()
- x = 50
- y = 50
- self.root.geometry("%dx%d+%d+%d" % (w, h, x, y))
- self.root.rowconfigure( 0, weight = 1 )
- self.root.columnconfigure( 0, weight = 1 )
-
- image1 = self.getImage(image, 800, 600)
-
- panel1 = Label(self.root, image=image1)
- panel1.grid(row=0, column=2, rowspan=11, columnspan=11)
- panel1.image = image1
-
- self.cb = []
- self.cbstate = []
- self.entry = []
- for i in range(0, 10):
- self.cbstate.append(IntVar())
- self.cb.append(Checkbutton (self.root, variable=self.cbstate[i]))
- self.entry.append(Entry (self.root, width=50))
- self.cb[i].grid(row=i, column=0)
- self.entry[i].grid(row=i, column=1)
-
- catindex = 0
-
- for cat in currentCats:
- self.entry[catindex].delete(0, END)
- self.entry[catindex].insert(0, cat)
- self.entry[catindex].config(background="green")
- self.cb[catindex].select()
- catindex = catindex + 1
+ def removeUncat(self, oldtext = u''):
+ result = u''
+ result = re.sub(u'\{\{\s*([Uu]ncat(egori[sz]ed(
image)?)?|[Nn]ocat|[Nn]eedscategory)[^}]*\}\}', u'', oldtext)
+ result = re.sub(u'<!-- Remove this line once you have added categories
-->', u'', result)
+ #wikipedia.showDiff(oldtext, result)
+ return result
- for cat in commonsHelperCats:
- self.entry[catindex].delete(0, END)
- self.entry[catindex].insert(0, cat)
- self.entry[catindex].config(background="yellow")
- self.cb[catindex].deselect()
- catindex = catindex + 1
-
- textarea=Text(self.root)
- scrollbar=Scrollbar(self.root, orient=VERTICAL)
- textarea.insert(END, pagetext.encode('utf-8'))
- textarea.config(state=DISABLED, height=12, width=80, padx=0, pady=0, wrap=WORD,
yscrollcommand=scrollbar.set)
-
- scrollbar.config(command=textarea.yview)
-
- browserButton=Button(self.root, text='View in browser',
command=self.openInBrowser)
- skipButton=Button(self.root, text="Skip", command=self.skipFile)
- okButton=Button(self.root, text="OK", command=self.okFile)
- exitButton=Button(self.root, text="EXIT", command=self.exitProgram)
-
- textarea.grid(row=12, column=4, columnspan=10)
- scrollbar.grid(row=12, column=3)
-
- okButton.grid(row=20, column=0, rowspan=2)
- skipButton.grid(row=20, column=1, rowspan=2)
- browserButton.grid(row=20, column=2, rowspan=2)
- exitButton.grid(row=20, column=3, rowspan=2)
-
- def getImage(self, image, width, height):
- output = StringIO.StringIO(image)
- image2 = Image.open(output)
- image2.thumbnail((width, height))
- imageTk = ImageTk.PhotoImage(image2)
- return imageTk
-
- def okFile(self):
- '''
- The user pressed the OK button.
- '''
- #Read what the user has entered
- self.root.destroy()
-
- def skipFile(self):
- '''
- The user pressed the Skip button.
- '''
- self.skip=1
- self.root.destroy()
-
- def openInBrowser(self):
- '''
- The user pressed the View in browser button.
- '''
- webbrowser.open(self.url)
-
- def exitProgram(self):
- '''
- Exit the program
- '''
- self.skip=1
- self.exit=1
- self.root.destroy()
-
- def run (self):
- self.root.mainloop()
- return (self.skip, self.exit, self.newCats)
-
def main(args):
'''
Main loop. Get a generator. Set up the 3 threads and the 2 queue's and fire
everything up.
'''
generator = None;
+
genFactory = pagegenerators.GeneratorFactory()
-
+ #global autonomous
site = wikipedia.getSite(u'commons', u'commons')
wikipedia.setSite(site)
for arg in wikipedia.handleArgs():
@@ -310,25 +226,25 @@
generator = [wikipedia.Page(site, wikipedia.input(u'What page do you
want to use?'))]
else:
generator = [wikipedia.Page(site, arg[6:])]
- elif arg == '-always':
- always = True
+ elif arg == '-autonomous':
+ autonomous = True
else:
generator = genFactory.handleArg(arg)
if not generator:
generator = pagegenerators.CategorizedPageGenerator(catlib.Category(site,
u'Category:Media needing categories'))
#raise add_text.NoEnoughData('You have to specify the generator you want to
use for the script!')
- prefetchToUserQueue=Queue.Queue()
- userToPutQueue=Queue.Queue()
+
+ prefetchToPutQueue=Queue.Queue()
# Start the prefetch thread
- prefetchThread(generator, prefetchToUserQueue).start()
+ prefetchThread(generator, prefetchToPutQueue).start()
# Start the user thread
- userThread(prefetchToUserQueue, userToPutQueue).start()
+ # userThread(prefetchToUserQueue, userToPutQueue).start()
# Start the put thread
- putThread(userToPutQueue).start()
+ putThread(prefetchToPutQueue).start()
# Wait for all threads to finish
for openthread in threading.enumerate():