Revision: 8530 Author: xqt Date: 2010-09-12 12:19:44 +0000 (Sun, 12 Sep 2010)
Log Message: ----------- import wikipedia as pywikibot for merging to rewrite branch; use wikipedia lib for categoryRedirects; some speedup fixes
Modified Paths: -------------- trunk/pywikipedia/imagerecat.py
Modified: trunk/pywikipedia/imagerecat.py =================================================================== --- trunk/pywikipedia/imagerecat.py 2010-09-12 11:41:43 UTC (rev 8529) +++ trunk/pywikipedia/imagerecat.py 2010-09-12 12:19:44 UTC (rev 8530) @@ -3,13 +3,16 @@ Program to (re)categorize images at commons.
The program uses commonshelper for category suggestions. -It takes the suggestions and the current categories. Put the categories through some filters and adds the result. +It takes the suggestions and the current categories. Put the categories through +some filters and adds the result.
The following command line parameters are supported:
--onlyfilter Don't use Commonsense to get categories, just filter the current categories +-onlyfilter Don't use Commonsense to get categories, just filter the current + categories
--onlyuncat Only work on uncategorized images. Will prevent the bot from working on an image multiple times. +-onlyuncat Only work on uncategorized images. Will prevent the bot from + working on an image multiple times.
-hint Give Commonsense a hint. For example -hint:li.wikipedia.org @@ -32,9 +35,11 @@ import urllib, httplib, urllib2 import catlib import time -import wikipedia, config -import pagegenerators, StringIO import socket +import StringIO +import wikipedia as pywikibot +import config +import pagegenerators
category_blacklist = [] countries = [] @@ -49,29 +54,33 @@ global category_blacklist global countries
- blacklistPage = wikipedia.Page(wikipedia.getSite(u'commons', u'commons'), u'User:Multichill/Category_blacklist') + blacklistPage = pywikibot.Page(pywikibot.getSite(u'commons', u'commons'), + u'User:Multichill/Category_blacklist') for cat in blacklistPage.linkedPages(): category_blacklist.append(cat.titleWithoutNamespace())
- countryPage = wikipedia.Page(wikipedia.getSite(u'commons', u'commons'), u'User:Multichill/Countries') + countryPage = pywikibot.Page(pywikibot.getSite(u'commons', u'commons'), + u'User:Multichill/Countries') for country in countryPage.linkedPages(): countries.append(country.titleWithoutNamespace()) return
def categorizeImages(generator, onlyFilter, onlyUncat): + ''' Loop over all images in generator and try to categorize them. Get + category suggestions from CommonSense. + ''' - Loop over all images in generator and try to categorize them. Get category suggestions from CommonSense. - ''' for page in generator: - if page.exists() and (page.namespace() == 6) and (not page.isRedirectPage()): - imagepage = wikipedia.ImagePage(page.site(), page.title()) - wikipedia.output(u'Working on ' + imagepage.title()) + if page.exists() and (page.namespace() == 6) and \ + (not page.isRedirectPage()): + imagepage = pywikibot.ImagePage(page.site(), page.title()) + pywikibot.output(u'Working on ' + imagepage.title())
- if(onlyUncat and not(u'Uncategorized' in imagepage.templates())): - wikipedia.output(u'No Uncategorized template found') + if (onlyUncat and not(u'Uncategorized' in imagepage.templates())): + pywikibot.output(u'No Uncategorized template found') else: currentCats = getCurrentCats(imagepage) - if(onlyFilter): + if onlyFilter: commonshelperCats = [] usage = [] galleries = [] @@ -81,25 +90,22 @@
if (len(newcats) > 0 and not(set(currentCats)==set(newcats))): for cat in newcats: - wikipedia.output(u' Found new cat: ' + cat); - saveImagePage(imagepage, newcats, usage, galleries, onlyFilter) + pywikibot.output(u' Found new cat: ' + cat); + saveImagePage(imagepage, newcats, usage, galleries, + onlyFilter)
- - def getCurrentCats(imagepage): - ''' - Get the categories currently on the image - ''' + ''' Get the categories currently on the image ''' result = [] for cat in imagepage.categories(): result.append(cat.titleWithoutNamespace()) return list(set(result))
- def getCommonshelperCats(imagepage): + ''' Get category suggestions from CommonSense. Parse them and return a list + of suggestions. + ''' - Get category suggestions from CommonSense. Parse them and return a list of suggestions. - ''' commonshelperCats = [] usage = [] galleries = [] @@ -116,82 +122,76 @@ else: #Cant handle other sites atm return ([], [], []) - + commonsenseRe = re.compile('^#COMMONSENSE(.*)#USAGE(\s)+((?P<usagenum>(\d)+))\s(?P<usage>(.*))\s#KEYWORDS(\s)+((?P<keywords>(\d)+))(.*)#CATEGORIES(\s)+((?P<catnum>(\d)+))\s(?P<cats>(.*))\s#GALLERIES(\s)+((?P<galnum>(\d)+))\s(?P<gals>(.*))\s(.*)#EOF$', re.MULTILINE + re.DOTALL)
gotInfo = False matches = None maxtries = 10 tries = 0 - while(not gotInfo): try: if ( tries < maxtries ): tries = tries + 1 - commonsHelperPage = urllib.urlopen("http://toolserver.org/~daniel/WikiSense/CommonSense.php?%s" % parameters) - matches = commonsenseRe.search(commonsHelperPage.read().decode('utf-8')) + commonsHelperPage = urllib.urlopen( + "http://toolserver.org/~daniel/WikiSense/CommonSense.php?%s" % parameters) + matches = commonsenseRe.search( + commonsHelperPage.read().decode('utf-8')) gotInfo = True else: break except IOError: - wikipedia.output(u'Got an IOError, let\'s try again') + pywikibot.output(u'Got an IOError, let\'s try again') except socket.timeout: - wikipedia.output(u'Got a timeout, let\'s try again') + pywikibot.output(u'Got a timeout, let\'s try again')
if (matches and gotInfo): - if(matches.group('usagenum') > 0): + if (matches.group('usagenum') > 0): used = matches.group('usage').splitlines() for use in used: usage= usage + getUsage(use) - #wikipedia.output(use) - if(matches.group('catnum') > 0): + #pywikibot.output(use) + if (matches.group('catnum') > 0): cats = matches.group('cats').splitlines() for cat in cats: commonshelperCats.append(cat.replace('_',' ')) - wikipedia.output(u'category : ' + cat) - if(matches.group('galnum') > 0): + pywikibot.output(u'category : ' + cat) + if (matches.group('galnum') > 0): gals = matches.group('gals').splitlines() for gal in gals: galleries.append(gal.replace('_',' ')) - wikipedia.output(u'gallery : ' + gal) + pywikibot.output(u'gallery : ' + gal) commonshelperCats = list(set(commonshelperCats)) galleries = list(set(galleries)) for (lang, project, article) in usage: - wikipedia.output(lang + project + article) - + pywikibot.output(lang + project + article) return (commonshelperCats, usage, galleries)
def getUsage(use): - ''' - Parse the Commonsense output to get the usage - ''' + ''' Parse the Commonsense output to get the usage ''' result = [] lang = '' project = '' article = '' - usageRe = re.compile('^(?P<lang>([\w]+)).(?P<project>([\w]+)).org:(?P<articles>\s(.*))') + usageRe = re.compile( + '^(?P<lang>([\w]+)).(?P<project>([\w]+)).org:(?P<articles>\s(.*))') matches = usageRe.search(use) if matches: - if(matches.group('lang')): + if (matches.group('lang')): lang = matches.group('lang') - #wikipedia.output(lang) - if(matches.group('project')): + #pywikibot.output(lang) + if (matches.group('project')): project = matches.group('project') - #wikipedia.output(project) - if(matches.group('articles')): + #pywikibot.output(project) + if (matches.group('articles')): articles = matches.group('articles') - #wikipedia.output(articles) + #pywikibot.output(articles) for article in articles.split(): result.append((lang, project, article)) - return result - -
def applyAllFilters(categories): - ''' - Apply all filters on categories. - ''' + ''' Apply all filters on categories. ''' result = [] result = filterBlacklist(categories) result = filterDisambiguation(result) @@ -200,11 +200,8 @@ result = filterParents(result) return result
- def filterBlacklist(categories): - ''' - Filter out categories which are on the blacklist. - ''' + ''' Filter out categories which are on the blacklist. ''' result = [] for cat in categories: cat = cat.replace('_', ' ') @@ -212,40 +209,36 @@ result.append(cat) return list(set(result))
- def filterDisambiguation(categories): - ''' - Filter out disambiguation categories. - ''' + ''' Filter out disambiguation categories. ''' result = [] for cat in categories: - if(not wikipedia.Page(wikipedia.getSite(u'commons', u'commons'), u'Category:' + cat).isDisambig()): + if (not pywikibot.Page(pywikibot.getSite(u'commons', u'commons'), + cat, defaultNamespace=14).isDisambig()): result.append(cat) return result
def followRedirects(categories): - ''' - If a category is a redirect, replace the category with the target. - ''' + ''' If a category is a redirect, replace the category with the target. ''' result = [] for cat in categories: - categoryPage = wikipedia.Page(wikipedia.getSite(u'commons', u'commons'), u'Category:' + cat) - if u'Category redirect' in categoryPage.templates() or u'Seecat' in categoryPage.templates(): - for template in categoryPage.templatesWithParams(): - if ((template[0]==u'Category redirect' or template[0]==u'Seecat') and (len(template[1]) > 0)): - result.append(template[1][0]) + categoryPage = pywikibot.Page(pywikibot.getSite(u'commons', u'commons'), + cat, defaultNamespace=14) + if categoryPage.isCategoryRedirect(): + result.append(getCategoryRedirectTarget(), + categoryPage.titleWithoutNamespace()) else: result.append(cat) return result
- def filterCountries(categories): + ''' Try to filter out ...by country categories. + First make a list of any ...by country categories and try to find some + countries. If a by country category has a subcategoy containing one of the + countries found, add it. The ...by country categories remain in the set and + should be filtered out by filterParents. + ''' - Try to filter out ...by country categories. - First make a list of any ...by country categories and try to find some countries. - If a by country category has a subcategoy containing one of the countries found, add it. - The ...by country categories remain in the set and should be filtered out by filterParents. - ''' result = categories listByCountry = [] listCountries = [] @@ -259,76 +252,70 @@ for country in countries: if country in cat: listCountries.append(country) - if(len(listByCountry) > 0): for bc in listByCountry: - category = catlib.Category(wikipedia.getSite(u'commons', u'commons'), u'Category:' + bc) + category = catlib.Category( + pywikibot.getSite(u'commons', u'commons'), u'Category:' + bc) for subcategory in category.subcategories(): for country in listCountries: if (subcategory.titleWithoutNamespace().endswith(country)): result.append(subcategory.titleWithoutNamespace()) - return list(set(result))
def filterParents(categories): + ''' Remove all parent categories from the set to prevent overcategorization. + ''' - Remove all parent categories from the set to prevent overcategorization. - ''' result = [] toFilter = u'' - for cat in categories: cat = cat.replace('_',' ') toFilter = toFilter + "[[Category:" + cat + "]]\n" - #try: - parameters = urllib.urlencode({'source' : toFilter.encode('utf-8'), 'bot' : '1'}) + parameters = urllib.urlencode({'source' : toFilter.encode('utf-8'), + 'bot' : '1'}) filterCategoriesRe = re.compile('\[\[Category:([^\]]*)\]\]') try: - filterCategoriesPage = urllib.urlopen("http://toolserver.org/~multichill/filtercats.php?%s" % parameters) - result = filterCategoriesRe.findall(filterCategoriesPage.read().decode('utf-8')) + filterCategoriesPage = urllib.urlopen( + "http://toolserver.org/~multichill/filtercats.php?%s" % parameters) + result = filterCategoriesRe.findall( + filterCategoriesPage.read().decode('utf-8')) except IOError: #Something is wrong, forget about this filter and just return the input return categories - + if not result: #Is empty, dont want to remove all categories return categories return result
- def saveImagePage(imagepage, newcats, usage, galleries, onlyFilter): - ''' - Remove the old categories and add the new categories to the image. - ''' - newtext = wikipedia.removeCategoryLinks(imagepage.get(), imagepage.site()) - + ''' Remove the old categories and add the new categories to the image. ''' + newtext = pywikibot.removeCategoryLinks(imagepage.get(), imagepage.site()) if not(onlyFilter): newtext = removeTemplates(newtext) - newtext = newtext + getCheckCategoriesTemplate(usage, galleries, len(newcats)) - + newtext = newtext + getCheckCategoriesTemplate(usage, galleries, + len(newcats)) newtext = newtext + u'\n' - for category in newcats: newtext = newtext + u'[[Category:' + category + u']]\n' - if(onlyFilter): comment = u'Filtering categories' else: comment = u'Image is categorized by a bot using data from [[Commons:Tools#CommonSense|CommonSense]]' - - wikipedia.showDiff(imagepage.get(), newtext) + pywikibot.showDiff(imagepage.get(), newtext) imagepage.put(newtext, comment) return
- def removeTemplates(oldtext = u''): ''' Remove {{Uncategorized}} and {{Check categories}} templates ''' result = u'' - result = re.sub(u'{{\s*([Uu]ncat(egori[sz]ed( image)?)?|[Nn]ocat|[Nn]eedscategory)[^}]*}}', u'', oldtext) - result = re.sub(u'<!-- Remove this line once you have added categories -->', u'', result) + result = re.sub( + u'{{\s*([Uu]ncat(egori[sz]ed( image)?)?|[Nn]ocat|[Nn]eedscategory)[^}]*}}', u'', oldtext) + result = re.sub(u'<!-- Remove this line once you have added categories -->', + u'', result) result = re.sub(u'{{\s*[Cc]heck categories[^}]*}}', u'', result) return result
@@ -337,25 +324,21 @@ Build the check categories template with all parameters ''' result = u'{{Check categories|year={{subst:CURRENTYEAR}}|month={{subst:CURRENTMONTHNAME}}|day={{subst:CURRENTDAY}}\n' - usageCounter = 1 for (lang, project, article) in usage: - result = result + u'|lang' + str(usageCounter) + u'=' + lang - result = result + u'|wiki' + str(usageCounter) + u'=' + project - result = result + u'|article' + str(usageCounter) + u'=' + article - result = result + u'\n' + result += u'|lang%d=' % (usageCounter, lang) + result += u'|wiki%d=' % (usageCounter, project) + result += u'|article%d=' % (usageCounter, article) + result += u'\n' usageCounter = usageCounter + 1 - galleryCounter = 1 for gallery in galleries: - result = result + u'|gallery' + str(galleryCounter) + u'=' + gallery.replace('_', ' ') + u'\n' + result += u'|gallery%d=%s' % (galleryCounter, gallery.replace('_', ' ')) + u'\n' galleryCounter = galleryCounter + 1 - - result = result + u'|ncats='+ str(ncats) + u'\n' - result = result + u'}}\n' + result += u'|ncats=%d\n' % ncats + result += u'}}\n' return result
- def main(args): ''' Main loop. Get a generator and options. Work on all images in the generator. @@ -368,9 +351,9 @@ global search_wikis global hint_wiki
- site = wikipedia.getSite(u'commons', u'commons') - wikipedia.setSite(site) - for arg in wikipedia.handleArgs(): + site = pywikibot.getSite(u'commons', u'commons') + pywikibot.setSite(site) + for arg in pywikibot.handleArgs(): if arg == '-onlyfilter': onlyFilter = True elif arg == '-onlyuncat': @@ -384,15 +367,15 @@
generator = genFactory.getCombinedGenerator() if not generator: - generator = pagegenerators.CategorizedPageGenerator(catlib.Category(site, u'Category:Media needing categories'), recurse=True) - + generator = pagegenerators.CategorizedPageGenerator( + catlib.Category(site, u'Category:Media needing categories'), + recurse=True) initLists() categorizeImages(generator, onlyFilter, onlyUncat) + pywikibot.output(u'All done')
- wikipedia.output(u'All done') - if __name__ == "__main__": try: main(sys.argv[1:]) finally: - wikipedia.stopme() + pywikibot.stopme()
pywikipedia-svn@lists.wikimedia.org