jenkins-bot has submitted this change and it was merged.
Change subject: Port imagerecat.py from compat
......................................................................
Port imagerecat.py from compat
Change-Id: I9e5f5a1fb1823ec85378d3bf9d7c67592139face
---
A scripts/imagerecat.py
1 file changed, 465 insertions(+), 0 deletions(-)
Approvals:
Merlijn van Deen: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/imagerecat.py b/scripts/imagerecat.py
new file mode 100644
index 0000000..7c91106
--- /dev/null
+++ b/scripts/imagerecat.py
@@ -0,0 +1,465 @@
+# -*- coding: utf-8 -*-
+"""
+Program to (re)categorize images at commons.
+
+The program uses commonshelper for category suggestions.
+It takes the suggestions and the current categories, runs the categories
+through some filters, and adds the result.
+
+The following command line parameters are supported:
+
+-onlyfilter Don't use Commonsense to get categories, just filter the current
+ categories
+
+-onlyuncat Only work on uncategorized images. Will prevent the bot from
+ working on an image multiple times.
+
+-hint Give Commonsense a hint.
+ For example -hint:li.wikipedia.org
+
+-onlyhint Give Commonsense a hint. And only work on this hint.
+ Syntax is the same as -hint. Some special hints are possible:
+ _20 : Work on the top 20 wikipedia's
+ _80 : Work on the top 80 wikipedia's
+ wps : Work on all wikipedia's
+
+"""
+__version__ = '$Id$'
+#
+# (C) Multichill 2008-2011
+# (C) Pywikipedia bot team, 2008-2013
+#
+# Distributed under the terms of the MIT license.
+#
+#
+import os, sys, re, codecs
+import urllib, httplib, urllib2
+import time
+import socket
+import StringIO
+import pywikibot
+from pywikibot import config
+from pywikibot import pagegenerators
+import xml.etree.ElementTree
+
+category_blacklist = []
+countries = []
+
+search_wikis=u'_20'
+hint_wiki=u''
+
+def initLists():
+ '''
+ Get the list of countries & the blacklist from Commons.
+ '''
+ global category_blacklist
+ global countries
+
+ blacklistPage = pywikibot.Page(pywikibot.Site(u'commons',
u'commons'),
+ u'User:Multichill/Category_blacklist')
+ for cat in blacklistPage.linkedPages():
+ category_blacklist.append(cat.title(withNamespace=False))
+
+ countryPage = pywikibot.Page(pywikibot.Site(u'commons', u'commons'),
+ u'User:Multichill/Countries')
+ for country in countryPage.linkedPages():
+ countries.append(country.title(withNamespace=False))
+ return
+
+def categorizeImages(generator, onlyFilter, onlyUncat):
+ ''' Loop over all images in generator and try to categorize them. Get
+ category suggestions from CommonSense.
+
+ '''
+ for page in generator:
+ if page.exists() and (page.namespace() == 6) and \
+ (not page.isRedirectPage()):
+ imagepage = pywikibot.ImagePage(page.site(), page.title())
+ pywikibot.output(u'Working on ' + imagepage.title())
+
+ if (onlyUncat and not(u'Uncategorized' in imagepage.templates())):
+ pywikibot.output(u'No Uncategorized template found')
+ else:
+ currentCats = getCurrentCats(imagepage)
+ if onlyFilter:
+ commonshelperCats = []
+ usage = []
+ galleries = []
+ else:
+ (commonshelperCats, usage, galleries) =
getCommonshelperCats(imagepage)
+ newcats = applyAllFilters(commonshelperCats+currentCats)
+
+ if (len(newcats) > 0 and not(set(currentCats)==set(newcats))):
+ for cat in newcats:
+ pywikibot.output(u' Found new cat: ' + cat);
+ saveImagePage(imagepage, newcats, usage, galleries,
+ onlyFilter)
+
+def getCurrentCats(imagepage):
+ ''' Get the categories currently on the image '''
+ result = []
+ for cat in imagepage.categories():
+ result.append(cat.title(withNamespace=False))
+ return list(set(result))
+
+def getCommonshelperCats(imagepage):
+ ''' Get category suggestions from CommonSense. Parse them and return a
list
+ of suggestions.
+
+ '''
+ commonshelperCats = []
+ usage = []
+ galleries = []
+
+ global search_wikis
+ global hint_wiki
+ site = imagepage.site
+ lang = site.language()
+ family = site.family.name
+ if lang==u'commons' and family==u'commons':
+ parameters = urllib.urlencode(
+ {'i' : imagepage.title(withNamespace=False).encode('utf-8'),
+ 'r' : 'on',
+ 'go-clean' : 'Find+Categories',
+ 'p' : search_wikis,
+ 'cl' : hint_wiki})
+ elif family==u'wikipedia':
+ parameters = urllib.urlencode(
+ {'i' : imagepage.title(withNamespace=False).encode('utf-8'),
+ 'r' : 'on',
+ 'go-move' : 'Find+Categories',
+ 'p' : search_wikis,
+ 'cl' : hint_wiki,
+ 'w' : lang})
+ else:
+ #Cant handle other sites atm
+ return ([], [], [])
+
+ commonsenseRe =
re.compile('^#COMMONSENSE(.*)#USAGE(\s)+\((?P<usagenum>(\d)+)\)\s(?P<usage>(.*))\s#KEYWORDS(\s)+\((?P<keywords>(\d)+)\)(.*)#CATEGORIES(\s)+\((?P<catnum>(\d)+)\)\s(?P<cats>(.*))\s#GALLERIES(\s)+\((?P<galnum>(\d)+)\)\s(?P<gals>(.*))\s(.*)#EOF$',
re.MULTILINE + re.DOTALL)
+
+ gotInfo = False
+ matches = None
+ maxtries = 10
+ tries = 0
+ while(not gotInfo):
+ try:
+ if ( tries < maxtries ):
+ tries = tries + 1
+ commonsHelperPage = urllib.urlopen(
+
"http://toolserver.org/~daniel/WikiSense/CommonSense.php?%s" % parameters)
+ matches = commonsenseRe.search(
+ commonsHelperPage.read().decode('utf-8'))
+ gotInfo = True
+ else:
+ break
+ except IOError:
+ pywikibot.output(u'Got an IOError, let\'s try again')
+ except socket.timeout:
+ pywikibot.output(u'Got a timeout, let\'s try again')
+
+ if (matches and gotInfo):
+ if (matches.group('usagenum') > 0):
+ used = matches.group('usage').splitlines()
+ for use in used:
+ usage= usage + getUsage(use)
+ #pywikibot.output(use)
+ if (matches.group('catnum') > 0):
+ cats = matches.group('cats').splitlines()
+ for cat in cats:
+ commonshelperCats.append(cat.replace('_', ' '))
+ pywikibot.output(u'category : ' + cat)
+ if (matches.group('galnum') > 0):
+ gals = matches.group('gals').splitlines()
+ for gal in gals:
+ galleries.append(gal.replace('_', ' '))
+ pywikibot.output(u'gallery : ' + gal)
+ commonshelperCats = list(set(commonshelperCats))
+ galleries = list(set(galleries))
+ for (lang, project, article) in usage:
+ pywikibot.output(lang + project + article)
+ return (commonshelperCats, usage, galleries)
+
+def getOpenStreetMapCats(latitude, longitude):
+ '''
+ Get a list of location categories based on the OSM nomatim tool
+ '''
+ result = []
+ locationList = getOpenStreetMap(latitude, longitude)
+ for i in range(0, len(locationList)):
+ #print 'Working on ' + locationList[i]
+ if i <= len(locationList)-3:
+ category = getCategoryByName(name=locationList[i], parent=locationList[i+1],
grandparent=locationList[i+2])
+ elif i == len(locationList)-2:
+ category = getCategoryByName(name=locationList[i], parent=locationList[i+1])
+ else:
+ category = getCategoryByName(name=locationList[i])
+ if category and not category==u'':
+ result.append(category)
+ #print result
+ return result
+
+
+def getOpenStreetMap(latitude, longitude):
+ '''
+ Get the result from
http://nominatim.openstreetmap.org/reverse
+ and put it in a list of tuples to play around with
+ '''
+ result = []
+ gotInfo = False
+ parameters = urllib.urlencode({'lat' : latitude, 'lon' : longitude,
'accept-language' : 'en'})
+ while(not gotInfo):
+ try:
+ page =
urllib.urlopen("http://nominatim.openstreetmap.org/reverse?format=xml&… %
parameters)
+ et = xml.etree.ElementTree.parse(page)
+ gotInfo=True
+ except IOError:
+ pywikibot.output(u'Got an IOError, let\'s try again')
+ time.sleep(30)
+ except socket.timeout:
+ pywikibot.output(u'Got a timeout, let\'s try again')
+ time.sleep(30)
+ validParts = [u'hamlet', u'village', u'city',
u'county', u'country']
+ invalidParts = [u'path', u'road', u'suburb',
u'state', u'country_code']
+ addressparts = et.find('addressparts')
+ #xml.etree.ElementTree.dump(et)
+
+ for addresspart in addressparts.getchildren():
+ if addresspart.tag in validParts:
+ result.append(addresspart.text)
+ elif addresspart.tag in invalidParts:
+ pywikibot.output(u'Dropping %s, %s' % (addresspart.tag, addresspart.text))
+ else:
+ pywikibot.warning(u'%s, %s is not in addressparts lists' % (addresspart.tag,
addresspart.text))
+ #print result
+ return result
+
+def getCategoryByName(name, parent=u'', grandparent=u''):
+
+ if not parent==u'':
+ workname = name.strip() + u',_' + parent.strip()
+ workcat = pywikibot.Category(
+ pywikibot.Site(u'commons', u'commons'), workname)
+ if workcat.exists():
+ return workname
+ if not grandparent==u'':
+ workname = name.strip() + u',_' + grandparent.strip()
+ workcat = pywikibot.Category(
+ pywikibot.Site(u'commons', u'commons'), workname)
+ if workcat.exists():
+ return workname
+ workname = name.strip()
+ workcat = pywikibot.Category(
+ pywikibot.Site(u'commons', u'commons'), workname)
+ if workcat.exists():
+ return workname
+ return u''
+
+
+def getUsage(use):
+ ''' Parse the Commonsense output to get the usage '''
+ result = []
+ lang = ''
+ project = ''
+ article = ''
+ usageRe = re.compile(
+
'^(?P<lang>([\w-]+))\.(?P<project>([\w]+))\.org:(?P<articles>\s(.*))')
+ matches = usageRe.search(use)
+ if matches:
+ if (matches.group('lang')):
+ lang = matches.group('lang')
+ #pywikibot.output(lang)
+ if (matches.group('project')):
+ project = matches.group('project')
+ #pywikibot.output(project)
+ if (matches.group('articles')):
+ articles = matches.group('articles')
+ #pywikibot.output(articles)
+ for article in articles.split():
+ result.append((lang, project, article))
+ return result
+
+def applyAllFilters(categories):
+ ''' Apply all filters on categories. '''
+ result = []
+ result = filterDisambiguation(categories)
+ result = followRedirects(result)
+ result = filterBlacklist(result)
+ result = filterCountries(result)
+ result = filterParents(result)
+ return result
+
+def filterBlacklist(categories):
+ ''' Filter out categories which are on the blacklist. '''
+ result = []
+ for cat in categories:
+ cat = cat.replace('_', ' ')
+ if (cat not in category_blacklist):
+ result.append(cat)
+ return list(set(result))
+
+def filterDisambiguation(categories):
+ ''' Filter out disambiguation categories. '''
+ result = []
+ for cat in categories:
+ if (not pywikibot.Page(pywikibot.Site(u'commons', u'commons'),
+ cat, ns=14).isDisambig()):
+ result.append(cat)
+ return result
+
+def followRedirects(categories):
+ ''' If a category is a redirect, replace the category with the target.
'''
+ result = []
+ for cat in categories:
+ categoryPage = pywikibot.Page(pywikibot.getSite(u'commons',
u'commons'),
+ cat, ns=14)
+ if categoryPage.isCategoryRedirect():
+ result.append(
+ categoryPage.getCategoryRedirectTarget().title(
+ withNamespace=False))
+ else:
+ result.append(cat)
+ return result
+
+def filterCountries(categories):
+ ''' Try to filter out ...by country categories.
+ First make a list of any ...by country categories and try to find some
+ countries. If a by country category has a subcategoy containing one of the
+ countries found, add it. The ...by country categories remain in the set and
+ should be filtered out by filterParents.
+
+ '''
+ result = categories
+ listByCountry = []
+ listCountries = []
+ for cat in categories:
+ if (cat.endswith(u'by country')):
+ listByCountry.append(cat)
+
+ #If cat contains 'by country' add it to the list
+ #If cat contains the name of a country add it to the list
+ else:
+ for country in countries:
+ if country in cat:
+ listCountries.append(country)
+ if(len(listByCountry) > 0):
+ for bc in listByCountry:
+ category = pywikibot.Category(
+ pywikibot.Site(u'commons', u'commons'),
u'Category:' + bc)
+ for subcategory in category.subcategories():
+ for country in listCountries:
+ if (subcategory.title(withNamespace=False).endswith(country)):
+ result.append(subcategory.title(withNamespace=False))
+ return list(set(result))
+
+def filterParents(categories):
+ ''' Remove all parent categories from the set to prevent
overcategorization.
+
+ '''
+ result = []
+ toFilter = u''
+ for cat in categories:
+ cat = cat.replace('_', ' ')
+ toFilter = toFilter + "[[Category:" + cat + "]]\n"
+ parameters = urllib.urlencode({'source' : toFilter.encode('utf-8'),
+ 'bot' : '1'})
+ filterCategoriesRe = re.compile('\[\[Category:([^\]]*)\]\]')
+ try:
+ filterCategoriesPage = urllib.urlopen(
+ "http://toolserver.org/~multichill/filtercats.php?%s" %
parameters)
+ result = filterCategoriesRe.findall(
+ filterCategoriesPage.read().decode('utf-8'))
+ except IOError:
+ #Something is wrong, forget about this filter and just return the input
+ return categories
+
+ if not result:
+ #Is empty, dont want to remove all categories
+ return categories
+ return result
+
+def saveImagePage(imagepage, newcats, usage, galleries, onlyFilter):
+ ''' Remove the old categories and add the new categories to the image.
'''
+ newtext = pywikibot.removeCategoryLinks(imagepage.get(), imagepage.site())
+ if not(onlyFilter):
+ newtext = removeTemplates(newtext)
+ newtext = newtext + getCheckCategoriesTemplate(usage, galleries,
+ len(newcats))
+ newtext = newtext + u'\n'
+ for category in newcats:
+ newtext = newtext + u'[[Category:' + category + u']]\n'
+ if(onlyFilter):
+ comment = u'Filtering categories'
+ else:
+ comment = u'Image is categorized by a bot using data from
[[Commons:Tools#CommonSense|CommonSense]]'
+ pywikibot.showDiff(imagepage.get(), newtext)
+ imagepage.put(newtext, comment)
+ return
+
+def removeTemplates(oldtext = u''):
+ '''
+ Remove {{Uncategorized}} and {{Check categories}} templates
+ '''
+ result = u''
+ result = re.sub(
+ u'\{\{\s*([Uu]ncat(egori[sz]ed(
image)?)?|[Nn]ocat|[Nn]eedscategory)[^}]*\}\}', u'', oldtext)
+ result = re.sub(u'<!-- Remove this line once you have added categories
-->',
+ u'', result)
+ result = re.sub(u'\{\{\s*[Cc]heck categories[^}]*\}\}', u'', result)
+ return result
+
+def getCheckCategoriesTemplate(usage, galleries, ncats):
+ '''
+ Build the check categories template with all parameters
+ '''
+ result = u'{{Check
categories|year={{subst:CURRENTYEAR}}|month={{subst:CURRENTMONTHNAME}}|day={{subst:CURRENTDAY}}\n'
+ usageCounter = 1
+ for (lang, project, article) in usage:
+ result += u'|lang%d=%s' % (usageCounter, lang)
+ result += u'|wiki%d=%s' % (usageCounter, project)
+ result += u'|article%d=%s' % (usageCounter, article)
+ result += u'\n'
+ usageCounter = usageCounter + 1
+ galleryCounter = 1
+ for gallery in galleries:
+ result += u'|gallery%d=%s' % (galleryCounter,
gallery.replace('_', ' ')) + u'\n'
+ galleryCounter = galleryCounter + 1
+ result += u'|ncats=%d\n' % ncats
+ result += u'}}\n'
+ return result
+
+def main(args):
+ '''
+ Main loop. Get a generator and options. Work on all images in the generator.
+ '''
+ generator = None
+ onlyFilter = False
+ onlyUncat = False
+ genFactory = pagegenerators.GeneratorFactory()
+
+ global search_wikis
+ global hint_wiki
+
+ site = pywikibot.getSite(u'commons', u'commons')
+ for arg in pywikibot.handleArgs():
+ if arg == '-onlyfilter':
+ onlyFilter = True
+ elif arg == '-onlyuncat':
+ onlyUncat = True
+ elif arg.startswith('-hint:'):
+ hint_wiki = arg [len('-hint:'):]
+ elif arg.startswith('-onlyhint'):
+ search_wikis = arg [len('-onlyhint:'):]
+ else:
+ genFactory.handleArg(arg)
+
+ generator = genFactory.getCombinedGenerator()
+ if not generator:
+ generator = pagegenerators.CategorizedPageGenerator(
+ pywikibot.Category(site, u'Category:Media needing categories'),
+ recurse=True)
+ initLists()
+ categorizeImages(generator, onlyFilter, onlyUncat)
+ pywikibot.output(u'All done')
+
+if __name__ == "__main__":
+ main(sys.argv[1:])
--
To view, visit
https://gerrit.wikimedia.org/r/86621
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I9e5f5a1fb1823ec85378d3bf9d7c67592139face
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Legoktm <legoktm.wikipedia(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: jenkins-bot