jenkins-bot has submitted this change and it was merged.
Change subject: PEP8-ify most of imagerecat.py
......................................................................
PEP8-ify most of imagerecat.py
Change-Id: I5a4a7dd85eadf7233fe7e388644dadd5ba1e5ffc
---
M scripts/imagerecat.py
1 file changed, 105 insertions(+), 92 deletions(-)
Approvals:
Merlijn van Deen: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/imagerecat.py b/scripts/imagerecat.py
index 9a6ee16..df04190 100644
--- a/scripts/imagerecat.py
+++ b/scripts/imagerecat.py
@@ -45,13 +45,14 @@
category_blacklist = []
countries = []
-search_wikis=u'_20'
-hint_wiki=u''
+search_wikis = u'_20'
+hint_wiki = u''
+
def initLists():
- '''
+ """
Get the list of countries & the blacklist from Commons.
- '''
+ """
global category_blacklist
global countries
@@ -66,18 +67,19 @@
countries.append(country.title(withNamespace=False))
return
+
def categorizeImages(generator, onlyFilter, onlyUncat):
- ''' Loop over all images in generator and try to categorize them. Get
+ """ Loop over all images in generator and try to categorize them. Get
category suggestions from CommonSense.
- '''
+ """
for page in generator:
if page.exists() and (page.namespace() == 6) and \
(not page.isRedirectPage()):
imagepage = pywikibot.ImagePage(page.site(), page.title())
pywikibot.output(u'Working on ' + imagepage.title())
- if (onlyUncat and not(u'Uncategorized' in imagepage.templates())):
+ if onlyUncat and not(u'Uncategorized' in imagepage.templates()):
pywikibot.output(u'No Uncategorized template found')
else:
currentCats = getCurrentCats(imagepage)
@@ -87,26 +89,28 @@
galleries = []
else:
(commonshelperCats, usage, galleries) = getCommonshelperCats(imagepage)
- newcats = applyAllFilters(commonshelperCats+currentCats)
+ newcats = applyAllFilters(commonshelperCats + currentCats)
- if (len(newcats) > 0 and not(set(currentCats)==set(newcats))):
+ if len(newcats) > 0 and not(set(currentCats) == set(newcats)):
for cat in newcats:
- pywikibot.output(u' Found new cat: ' + cat);
+ pywikibot.output(u' Found new cat: ' + cat)
saveImagePage(imagepage, newcats, usage, galleries,
onlyFilter)
+
def getCurrentCats(imagepage):
- ''' Get the categories currently on the image '''
+ """ Get the categories currently on the image """
result = []
for cat in imagepage.categories():
result.append(cat.title(withNamespace=False))
return list(set(result))
+
def getCommonshelperCats(imagepage):
- ''' Get category suggestions from CommonSense. Parse them and return a list
+ """ Get category suggestions from CommonSense. Parse them and return a list
of suggestions.
- '''
+ """
commonshelperCats = []
usage = []
galleries = []
@@ -116,35 +120,35 @@
site = imagepage.site
lang = site.language()
family = site.family.name
- if lang==u'commons' and family==u'commons':
+ if lang == u'commons' and family == u'commons':
parameters = urllib.urlencode(
- {'i' : imagepage.title(withNamespace=False).encode('utf-8'),
- 'r' : 'on',
- 'go-clean' : 'Find+Categories',
- 'p' : search_wikis,
- 'cl' : hint_wiki})
- elif family==u'wikipedia':
+ {'i': imagepage.title(withNamespace=False).encode('utf-8'),
+ 'r': 'on',
+ 'go-clean': 'Find+Categories',
+ 'p': search_wikis,
+ 'cl': hint_wiki})
+ elif family == u'wikipedia':
parameters = urllib.urlencode(
- {'i' : imagepage.title(withNamespace=False).encode('utf-8'),
- 'r' : 'on',
- 'go-move' : 'Find+Categories',
- 'p' : search_wikis,
- 'cl' : hint_wiki,
- 'w' : lang})
+ {'i': imagepage.title(withNamespace=False).encode('utf-8'),
+ 'r': 'on',
+ 'go-move': 'Find+Categories',
+ 'p': search_wikis,
+ 'cl': hint_wiki,
+ 'w': lang})
else:
#Can't handle other sites atm
- return ([], [], [])
+ return [], [], []
- commonsenseRe = re.compile('^#COMMONSENSE(.*)#USAGE(\s)+\((?P<usagenum>(\d)+)\)\s(?P<usage>(.*))\s#KEYWORDS(\s)+\((?P<keywords>(\d)+)\)(.*)#CATEGORIES(\s)+\((?P<catnum>(\d)+)\)\s(?P<cats>(.*))\s#GALLERIES(\s)+\((?P<galnum>(\d)+)\)\s(?P<gals>(.*))\s(.*)#EOF$', re.MULTILINE + re.DOTALL)
+ commonsenseRe = re.compile('^#COMMONSENSE(.*)#USAGE(\s)+\((?P<usagenum>(\d)+)\)\s(?P<usage>(.*))\s#KEYWORDS(\s)+\((?P<keywords>(\d)+)\)(.*)#CATEGORIES(\s)+\((?P<catnum>(\d)+)\)\s(?P<cats>(.*))\s#GALLERIES(\s)+\((?P<galnum>(\d)+)\)\s(?P<gals>(.*))\s(.*)#EOF$', re.MULTILINE + re.DOTALL) # noqa
gotInfo = False
matches = None
maxtries = 10
tries = 0
- while(not gotInfo):
+ while not gotInfo:
try:
- if ( tries < maxtries ):
- tries = tries + 1
+ if tries < maxtries:
+ tries += 1
commonsHelperPage = urllib.urlopen(
"http://toolserver.org/~daniel/WikiSense/CommonSense.php?%s" % parameters)
matches = commonsenseRe.search(
@@ -157,18 +161,18 @@
except socket.timeout:
pywikibot.output(u'Got a timeout, let\'s try again')
- if (matches and gotInfo):
- if (matches.group('usagenum') > 0):
+ if matches and gotInfo:
+ if matches.group('usagenum') > 0:
used = matches.group('usage').splitlines()
for use in used:
- usage= usage + getUsage(use)
+ usage = usage + getUsage(use)
#pywikibot.output(use)
- if (matches.group('catnum') > 0):
+ if matches.group('catnum') > 0:
cats = matches.group('cats').splitlines()
for cat in cats:
commonshelperCats.append(cat.replace('_', ' '))
pywikibot.output(u'category : ' + cat)
- if (matches.group('galnum') > 0):
+ if matches.group('galnum') > 0:
gals = matches.group('gals').splitlines()
for gal in gals:
galleries.append(gal.replace('_', ' '))
@@ -177,41 +181,45 @@
galleries = list(set(galleries))
for (lang, project, article) in usage:
pywikibot.output(lang + project + article)
- return (commonshelperCats, usage, galleries)
+ return commonshelperCats, usage, galleries
+
def getOpenStreetMapCats(latitude, longitude):
- '''
+ """
Get a list of location categories based on the OSM Nominatim tool
- '''
+ """
result = []
locationList = getOpenStreetMap(latitude, longitude)
for i in range(0, len(locationList)):
#print 'Working on ' + locationList[i]
- if i <= len(locationList)-3:
- category = getCategoryByName(name=locationList[i], parent=locationList[i+1], grandparent=locationList[i+2])
- elif i == len(locationList)-2:
- category = getCategoryByName(name=locationList[i], parent=locationList[i+1])
+ if i <= len(locationList) - 3:
+ category = getCategoryByName(name=locationList[i],
+ parent=locationList[i + 1],
+ grandparent=locationList[i + 2])
+ elif i == len(locationList) - 2:
+ category = getCategoryByName(name=locationList[i],
+ parent=locationList[i + 1])
else:
category = getCategoryByName(name=locationList[i])
- if category and not category==u'':
+ if category and not category == u'':
result.append(category)
#print result
return result
def getOpenStreetMap(latitude, longitude):
- '''
+ """
Get the result from http://nominatim.openstreetmap.org/reverse
and put it in a list of tuples to play around with
- '''
+ """
result = []
gotInfo = False
- parameters = urllib.urlencode({'lat' : latitude, 'lon' : longitude, 'accept-language' : 'en'})
- while(not gotInfo):
+ parameters = urllib.urlencode({'lat': latitude, 'lon': longitude, 'accept-language': 'en'})
+ while not gotInfo:
try:
page = urllib.urlopen("http://nominatim.openstreetmap.org/reverse?format=xml&%s" % parameters)
et = xml.etree.ElementTree.parse(page)
- gotInfo=True
+ gotInfo = True
except IOError:
pywikibot.output(u'Got an IOError, let\'s try again')
time.sleep(30)
@@ -233,30 +241,28 @@
#print result
return result
+
def getCategoryByName(name, parent=u'', grandparent=u''):
- if not parent==u'':
+ if not parent == u'':
workname = name.strip() + u',_' + parent.strip()
- workcat = pywikibot.Category(
- pywikibot.Site(u'commons', u'commons'), workname)
+ workcat = pywikibot.Category(pywikibot.Site(u'commons', u'commons'), workname)
if workcat.exists():
return workname
- if not grandparent==u'':
+ if not grandparent == u'':
workname = name.strip() + u',_' + grandparent.strip()
- workcat = pywikibot.Category(
- pywikibot.Site(u'commons', u'commons'), workname)
+ workcat = pywikibot.Category(pywikibot.Site(u'commons', u'commons'), workname)
if workcat.exists():
return workname
workname = name.strip()
- workcat = pywikibot.Category(
- pywikibot.Site(u'commons', u'commons'), workname)
+ workcat = pywikibot.Category(pywikibot.Site(u'commons', u'commons'), workname)
if workcat.exists():
return workname
return u''
def getUsage(use):
- ''' Parse the Commonsense output to get the usage '''
+ """ Parse the Commonsense output to get the usage """
result = []
lang = ''
project = ''
@@ -265,21 +271,22 @@
'^(?P<lang>([\w-]+))\.(?P<project>([\w]+))\.org:(?P<articles>\s(.*))')
matches = usageRe.search(use)
if matches:
- if (matches.group('lang')):
+ if matches.group('lang'):
lang = matches.group('lang')
#pywikibot.output(lang)
- if (matches.group('project')):
+ if matches.group('project'):
project = matches.group('project')
#pywikibot.output(project)
- if (matches.group('articles')):
+ if matches.group('articles'):
articles = matches.group('articles')
#pywikibot.output(articles)
for article in articles.split():
result.append((lang, project, article))
return result
+
def applyAllFilters(categories):
- ''' Apply all filters on categories. '''
+ """ Apply all filters on categories. """
result = []
result = filterDisambiguation(categories)
result = followRedirects(result)
@@ -288,17 +295,19 @@
result = filterParents(result)
return result
+
def filterBlacklist(categories):
- ''' Filter out categories which are on the blacklist. '''
+ """ Filter out categories which are on the blacklist. """
result = []
for cat in categories:
cat = cat.replace('_', ' ')
- if (cat not in category_blacklist):
+ if not (cat in category_blacklist):
result.append(cat)
return list(set(result))
+
def filterDisambiguation(categories):
- ''' Filter out disambiguation categories. '''
+ """ Filter out disambiguation categories. """
result = []
for cat in categories:
if (not pywikibot.Page(pywikibot.Site(u'commons', u'commons'),
@@ -306,8 +315,9 @@
result.append(cat)
return result
+
def followRedirects(categories):
- ''' If a category is a redirect, replace the category with the target. '''
+ """ If a category is a redirect, replace the category with the target. """
result = []
for cat in categories:
categoryPage = pywikibot.Page(pywikibot.getSite(u'commons', u'commons'),
@@ -320,19 +330,20 @@
result.append(cat)
return result
+
def filterCountries(categories):
- ''' Try to filter out ...by country categories.
+ """ Try to filter out ...by country categories.
First make a list of any ...by country categories and try to find some
countries. If a by country category has a subcategory containing one of the
countries found, add it. The ...by country categories remain in the set and
should be filtered out by filterParents.
- '''
+ """
result = categories
listByCountry = []
listCountries = []
for cat in categories:
- if (cat.endswith(u'by country')):
+ if cat.endswith(u'by country'):
listByCountry.append(cat)
#If cat contains 'by country' add it to the list
@@ -341,27 +352,26 @@
for country in countries:
if country in cat:
listCountries.append(country)
- if(len(listByCountry) > 0):
+ if len(listByCountry) > 0:
for bc in listByCountry:
category = pywikibot.Category(
pywikibot.Site(u'commons', u'commons'), u'Category:' + bc)
for subcategory in category.subcategories():
for country in listCountries:
- if (subcategory.title(withNamespace=False).endswith(country)):
+ if subcategory.title(withNamespace=False).endswith(country):
result.append(subcategory.title(withNamespace=False))
return list(set(result))
-def filterParents(categories):
- ''' Remove all parent categories from the set to prevent overcategorization.
- '''
+def filterParents(categories):
+ """ Remove all parent categories from the set to prevent overcategorization. """
result = []
toFilter = u''
for cat in categories:
cat = cat.replace('_', ' ')
toFilter = toFilter + "[[Category:" + cat + "]]\n"
- parameters = urllib.urlencode({'source' : toFilter.encode('utf-8'),
- 'bot' : '1'})
+ parameters = urllib.urlencode({'source': toFilter.encode('utf-8'),
+ 'bot': '1'})
filterCategoriesRe = re.compile('\[\[Category:([^\]]*)\]\]')
try:
filterCategoriesPage = urllib.urlopen(
@@ -377,17 +387,18 @@
return categories
return result
+
def saveImagePage(imagepage, newcats, usage, galleries, onlyFilter):
- ''' Remove the old categories and add the new categories to the image. '''
+ """ Remove the old categories and add the new categories to the image. """
newtext = pywikibot.removeCategoryLinks(imagepage.get(), imagepage.site())
- if not(onlyFilter):
+ if not onlyFilter:
newtext = removeTemplates(newtext)
newtext = newtext + getCheckCategoriesTemplate(usage, galleries,
len(newcats))
- newtext = newtext + u'\n'
+ newtext += u'\n'
for category in newcats:
newtext = newtext + u'[[Category:' + category + u']]\n'
- if(onlyFilter):
+ if onlyFilter:
comment = u'Filtering categories'
else:
comment = u'Image is categorized by a bot using data from [[Commons:Tools#CommonSense|CommonSense]]'
@@ -395,11 +406,11 @@
imagepage.put(newtext, comment)
return
-def removeTemplates(oldtext = u''):
- '''
+
+def removeTemplates(oldtext=u''):
+ """
Remove {{Uncategorized}} and {{Check categories}} templates
- '''
- result = u''
+ """
result = re.sub(
u'\{\{\s*([Uu]ncat(egori[sz]ed( image)?)?|[Nn]ocat|[Nn]eedscategory)[^}]*\}\}', u'', oldtext)
result = re.sub(u'<!-- Remove this line once you have added categories -->',
@@ -407,10 +418,11 @@
result = re.sub(u'\{\{\s*[Cc]heck categories[^}]*\}\}', u'', result)
return result
+
def getCheckCategoriesTemplate(usage, galleries, ncats):
- '''
+ """
Build the check categories template with all parameters
- '''
+ """
result = u'{{Check categories|year={{subst:CURRENTYEAR}}|month={{subst:CURRENTMONTHNAME}}|day={{subst:CURRENTDAY}}\n'
usageCounter = 1
for (lang, project, article) in usage:
@@ -418,19 +430,20 @@
result += u'|wiki%d=%s' % (usageCounter, project)
result += u'|article%d=%s' % (usageCounter, article)
result += u'\n'
- usageCounter = usageCounter + 1
+ usageCounter += 1
galleryCounter = 1
for gallery in galleries:
result += u'|gallery%d=%s' % (galleryCounter, gallery.replace('_', ' ')) + u'\n'
- galleryCounter = galleryCounter + 1
+ galleryCounter += 1
result += u'|ncats=%d\n' % ncats
result += u'}}\n'
return result
+
def main(args):
- '''
+ """
Main loop. Get a generator and options. Work on all images in the generator.
- '''
+ """
generator = None
onlyFilter = False
onlyUncat = False
@@ -446,9 +459,9 @@
elif arg == '-onlyuncat':
onlyUncat = True
elif arg.startswith('-hint:'):
- hint_wiki = arg [len('-hint:'):]
+ hint_wiki = arg[len('-hint:'):]
elif arg.startswith('-onlyhint'):
- search_wikis = arg [len('-onlyhint:'):]
+ search_wikis = arg[len('-onlyhint:'):]
else:
genFactory.handleArg(arg)
--
To view, visit https://gerrit.wikimedia.org/r/86624
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I5a4a7dd85eadf7233fe7e388644dadd5ba1e5ffc
Gerrit-PatchSet: 2
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Legoktm <legoktm.wikipedia(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: jenkins-bot
jenkins-bot has submitted this change and it was merged.
Change subject: Port imagerecat.py from compat
......................................................................
Port imagerecat.py from compat
Change-Id: I9e5f5a1fb1823ec85378d3bf9d7c67592139face
---
A scripts/imagerecat.py
1 file changed, 465 insertions(+), 0 deletions(-)
Approvals:
Merlijn van Deen: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/imagerecat.py b/scripts/imagerecat.py
new file mode 100644
index 0000000..7c91106
--- /dev/null
+++ b/scripts/imagerecat.py
@@ -0,0 +1,465 @@
+# -*- coding: utf-8 -*-
+"""
+Program to (re)categorize images at commons.
+
+The program uses commonshelper for category suggestions.
+It takes the suggestions and the current categories. Put the categories through
+some filters and adds the result.
+
+The following command line parameters are supported:
+
+-onlyfilter Don't use Commonsense to get categories, just filter the current
+ categories
+
+-onlyuncat Only work on uncategorized images. Will prevent the bot from
+ working on an image multiple times.
+
+-hint Give Commonsense a hint.
+ For example -hint:li.wikipedia.org
+
+-onlyhint Give Commonsense a hint. And only work on this hint.
+ Syntax is the same as -hint. Some special hints are possible:
+ _20 : Work on the top 20 Wikipedias
+ _80 : Work on the top 80 Wikipedias
+ wps : Work on all Wikipedias
+
+"""
+__version__ = '$Id$'
+#
+# (C) Multichill 2008-2011
+# (C) Pywikipedia bot team, 2008-2013
+#
+# Distributed under the terms of the MIT license.
+#
+#
+import os, sys, re, codecs
+import urllib, httplib, urllib2
+import time
+import socket
+import StringIO
+import pywikibot
+from pywikibot import config
+from pywikibot import pagegenerators
+import xml.etree.ElementTree
+
+category_blacklist = []
+countries = []
+
+search_wikis=u'_20'
+hint_wiki=u''
+
+def initLists():
+ '''
+ Get the list of countries & the blacklist from Commons.
+ '''
+ global category_blacklist
+ global countries
+
+ blacklistPage = pywikibot.Page(pywikibot.Site(u'commons', u'commons'),
+ u'User:Multichill/Category_blacklist')
+ for cat in blacklistPage.linkedPages():
+ category_blacklist.append(cat.title(withNamespace=False))
+
+ countryPage = pywikibot.Page(pywikibot.Site(u'commons', u'commons'),
+ u'User:Multichill/Countries')
+ for country in countryPage.linkedPages():
+ countries.append(country.title(withNamespace=False))
+ return
+
+def categorizeImages(generator, onlyFilter, onlyUncat):
+ ''' Loop over all images in generator and try to categorize them. Get
+ category suggestions from CommonSense.
+
+ '''
+ for page in generator:
+ if page.exists() and (page.namespace() == 6) and \
+ (not page.isRedirectPage()):
+ imagepage = pywikibot.ImagePage(page.site(), page.title())
+ pywikibot.output(u'Working on ' + imagepage.title())
+
+ if (onlyUncat and not(u'Uncategorized' in imagepage.templates())):
+ pywikibot.output(u'No Uncategorized template found')
+ else:
+ currentCats = getCurrentCats(imagepage)
+ if onlyFilter:
+ commonshelperCats = []
+ usage = []
+ galleries = []
+ else:
+ (commonshelperCats, usage, galleries) = getCommonshelperCats(imagepage)
+ newcats = applyAllFilters(commonshelperCats+currentCats)
+
+ if (len(newcats) > 0 and not(set(currentCats)==set(newcats))):
+ for cat in newcats:
+ pywikibot.output(u' Found new cat: ' + cat);
+ saveImagePage(imagepage, newcats, usage, galleries,
+ onlyFilter)
+
+def getCurrentCats(imagepage):
+ ''' Get the categories currently on the image '''
+ result = []
+ for cat in imagepage.categories():
+ result.append(cat.title(withNamespace=False))
+ return list(set(result))
+
+def getCommonshelperCats(imagepage):
+ ''' Get category suggestions from CommonSense. Parse them and return a list
+ of suggestions.
+
+ '''
+ commonshelperCats = []
+ usage = []
+ galleries = []
+
+ global search_wikis
+ global hint_wiki
+ site = imagepage.site
+ lang = site.language()
+ family = site.family.name
+ if lang==u'commons' and family==u'commons':
+ parameters = urllib.urlencode(
+ {'i' : imagepage.title(withNamespace=False).encode('utf-8'),
+ 'r' : 'on',
+ 'go-clean' : 'Find+Categories',
+ 'p' : search_wikis,
+ 'cl' : hint_wiki})
+ elif family==u'wikipedia':
+ parameters = urllib.urlencode(
+ {'i' : imagepage.title(withNamespace=False).encode('utf-8'),
+ 'r' : 'on',
+ 'go-move' : 'Find+Categories',
+ 'p' : search_wikis,
+ 'cl' : hint_wiki,
+ 'w' : lang})
+ else:
+ #Can't handle other sites atm
+ return ([], [], [])
+
+ commonsenseRe = re.compile('^#COMMONSENSE(.*)#USAGE(\s)+\((?P<usagenum>(\d)+)\)\s(?P<usage>(.*))\s#KEYWORDS(\s)+\((?P<keywords>(\d)+)\)(.*)#CATEGORIES(\s)+\((?P<catnum>(\d)+)\)\s(?P<cats>(.*))\s#GALLERIES(\s)+\((?P<galnum>(\d)+)\)\s(?P<gals>(.*))\s(.*)#EOF$', re.MULTILINE + re.DOTALL)
+
+ gotInfo = False
+ matches = None
+ maxtries = 10
+ tries = 0
+ while(not gotInfo):
+ try:
+ if ( tries < maxtries ):
+ tries = tries + 1
+ commonsHelperPage = urllib.urlopen(
+ "http://toolserver.org/~daniel/WikiSense/CommonSense.php?%s" % parameters)
+ matches = commonsenseRe.search(
+ commonsHelperPage.read().decode('utf-8'))
+ gotInfo = True
+ else:
+ break
+ except IOError:
+ pywikibot.output(u'Got an IOError, let\'s try again')
+ except socket.timeout:
+ pywikibot.output(u'Got a timeout, let\'s try again')
+
+ if (matches and gotInfo):
+ if (matches.group('usagenum') > 0):
+ used = matches.group('usage').splitlines()
+ for use in used:
+ usage= usage + getUsage(use)
+ #pywikibot.output(use)
+ if (matches.group('catnum') > 0):
+ cats = matches.group('cats').splitlines()
+ for cat in cats:
+ commonshelperCats.append(cat.replace('_', ' '))
+ pywikibot.output(u'category : ' + cat)
+ if (matches.group('galnum') > 0):
+ gals = matches.group('gals').splitlines()
+ for gal in gals:
+ galleries.append(gal.replace('_', ' '))
+ pywikibot.output(u'gallery : ' + gal)
+ commonshelperCats = list(set(commonshelperCats))
+ galleries = list(set(galleries))
+ for (lang, project, article) in usage:
+ pywikibot.output(lang + project + article)
+ return (commonshelperCats, usage, galleries)
+
+def getOpenStreetMapCats(latitude, longitude):
+ '''
+ Get a list of location categories based on the OSM Nominatim tool
+ '''
+ result = []
+ locationList = getOpenStreetMap(latitude, longitude)
+ for i in range(0, len(locationList)):
+ #print 'Working on ' + locationList[i]
+ if i <= len(locationList)-3:
+ category = getCategoryByName(name=locationList[i], parent=locationList[i+1], grandparent=locationList[i+2])
+ elif i == len(locationList)-2:
+ category = getCategoryByName(name=locationList[i], parent=locationList[i+1])
+ else:
+ category = getCategoryByName(name=locationList[i])
+ if category and not category==u'':
+ result.append(category)
+ #print result
+ return result
+
+
+def getOpenStreetMap(latitude, longitude):
+ '''
+ Get the result from http://nominatim.openstreetmap.org/reverse
+ and put it in a list of tuples to play around with
+ '''
+ result = []
+ gotInfo = False
+ parameters = urllib.urlencode({'lat' : latitude, 'lon' : longitude, 'accept-language' : 'en'})
+ while(not gotInfo):
+ try:
+ page = urllib.urlopen("http://nominatim.openstreetmap.org/reverse?format=xml&%s" % parameters)
+ et = xml.etree.ElementTree.parse(page)
+ gotInfo=True
+ except IOError:
+ pywikibot.output(u'Got an IOError, let\'s try again')
+ time.sleep(30)
+ except socket.timeout:
+ pywikibot.output(u'Got a timeout, let\'s try again')
+ time.sleep(30)
+ validParts = [u'hamlet', u'village', u'city', u'county', u'country']
+ invalidParts = [u'path', u'road', u'suburb', u'state', u'country_code']
+ addressparts = et.find('addressparts')
+ #xml.etree.ElementTree.dump(et)
+
+ for addresspart in addressparts.getchildren():
+ if addresspart.tag in validParts:
+ result.append(addresspart.text)
+ elif addresspart.tag in invalidParts:
+ pywikibot.output(u'Dropping %s, %s' % (addresspart.tag, addresspart.text))
+ else:
+ pywikibot.warning(u'%s, %s is not in addressparts lists' % (addresspart.tag, addresspart.text))
+ #print result
+ return result
+
+def getCategoryByName(name, parent=u'', grandparent=u''):
+
+ if not parent==u'':
+ workname = name.strip() + u',_' + parent.strip()
+ workcat = pywikibot.Category(
+ pywikibot.Site(u'commons', u'commons'), workname)
+ if workcat.exists():
+ return workname
+ if not grandparent==u'':
+ workname = name.strip() + u',_' + grandparent.strip()
+ workcat = pywikibot.Category(
+ pywikibot.Site(u'commons', u'commons'), workname)
+ if workcat.exists():
+ return workname
+ workname = name.strip()
+ workcat = pywikibot.Category(
+ pywikibot.Site(u'commons', u'commons'), workname)
+ if workcat.exists():
+ return workname
+ return u''
+
+
+def getUsage(use):
+ ''' Parse the Commonsense output to get the usage '''
+ result = []
+ lang = ''
+ project = ''
+ article = ''
+ usageRe = re.compile(
+ '^(?P<lang>([\w-]+))\.(?P<project>([\w]+))\.org:(?P<articles>\s(.*))')
+ matches = usageRe.search(use)
+ if matches:
+ if (matches.group('lang')):
+ lang = matches.group('lang')
+ #pywikibot.output(lang)
+ if (matches.group('project')):
+ project = matches.group('project')
+ #pywikibot.output(project)
+ if (matches.group('articles')):
+ articles = matches.group('articles')
+ #pywikibot.output(articles)
+ for article in articles.split():
+ result.append((lang, project, article))
+ return result
+
+def applyAllFilters(categories):
+ ''' Apply all filters on categories. '''
+ result = []
+ result = filterDisambiguation(categories)
+ result = followRedirects(result)
+ result = filterBlacklist(result)
+ result = filterCountries(result)
+ result = filterParents(result)
+ return result
+
+def filterBlacklist(categories):
+ ''' Filter out categories which are on the blacklist. '''
+ result = []
+ for cat in categories:
+ cat = cat.replace('_', ' ')
+ if (cat not in category_blacklist):
+ result.append(cat)
+ return list(set(result))
+
+def filterDisambiguation(categories):
+ ''' Filter out disambiguation categories. '''
+ result = []
+ for cat in categories:
+ if (not pywikibot.Page(pywikibot.Site(u'commons', u'commons'),
+ cat, ns=14).isDisambig()):
+ result.append(cat)
+ return result
+
+def followRedirects(categories):
+ ''' If a category is a redirect, replace the category with the target. '''
+ result = []
+ for cat in categories:
+ categoryPage = pywikibot.Page(pywikibot.getSite(u'commons', u'commons'),
+ cat, ns=14)
+ if categoryPage.isCategoryRedirect():
+ result.append(
+ categoryPage.getCategoryRedirectTarget().title(
+ withNamespace=False))
+ else:
+ result.append(cat)
+ return result
+
+def filterCountries(categories):
+ ''' Try to filter out ...by country categories.
+ First make a list of any ...by country categories and try to find some
+ countries. If a by country category has a subcategory containing one of the
+ countries found, add it. The ...by country categories remain in the set and
+ should be filtered out by filterParents.
+
+ '''
+ result = categories
+ listByCountry = []
+ listCountries = []
+ for cat in categories:
+ if (cat.endswith(u'by country')):
+ listByCountry.append(cat)
+
+ #If cat contains 'by country' add it to the list
+ #If cat contains the name of a country add it to the list
+ else:
+ for country in countries:
+ if country in cat:
+ listCountries.append(country)
+ if(len(listByCountry) > 0):
+ for bc in listByCountry:
+ category = pywikibot.Category(
+ pywikibot.Site(u'commons', u'commons'), u'Category:' + bc)
+ for subcategory in category.subcategories():
+ for country in listCountries:
+ if (subcategory.title(withNamespace=False).endswith(country)):
+ result.append(subcategory.title(withNamespace=False))
+ return list(set(result))
+
+def filterParents(categories):
+ ''' Remove all parent categories from the set to prevent overcategorization.
+
+ '''
+ result = []
+ toFilter = u''
+ for cat in categories:
+ cat = cat.replace('_', ' ')
+ toFilter = toFilter + "[[Category:" + cat + "]]\n"
+ parameters = urllib.urlencode({'source' : toFilter.encode('utf-8'),
+ 'bot' : '1'})
+ filterCategoriesRe = re.compile('\[\[Category:([^\]]*)\]\]')
+ try:
+ filterCategoriesPage = urllib.urlopen(
+ "http://toolserver.org/~multichill/filtercats.php?%s" % parameters)
+ result = filterCategoriesRe.findall(
+ filterCategoriesPage.read().decode('utf-8'))
+ except IOError:
+ #Something is wrong, forget about this filter and just return the input
+ return categories
+
+ if not result:
+ #Is empty, don't want to remove all categories
+ return categories
+ return result
+
+def saveImagePage(imagepage, newcats, usage, galleries, onlyFilter):
+ ''' Remove the old categories and add the new categories to the image. '''
+ newtext = pywikibot.removeCategoryLinks(imagepage.get(), imagepage.site())
+ if not(onlyFilter):
+ newtext = removeTemplates(newtext)
+ newtext = newtext + getCheckCategoriesTemplate(usage, galleries,
+ len(newcats))
+ newtext = newtext + u'\n'
+ for category in newcats:
+ newtext = newtext + u'[[Category:' + category + u']]\n'
+ if(onlyFilter):
+ comment = u'Filtering categories'
+ else:
+ comment = u'Image is categorized by a bot using data from [[Commons:Tools#CommonSense|CommonSense]]'
+ pywikibot.showDiff(imagepage.get(), newtext)
+ imagepage.put(newtext, comment)
+ return
+
+def removeTemplates(oldtext = u''):
+ '''
+ Remove {{Uncategorized}} and {{Check categories}} templates
+ '''
+ result = u''
+ result = re.sub(
+ u'\{\{\s*([Uu]ncat(egori[sz]ed( image)?)?|[Nn]ocat|[Nn]eedscategory)[^}]*\}\}', u'', oldtext)
+ result = re.sub(u'<!-- Remove this line once you have added categories -->',
+ u'', result)
+ result = re.sub(u'\{\{\s*[Cc]heck categories[^}]*\}\}', u'', result)
+ return result
+
+def getCheckCategoriesTemplate(usage, galleries, ncats):
+ '''
+ Build the check categories template with all parameters
+ '''
+ result = u'{{Check categories|year={{subst:CURRENTYEAR}}|month={{subst:CURRENTMONTHNAME}}|day={{subst:CURRENTDAY}}\n'
+ usageCounter = 1
+ for (lang, project, article) in usage:
+ result += u'|lang%d=%s' % (usageCounter, lang)
+ result += u'|wiki%d=%s' % (usageCounter, project)
+ result += u'|article%d=%s' % (usageCounter, article)
+ result += u'\n'
+ usageCounter = usageCounter + 1
+ galleryCounter = 1
+ for gallery in galleries:
+ result += u'|gallery%d=%s' % (galleryCounter, gallery.replace('_', ' ')) + u'\n'
+ galleryCounter = galleryCounter + 1
+ result += u'|ncats=%d\n' % ncats
+ result += u'}}\n'
+ return result
+
+def main(args):
+ '''
+ Main loop. Get a generator and options. Work on all images in the generator.
+ '''
+ generator = None
+ onlyFilter = False
+ onlyUncat = False
+ genFactory = pagegenerators.GeneratorFactory()
+
+ global search_wikis
+ global hint_wiki
+
+ site = pywikibot.getSite(u'commons', u'commons')
+ for arg in pywikibot.handleArgs():
+ if arg == '-onlyfilter':
+ onlyFilter = True
+ elif arg == '-onlyuncat':
+ onlyUncat = True
+ elif arg.startswith('-hint:'):
+ hint_wiki = arg [len('-hint:'):]
+ elif arg.startswith('-onlyhint'):
+ search_wikis = arg [len('-onlyhint:'):]
+ else:
+ genFactory.handleArg(arg)
+
+ generator = genFactory.getCombinedGenerator()
+ if not generator:
+ generator = pagegenerators.CategorizedPageGenerator(
+ pywikibot.Category(site, u'Category:Media needing categories'),
+ recurse=True)
+ initLists()
+ categorizeImages(generator, onlyFilter, onlyUncat)
+ pywikibot.output(u'All done')
+
+if __name__ == "__main__":
+ main(sys.argv[1:])
--
To view, visit https://gerrit.wikimedia.org/r/86621
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I9e5f5a1fb1823ec85378d3bf9d7c67592139face
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Legoktm <legoktm.wikipedia(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: jenkins-bot