jenkins-bot has submitted this change and it was merged.
Change subject: PEP8-ify most of imagerecat.py
......................................................................
PEP8-ify most of imagerecat.py
Change-Id: I5a4a7dd85eadf7233fe7e388644dadd5ba1e5ffc
---
M scripts/imagerecat.py
1 file changed, 105 insertions(+), 92 deletions(-)
Approvals:
Merlijn van Deen: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/imagerecat.py b/scripts/imagerecat.py
index 9a6ee16..df04190 100644
--- a/scripts/imagerecat.py
+++ b/scripts/imagerecat.py
@@ -45,13 +45,14 @@
category_blacklist = []
countries = []
-search_wikis=u'_20'
-hint_wiki=u''
+search_wikis = u'_20'
+hint_wiki = u''
+
def initLists():
- '''
+ """
Get the list of countries & the blacklist from Commons.
- '''
+ """
global category_blacklist
global countries
@@ -66,18 +67,19 @@
countries.append(country.title(withNamespace=False))
return
+
def categorizeImages(generator, onlyFilter, onlyUncat):
- ''' Loop over all images in generator and try to categorize them. Get
+ """ Loop over all images in generator and try to categorize them. Get
category suggestions from CommonSense.
- '''
+ """
for page in generator:
if page.exists() and (page.namespace() == 6) and \
(not page.isRedirectPage()):
imagepage = pywikibot.ImagePage(page.site(), page.title())
pywikibot.output(u'Working on ' + imagepage.title())
- if (onlyUncat and not(u'Uncategorized' in imagepage.templates())):
+ if onlyUncat and not(u'Uncategorized' in imagepage.templates()):
pywikibot.output(u'No Uncategorized template found')
else:
currentCats = getCurrentCats(imagepage)
@@ -87,26 +89,28 @@
galleries = []
else:
(commonshelperCats, usage, galleries) = getCommonshelperCats(imagepage)
- newcats = applyAllFilters(commonshelperCats+currentCats)
+ newcats = applyAllFilters(commonshelperCats + currentCats)
- if (len(newcats) > 0 and not(set(currentCats)==set(newcats))):
+ if len(newcats) > 0 and not(set(currentCats) == set(newcats)):
for cat in newcats:
- pywikibot.output(u' Found new cat: ' + cat);
+ pywikibot.output(u' Found new cat: ' + cat)
saveImagePage(imagepage, newcats, usage, galleries,
onlyFilter)
+
def getCurrentCats(imagepage):
- ''' Get the categories currently on the image '''
+ """ Get the categories currently on the image """
result = []
for cat in imagepage.categories():
result.append(cat.title(withNamespace=False))
return list(set(result))
+
def getCommonshelperCats(imagepage):
- ''' Get category suggestions from CommonSense. Parse them and return a list
+ """ Get category suggestions from CommonSense. Parse them and return a list
of suggestions.
- '''
+ """
commonshelperCats = []
usage = []
galleries = []
@@ -116,35 +120,35 @@
site = imagepage.site
lang = site.language()
family = site.family.name
- if lang==u'commons' and family==u'commons':
+ if lang == u'commons' and family == u'commons':
parameters = urllib.urlencode(
- {'i' : imagepage.title(withNamespace=False).encode('utf-8'),
- 'r' : 'on',
- 'go-clean' : 'Find+Categories',
- 'p' : search_wikis,
- 'cl' : hint_wiki})
- elif family==u'wikipedia':
+ {'i': imagepage.title(withNamespace=False).encode('utf-8'),
+ 'r': 'on',
+ 'go-clean': 'Find+Categories',
+ 'p': search_wikis,
+ 'cl': hint_wiki})
+ elif family == u'wikipedia':
parameters = urllib.urlencode(
- {'i' : imagepage.title(withNamespace=False).encode('utf-8'),
- 'r' : 'on',
- 'go-move' : 'Find+Categories',
- 'p' : search_wikis,
- 'cl' : hint_wiki,
- 'w' : lang})
+ {'i': imagepage.title(withNamespace=False).encode('utf-8'),
+ 'r': 'on',
+ 'go-move': 'Find+Categories',
+ 'p': search_wikis,
+ 'cl': hint_wiki,
+ 'w': lang})
else:
        #Can't handle other sites atm
- return ([], [], [])
+ return [], [], []
- commonsenseRe = re.compile('^#COMMONSENSE(.*)#USAGE(\s)+\((?P<usagenum>(\d)+)\)\s(?P<usage>(.*))\s#KEYWORDS(\s)+\((?P<keywords>(\d)+)\)(.*)#CATEGORIES(\s)+\((?P<catnum>(\d)+)\)\s(?P<cats>(.*))\s#GALLERIES(\s)+\((?P<galnum>(\d)+)\)\s(?P<gals>(.*))\s(.*)#EOF$', re.MULTILINE + re.DOTALL)
+ commonsenseRe = re.compile('^#COMMONSENSE(.*)#USAGE(\s)+\((?P<usagenum>(\d)+)\)\s(?P<usage>(.*))\s#KEYWORDS(\s)+\((?P<keywords>(\d)+)\)(.*)#CATEGORIES(\s)+\((?P<catnum>(\d)+)\)\s(?P<cats>(.*))\s#GALLERIES(\s)+\((?P<galnum>(\d)+)\)\s(?P<gals>(.*))\s(.*)#EOF$', re.MULTILINE + re.DOTALL) # noqa
gotInfo = False
matches = None
maxtries = 10
tries = 0
- while(not gotInfo):
+ while not gotInfo:
try:
- if ( tries < maxtries ):
- tries = tries + 1
+ if tries < maxtries:
+ tries += 1
commonsHelperPage = urllib.urlopen(
"http://toolserver.org/~daniel/WikiSense/CommonSense.php?%s" % parameters)
matches = commonsenseRe.search(
@@ -157,18 +161,18 @@
except socket.timeout:
pywikibot.output(u'Got a timeout, let\'s try again')
- if (matches and gotInfo):
- if (matches.group('usagenum') > 0):
+ if matches and gotInfo:
+ if matches.group('usagenum') > 0:
used = matches.group('usage').splitlines()
for use in used:
- usage= usage + getUsage(use)
+ usage = usage + getUsage(use)
#pywikibot.output(use)
- if (matches.group('catnum') > 0):
+ if matches.group('catnum') > 0:
cats = matches.group('cats').splitlines()
for cat in cats:
commonshelperCats.append(cat.replace('_', ' '))
pywikibot.output(u'category : ' + cat)
- if (matches.group('galnum') > 0):
+ if matches.group('galnum') > 0:
gals = matches.group('gals').splitlines()
for gal in gals:
galleries.append(gal.replace('_', ' '))
@@ -177,41 +181,45 @@
galleries = list(set(galleries))
for (lang, project, article) in usage:
pywikibot.output(lang + project + article)
- return (commonshelperCats, usage, galleries)
+ return commonshelperCats, usage, galleries
+
def getOpenStreetMapCats(latitude, longitude):
- '''
+ """
    Get a list of location categories based on the OSM nominatim tool
- '''
+ """
result = []
locationList = getOpenStreetMap(latitude, longitude)
for i in range(0, len(locationList)):
#print 'Working on ' + locationList[i]
- if i <= len(locationList)-3:
- category = getCategoryByName(name=locationList[i], parent=locationList[i+1], grandparent=locationList[i+2])
- elif i == len(locationList)-2:
- category = getCategoryByName(name=locationList[i], parent=locationList[i+1])
+ if i <= len(locationList) - 3:
+ category = getCategoryByName(name=locationList[i],
+ parent=locationList[i + 1],
+ grandparent=locationList[i + 2])
+ elif i == len(locationList) - 2:
+ category = getCategoryByName(name=locationList[i],
+ parent=locationList[i + 1])
else:
category = getCategoryByName(name=locationList[i])
- if category and not category==u'':
+ if category and not category == u'':
result.append(category)
#print result
return result
def getOpenStreetMap(latitude, longitude):
- '''
+ """
Get the result from http://nominatim.openstreetmap.org/reverse
and put it in a list of tuples to play around with
- '''
+ """
result = []
gotInfo = False
- parameters = urllib.urlencode({'lat' : latitude, 'lon' : longitude, 'accept-language' : 'en'})
- while(not gotInfo):
+ parameters = urllib.urlencode({'lat': latitude, 'lon': longitude, 'accept-language': 'en'})
+ while not gotInfo:
try:
page = urllib.urlopen("http://nominatim.openstreetmap.org/reverse?format=xml&%s" % parameters)
et = xml.etree.ElementTree.parse(page)
- gotInfo=True
+ gotInfo = True
except IOError:
pywikibot.output(u'Got an IOError, let\'s try again')
time.sleep(30)
@@ -233,30 +241,28 @@
#print result
return result
+
def getCategoryByName(name, parent=u'', grandparent=u''):
- if not parent==u'':
+ if not parent == u'':
workname = name.strip() + u',_' + parent.strip()
- workcat = pywikibot.Category(
- pywikibot.Site(u'commons', u'commons'), workname)
+ workcat = pywikibot.Category(pywikibot.Site(u'commons', u'commons'), workname)
if workcat.exists():
return workname
- if not grandparent==u'':
+ if not grandparent == u'':
workname = name.strip() + u',_' + grandparent.strip()
- workcat = pywikibot.Category(
- pywikibot.Site(u'commons', u'commons'), workname)
+ workcat = pywikibot.Category(pywikibot.Site(u'commons', u'commons'), workname)
if workcat.exists():
return workname
workname = name.strip()
- workcat = pywikibot.Category(
- pywikibot.Site(u'commons', u'commons'), workname)
+ workcat = pywikibot.Category(pywikibot.Site(u'commons', u'commons'), workname)
if workcat.exists():
return workname
return u''
def getUsage(use):
- ''' Parse the Commonsense output to get the usage '''
+ """ Parse the Commonsense output to get the usage """
result = []
lang = ''
project = ''
@@ -265,21 +271,22 @@
'^(?P<lang>([\w-]+))\.(?P<project>([\w]+))\.org:(?P<articles>\s(.*))')
matches = usageRe.search(use)
if matches:
- if (matches.group('lang')):
+ if matches.group('lang'):
lang = matches.group('lang')
#pywikibot.output(lang)
- if (matches.group('project')):
+ if matches.group('project'):
project = matches.group('project')
#pywikibot.output(project)
- if (matches.group('articles')):
+ if matches.group('articles'):
articles = matches.group('articles')
#pywikibot.output(articles)
for article in articles.split():
result.append((lang, project, article))
return result
+
def applyAllFilters(categories):
- ''' Apply all filters on categories. '''
+ """ Apply all filters on categories. """
result = []
result = filterDisambiguation(categories)
result = followRedirects(result)
@@ -288,17 +295,19 @@
result = filterParents(result)
return result
+
def filterBlacklist(categories):
- ''' Filter out categories which are on the blacklist. '''
+ """ Filter out categories which are on the blacklist. """
result = []
for cat in categories:
cat = cat.replace('_', ' ')
- if (cat not in category_blacklist):
+ if not (cat in category_blacklist):
result.append(cat)
return list(set(result))
+
def filterDisambiguation(categories):
- ''' Filter out disambiguation categories. '''
+ """ Filter out disambiguation categories. """
result = []
for cat in categories:
if (not pywikibot.Page(pywikibot.Site(u'commons', u'commons'),
@@ -306,8 +315,9 @@
result.append(cat)
return result
+
def followRedirects(categories):
- ''' If a category is a redirect, replace the category with the target. '''
+ """ If a category is a redirect, replace the category with the target. """
result = []
for cat in categories:
categoryPage = pywikibot.Page(pywikibot.getSite(u'commons', u'commons'),
@@ -320,19 +330,20 @@
result.append(cat)
return result
+
def filterCountries(categories):
- ''' Try to filter out ...by country categories.
+ """ Try to filter out ...by country categories.
First make a list of any ...by country categories and try to find some
    countries. If a by country category has a subcategory containing one of the
countries found, add it. The ...by country categories remain in the set and
should be filtered out by filterParents.
- '''
+ """
result = categories
listByCountry = []
listCountries = []
for cat in categories:
- if (cat.endswith(u'by country')):
+ if cat.endswith(u'by country'):
listByCountry.append(cat)
#If cat contains 'by country' add it to the list
@@ -341,27 +352,26 @@
for country in countries:
if country in cat:
listCountries.append(country)
- if(len(listByCountry) > 0):
+ if len(listByCountry) > 0:
for bc in listByCountry:
category = pywikibot.Category(
pywikibot.Site(u'commons', u'commons'), u'Category:' + bc)
for subcategory in category.subcategories():
for country in listCountries:
- if (subcategory.title(withNamespace=False).endswith(country)):
+ if subcategory.title(withNamespace=False).endswith(country):
result.append(subcategory.title(withNamespace=False))
return list(set(result))
-def filterParents(categories):
- ''' Remove all parent categories from the set to prevent overcategorization.
- '''
+def filterParents(categories):
+ """ Remove all parent categories from the set to prevent overcategorization. """
result = []
toFilter = u''
for cat in categories:
cat = cat.replace('_', ' ')
toFilter = toFilter + "[[Category:" + cat + "]]\n"
- parameters = urllib.urlencode({'source' : toFilter.encode('utf-8'),
- 'bot' : '1'})
+ parameters = urllib.urlencode({'source': toFilter.encode('utf-8'),
+ 'bot': '1'})
filterCategoriesRe = re.compile('\[\[Category:([^\]]*)\]\]')
try:
filterCategoriesPage = urllib.urlopen(
@@ -377,17 +387,18 @@
return categories
return result
+
def saveImagePage(imagepage, newcats, usage, galleries, onlyFilter):
- ''' Remove the old categories and add the new categories to the image. '''
+ """ Remove the old categories and add the new categories to the image. """
newtext = pywikibot.removeCategoryLinks(imagepage.get(), imagepage.site())
- if not(onlyFilter):
+ if not onlyFilter:
newtext = removeTemplates(newtext)
newtext = newtext + getCheckCategoriesTemplate(usage, galleries,
len(newcats))
- newtext = newtext + u'\n'
+ newtext += u'\n'
for category in newcats:
newtext = newtext + u'[[Category:' + category + u']]\n'
- if(onlyFilter):
+ if onlyFilter:
comment = u'Filtering categories'
else:
comment = u'Image is categorized by a bot using data from [[Commons:Tools#CommonSense|CommonSense]]'
@@ -395,11 +406,11 @@
imagepage.put(newtext, comment)
return
-def removeTemplates(oldtext = u''):
- '''
+
+def removeTemplates(oldtext=u''):
+ """
Remove {{Uncategorized}} and {{Check categories}} templates
- '''
- result = u''
+ """
result = re.sub(
u'\{\{\s*([Uu]ncat(egori[sz]ed( image)?)?|[Nn]ocat|[Nn]eedscategory)[^}]*\}\}', u'', oldtext)
result = re.sub(u'<!-- Remove this line once you have added categories -->',
@@ -407,10 +418,11 @@
result = re.sub(u'\{\{\s*[Cc]heck categories[^}]*\}\}', u'', result)
return result
+
def getCheckCategoriesTemplate(usage, galleries, ncats):
- '''
+ """
Build the check categories template with all parameters
- '''
+ """
result = u'{{Check categories|year={{subst:CURRENTYEAR}}|month={{subst:CURRENTMONTHNAME}}|day={{subst:CURRENTDAY}}\n'
usageCounter = 1
for (lang, project, article) in usage:
@@ -418,19 +430,20 @@
result += u'|wiki%d=%s' % (usageCounter, project)
result += u'|article%d=%s' % (usageCounter, article)
result += u'\n'
- usageCounter = usageCounter + 1
+ usageCounter += 1
galleryCounter = 1
for gallery in galleries:
result += u'|gallery%d=%s' % (galleryCounter, gallery.replace('_', ' ')) + u'\n'
- galleryCounter = galleryCounter + 1
+ galleryCounter += 1
result += u'|ncats=%d\n' % ncats
result += u'}}\n'
return result
+
def main(args):
- '''
+ """
Main loop. Get a generator and options. Work on all images in the generator.
- '''
+ """
generator = None
onlyFilter = False
onlyUncat = False
@@ -446,9 +459,9 @@
elif arg == '-onlyuncat':
onlyUncat = True
elif arg.startswith('-hint:'):
- hint_wiki = arg [len('-hint:'):]
+ hint_wiki = arg[len('-hint:'):]
elif arg.startswith('-onlyhint'):
- search_wikis = arg [len('-onlyhint:'):]
+ search_wikis = arg[len('-onlyhint:'):]
else:
genFactory.handleArg(arg)
--
To view, visit https://gerrit.wikimedia.org/r/86624
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I5a4a7dd85eadf7233fe7e388644dadd5ba1e5ffc
Gerrit-PatchSet: 2
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Legoktm <legoktm.wikipedia(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: jenkins-bot
jenkins-bot has submitted this change and it was merged.
Change subject: Port imagerecat.py from compat
......................................................................
Port imagerecat.py from compat
Change-Id: I9e5f5a1fb1823ec85378d3bf9d7c67592139face
---
A scripts/imagerecat.py
1 file changed, 465 insertions(+), 0 deletions(-)
Approvals:
Merlijn van Deen: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/imagerecat.py b/scripts/imagerecat.py
new file mode 100644
index 0000000..7c91106
--- /dev/null
+++ b/scripts/imagerecat.py
@@ -0,0 +1,465 @@
+# -*- coding: utf-8 -*-
+"""
+Program to (re)categorize images at commons.
+
+The program uses commonshelper for category suggestions.
+It takes the suggestions and the current categories. Put the categories through
+some filters and adds the result.
+
+The following command line parameters are supported:
+
+-onlyfilter Don't use Commonsense to get categories, just filter the current
+ categories
+
+-onlyuncat Only work on uncategorized images. Will prevent the bot from
+ working on an image multiple times.
+
+-hint Give Commonsense a hint.
+ For example -hint:li.wikipedia.org
+
+-onlyhint Give Commonsense a hint. And only work on this hint.
+ Syntax is the same as -hint. Some special hints are possible:
+ _20 : Work on the top 20 wikipedia's
+ _80 : Work on the top 80 wikipedia's
+ wps : Work on all wikipedia's
+
+"""
+__version__ = '$Id$'
+#
+# (C) Multichill 2008-2011
+# (C) Pywikipedia bot team, 2008-2013
+#
+# Distributed under the terms of the MIT license.
+#
+#
+import os, sys, re, codecs
+import urllib, httplib, urllib2
+import time
+import socket
+import StringIO
+import pywikibot
+from pywikibot import config
+from pywikibot import pagegenerators
+import xml.etree.ElementTree
+
+category_blacklist = []
+countries = []
+
+search_wikis=u'_20'
+hint_wiki=u''
+
+def initLists():
+ '''
+ Get the list of countries & the blacklist from Commons.
+ '''
+ global category_blacklist
+ global countries
+
+ blacklistPage = pywikibot.Page(pywikibot.Site(u'commons', u'commons'),
+ u'User:Multichill/Category_blacklist')
+ for cat in blacklistPage.linkedPages():
+ category_blacklist.append(cat.title(withNamespace=False))
+
+ countryPage = pywikibot.Page(pywikibot.Site(u'commons', u'commons'),
+ u'User:Multichill/Countries')
+ for country in countryPage.linkedPages():
+ countries.append(country.title(withNamespace=False))
+ return
+
+def categorizeImages(generator, onlyFilter, onlyUncat):
+ ''' Loop over all images in generator and try to categorize them. Get
+ category suggestions from CommonSense.
+
+ '''
+ for page in generator:
+ if page.exists() and (page.namespace() == 6) and \
+ (not page.isRedirectPage()):
+ imagepage = pywikibot.ImagePage(page.site(), page.title())
+ pywikibot.output(u'Working on ' + imagepage.title())
+
+ if (onlyUncat and not(u'Uncategorized' in imagepage.templates())):
+ pywikibot.output(u'No Uncategorized template found')
+ else:
+ currentCats = getCurrentCats(imagepage)
+ if onlyFilter:
+ commonshelperCats = []
+ usage = []
+ galleries = []
+ else:
+ (commonshelperCats, usage, galleries) = getCommonshelperCats(imagepage)
+ newcats = applyAllFilters(commonshelperCats+currentCats)
+
+ if (len(newcats) > 0 and not(set(currentCats)==set(newcats))):
+ for cat in newcats:
+ pywikibot.output(u' Found new cat: ' + cat);
+ saveImagePage(imagepage, newcats, usage, galleries,
+ onlyFilter)
+
+def getCurrentCats(imagepage):
+ ''' Get the categories currently on the image '''
+ result = []
+ for cat in imagepage.categories():
+ result.append(cat.title(withNamespace=False))
+ return list(set(result))
+
+def getCommonshelperCats(imagepage):
+ ''' Get category suggestions from CommonSense. Parse them and return a list
+ of suggestions.
+
+ '''
+ commonshelperCats = []
+ usage = []
+ galleries = []
+
+ global search_wikis
+ global hint_wiki
+ site = imagepage.site
+ lang = site.language()
+ family = site.family.name
+ if lang==u'commons' and family==u'commons':
+ parameters = urllib.urlencode(
+ {'i' : imagepage.title(withNamespace=False).encode('utf-8'),
+ 'r' : 'on',
+ 'go-clean' : 'Find+Categories',
+ 'p' : search_wikis,
+ 'cl' : hint_wiki})
+ elif family==u'wikipedia':
+ parameters = urllib.urlencode(
+ {'i' : imagepage.title(withNamespace=False).encode('utf-8'),
+ 'r' : 'on',
+ 'go-move' : 'Find+Categories',
+ 'p' : search_wikis,
+ 'cl' : hint_wiki,
+ 'w' : lang})
+ else:
+        #Can't handle other sites atm
+ return ([], [], [])
+
+ commonsenseRe = re.compile('^#COMMONSENSE(.*)#USAGE(\s)+\((?P<usagenum>(\d)+)\)\s(?P<usage>(.*))\s#KEYWORDS(\s)+\((?P<keywords>(\d)+)\)(.*)#CATEGORIES(\s)+\((?P<catnum>(\d)+)\)\s(?P<cats>(.*))\s#GALLERIES(\s)+\((?P<galnum>(\d)+)\)\s(?P<gals>(.*))\s(.*)#EOF$', re.MULTILINE + re.DOTALL)
+
+ gotInfo = False
+ matches = None
+ maxtries = 10
+ tries = 0
+ while(not gotInfo):
+ try:
+ if ( tries < maxtries ):
+ tries = tries + 1
+ commonsHelperPage = urllib.urlopen(
+ "http://toolserver.org/~daniel/WikiSense/CommonSense.php?%s" % parameters)
+ matches = commonsenseRe.search(
+ commonsHelperPage.read().decode('utf-8'))
+ gotInfo = True
+ else:
+ break
+ except IOError:
+ pywikibot.output(u'Got an IOError, let\'s try again')
+ except socket.timeout:
+ pywikibot.output(u'Got a timeout, let\'s try again')
+
+ if (matches and gotInfo):
+ if (matches.group('usagenum') > 0):
+ used = matches.group('usage').splitlines()
+ for use in used:
+ usage= usage + getUsage(use)
+ #pywikibot.output(use)
+ if (matches.group('catnum') > 0):
+ cats = matches.group('cats').splitlines()
+ for cat in cats:
+ commonshelperCats.append(cat.replace('_', ' '))
+ pywikibot.output(u'category : ' + cat)
+ if (matches.group('galnum') > 0):
+ gals = matches.group('gals').splitlines()
+ for gal in gals:
+ galleries.append(gal.replace('_', ' '))
+ pywikibot.output(u'gallery : ' + gal)
+ commonshelperCats = list(set(commonshelperCats))
+ galleries = list(set(galleries))
+ for (lang, project, article) in usage:
+ pywikibot.output(lang + project + article)
+ return (commonshelperCats, usage, galleries)
+
+def getOpenStreetMapCats(latitude, longitude):
+ '''
+    Get a list of location categories based on the OSM nominatim tool
+ '''
+ result = []
+ locationList = getOpenStreetMap(latitude, longitude)
+ for i in range(0, len(locationList)):
+ #print 'Working on ' + locationList[i]
+ if i <= len(locationList)-3:
+ category = getCategoryByName(name=locationList[i], parent=locationList[i+1], grandparent=locationList[i+2])
+ elif i == len(locationList)-2:
+ category = getCategoryByName(name=locationList[i], parent=locationList[i+1])
+ else:
+ category = getCategoryByName(name=locationList[i])
+ if category and not category==u'':
+ result.append(category)
+ #print result
+ return result
+
+
+def getOpenStreetMap(latitude, longitude):
+ '''
+ Get the result from http://nominatim.openstreetmap.org/reverse
+ and put it in a list of tuples to play around with
+ '''
+ result = []
+ gotInfo = False
+ parameters = urllib.urlencode({'lat' : latitude, 'lon' : longitude, 'accept-language' : 'en'})
+ while(not gotInfo):
+ try:
+ page = urllib.urlopen("http://nominatim.openstreetmap.org/reverse?format=xml&%s" % parameters)
+ et = xml.etree.ElementTree.parse(page)
+ gotInfo=True
+ except IOError:
+ pywikibot.output(u'Got an IOError, let\'s try again')
+ time.sleep(30)
+ except socket.timeout:
+ pywikibot.output(u'Got a timeout, let\'s try again')
+ time.sleep(30)
+ validParts = [u'hamlet', u'village', u'city', u'county', u'country']
+ invalidParts = [u'path', u'road', u'suburb', u'state', u'country_code']
+ addressparts = et.find('addressparts')
+ #xml.etree.ElementTree.dump(et)
+
+ for addresspart in addressparts.getchildren():
+ if addresspart.tag in validParts:
+ result.append(addresspart.text)
+ elif addresspart.tag in invalidParts:
+ pywikibot.output(u'Dropping %s, %s' % (addresspart.tag, addresspart.text))
+ else:
+ pywikibot.warning(u'%s, %s is not in addressparts lists' % (addresspart.tag, addresspart.text))
+ #print result
+ return result
+
+def getCategoryByName(name, parent=u'', grandparent=u''):
+
+ if not parent==u'':
+ workname = name.strip() + u',_' + parent.strip()
+ workcat = pywikibot.Category(
+ pywikibot.Site(u'commons', u'commons'), workname)
+ if workcat.exists():
+ return workname
+ if not grandparent==u'':
+ workname = name.strip() + u',_' + grandparent.strip()
+ workcat = pywikibot.Category(
+ pywikibot.Site(u'commons', u'commons'), workname)
+ if workcat.exists():
+ return workname
+ workname = name.strip()
+ workcat = pywikibot.Category(
+ pywikibot.Site(u'commons', u'commons'), workname)
+ if workcat.exists():
+ return workname
+ return u''
+
+
+def getUsage(use):
+ ''' Parse the Commonsense output to get the usage '''
+ result = []
+ lang = ''
+ project = ''
+ article = ''
+ usageRe = re.compile(
+ '^(?P<lang>([\w-]+))\.(?P<project>([\w]+))\.org:(?P<articles>\s(.*))')
+ matches = usageRe.search(use)
+ if matches:
+ if (matches.group('lang')):
+ lang = matches.group('lang')
+ #pywikibot.output(lang)
+ if (matches.group('project')):
+ project = matches.group('project')
+ #pywikibot.output(project)
+ if (matches.group('articles')):
+ articles = matches.group('articles')
+ #pywikibot.output(articles)
+ for article in articles.split():
+ result.append((lang, project, article))
+ return result
+
+def applyAllFilters(categories):
+ ''' Apply all filters on categories. '''
+ result = []
+ result = filterDisambiguation(categories)
+ result = followRedirects(result)
+ result = filterBlacklist(result)
+ result = filterCountries(result)
+ result = filterParents(result)
+ return result
+
+def filterBlacklist(categories):
+ ''' Filter out categories which are on the blacklist. '''
+ result = []
+ for cat in categories:
+ cat = cat.replace('_', ' ')
+ if (cat not in category_blacklist):
+ result.append(cat)
+ return list(set(result))
+
+def filterDisambiguation(categories):
+ ''' Filter out disambiguation categories. '''
+ result = []
+ for cat in categories:
+ if (not pywikibot.Page(pywikibot.Site(u'commons', u'commons'),
+ cat, ns=14).isDisambig()):
+ result.append(cat)
+ return result
+
+def followRedirects(categories):
+ ''' If a category is a redirect, replace the category with the target. '''
+ result = []
+ for cat in categories:
+ categoryPage = pywikibot.Page(pywikibot.getSite(u'commons', u'commons'),
+ cat, ns=14)
+ if categoryPage.isCategoryRedirect():
+ result.append(
+ categoryPage.getCategoryRedirectTarget().title(
+ withNamespace=False))
+ else:
+ result.append(cat)
+ return result
+
+def filterCountries(categories):
+ ''' Try to filter out ...by country categories.
+ First make a list of any ...by country categories and try to find some
+    countries. If a by country category has a subcategory containing one of the
+ countries found, add it. The ...by country categories remain in the set and
+ should be filtered out by filterParents.
+
+ '''
+ result = categories
+ listByCountry = []
+ listCountries = []
+ for cat in categories:
+ if (cat.endswith(u'by country')):
+ listByCountry.append(cat)
+
+ #If cat contains 'by country' add it to the list
+ #If cat contains the name of a country add it to the list
+ else:
+ for country in countries:
+ if country in cat:
+ listCountries.append(country)
+ if(len(listByCountry) > 0):
+ for bc in listByCountry:
+ category = pywikibot.Category(
+ pywikibot.Site(u'commons', u'commons'), u'Category:' + bc)
+ for subcategory in category.subcategories():
+ for country in listCountries:
+ if (subcategory.title(withNamespace=False).endswith(country)):
+ result.append(subcategory.title(withNamespace=False))
+ return list(set(result))
+
+def filterParents(categories):
+ ''' Remove all parent categories from the set to prevent overcategorization.
+
+ '''
+ result = []
+ toFilter = u''
+ for cat in categories:
+ cat = cat.replace('_', ' ')
+ toFilter = toFilter + "[[Category:" + cat + "]]\n"
+ parameters = urllib.urlencode({'source' : toFilter.encode('utf-8'),
+ 'bot' : '1'})
+ filterCategoriesRe = re.compile('\[\[Category:([^\]]*)\]\]')
+ try:
+ filterCategoriesPage = urllib.urlopen(
+ "http://toolserver.org/~multichill/filtercats.php?%s" % parameters)
+ result = filterCategoriesRe.findall(
+ filterCategoriesPage.read().decode('utf-8'))
+ except IOError:
+ #Something is wrong, forget about this filter and just return the input
+ return categories
+
+ if not result:
+        #Is empty, don't want to remove all categories
+ return categories
+ return result
+
+def saveImagePage(imagepage, newcats, usage, galleries, onlyFilter):
+ ''' Remove the old categories and add the new categories to the image. '''
+ newtext = pywikibot.removeCategoryLinks(imagepage.get(), imagepage.site())
+ if not(onlyFilter):
+ newtext = removeTemplates(newtext)
+ newtext = newtext + getCheckCategoriesTemplate(usage, galleries,
+ len(newcats))
+ newtext = newtext + u'\n'
+ for category in newcats:
+ newtext = newtext + u'[[Category:' + category + u']]\n'
+ if(onlyFilter):
+ comment = u'Filtering categories'
+ else:
+ comment = u'Image is categorized by a bot using data from [[Commons:Tools#CommonSense|CommonSense]]'
+ pywikibot.showDiff(imagepage.get(), newtext)
+ imagepage.put(newtext, comment)
+ return
+
+def removeTemplates(oldtext = u''):
+ '''
+ Remove {{Uncategorized}} and {{Check categories}} templates
+ '''
+ result = u''
+ result = re.sub(
+ u'\{\{\s*([Uu]ncat(egori[sz]ed( image)?)?|[Nn]ocat|[Nn]eedscategory)[^}]*\}\}', u'', oldtext)
+ result = re.sub(u'<!-- Remove this line once you have added categories -->',
+ u'', result)
+ result = re.sub(u'\{\{\s*[Cc]heck categories[^}]*\}\}', u'', result)
+ return result
+
+def getCheckCategoriesTemplate(usage, galleries, ncats):
+ '''
+ Build the check categories template with all parameters
+ '''
+ result = u'{{Check categories|year={{subst:CURRENTYEAR}}|month={{subst:CURRENTMONTHNAME}}|day={{subst:CURRENTDAY}}\n'
+ usageCounter = 1
+ for (lang, project, article) in usage:
+ result += u'|lang%d=%s' % (usageCounter, lang)
+ result += u'|wiki%d=%s' % (usageCounter, project)
+ result += u'|article%d=%s' % (usageCounter, article)
+ result += u'\n'
+ usageCounter = usageCounter + 1
+ galleryCounter = 1
+ for gallery in galleries:
+ result += u'|gallery%d=%s' % (galleryCounter, gallery.replace('_', ' ')) + u'\n'
+ galleryCounter = galleryCounter + 1
+ result += u'|ncats=%d\n' % ncats
+ result += u'}}\n'
+ return result
+
+def main(args):
+ '''
+ Main loop. Get a generator and options. Work on all images in the generator.
+ '''
+ generator = None
+ onlyFilter = False
+ onlyUncat = False
+ genFactory = pagegenerators.GeneratorFactory()
+
+ global search_wikis
+ global hint_wiki
+
+ site = pywikibot.getSite(u'commons', u'commons')
+ for arg in pywikibot.handleArgs():
+ if arg == '-onlyfilter':
+ onlyFilter = True
+ elif arg == '-onlyuncat':
+ onlyUncat = True
+ elif arg.startswith('-hint:'):
+ hint_wiki = arg [len('-hint:'):]
+ elif arg.startswith('-onlyhint'):
+ search_wikis = arg [len('-onlyhint:'):]
+ else:
+ genFactory.handleArg(arg)
+
+ generator = genFactory.getCombinedGenerator()
+ if not generator:
+ generator = pagegenerators.CategorizedPageGenerator(
+ pywikibot.Category(site, u'Category:Media needing categories'),
+ recurse=True)
+ initLists()
+ categorizeImages(generator, onlyFilter, onlyUncat)
+ pywikibot.output(u'All done')
+
+if __name__ == "__main__":
+ main(sys.argv[1:])
--
To view, visit https://gerrit.wikimedia.org/r/86621
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I9e5f5a1fb1823ec85378d3bf9d7c67592139face
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Legoktm <legoktm.wikipedia(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: jenkins-bot
jenkins-bot has submitted this change and it was merged.
Change subject: Don't use set literals in tests
......................................................................
Don't use set literals in tests
Set literals are not available in Python 2.6.
This doesn't change the fact that the tests don't work on 2.6 but at
least it stops the message about illegal syntax popping up every time
one installs the module.
Change-Id: I9b8674af99bc3acd418b2a378679e7254e1dee79
---
M tests/page_tests.py
1 file changed, 2 insertions(+), 2 deletions(-)
Approvals:
Merlijn van Deen: Looks good to me, approved
jenkins-bot: Verified
diff --git a/tests/page_tests.py b/tests/page_tests.py
index 48f40fe..5a60678 100644
--- a/tests/page_tests.py
+++ b/tests/page_tests.py
@@ -253,9 +253,9 @@
if not site.hasExtension('Disambiguator', False):
raise unittest.SkipTest('Disambiguator extension not loaded on test site')
pg = pywikibot.Page(site, 'Random')
- pg._pageprops = {'disambiguation', ''}
+ pg._pageprops = set(['disambiguation', ''])
self.assertTrue(pg.isDisambig())
- pg._pageprops = {}
+ pg._pageprops = set()
self.assertFalse(pg.isDisambig())
def testReferences(self):
--
To view, visit https://gerrit.wikimedia.org/r/86433
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I9b8674af99bc3acd418b2a378679e7254e1dee79
Gerrit-PatchSet: 2
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mineo <themineo(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Legoktm <legoktm.wikipedia(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: jenkins-bot