Revision: 5826 Author: multichill Date: 2008-08-21 14:26:24 +0000 (Thu, 21 Aug 2008)
Log Message: ----------- Country filter and option to only filter.
Modified Paths: -------------- trunk/pywikipedia/imagerecat.py
Modified: trunk/pywikipedia/imagerecat.py =================================================================== --- trunk/pywikipedia/imagerecat.py 2008-08-21 13:23:10 UTC (rev 5825) +++ trunk/pywikipedia/imagerecat.py 2008-08-21 14:26:24 UTC (rev 5826) @@ -23,23 +23,41 @@ category_blacklist = [u'Hidden categories', u'Stub pictures']
-def categorizeImages(generator): +countries = [] + +def getCountries(): + ''' + Get the list of countries from Commons. + ''' + result = [] + countryPage = wikipedia.Page(wikipedia.getSite(), u'User:Multichill/Countries') + for country in countryPage.linkedPages(): + result.append(country.titleWithoutNamespace()) + return result + +def categorizeImages(generator, onlyfilter): + ''' + Loop over all images in generator and try to categorize them. Get category suggestions from CommonSense. + ''' for page in generator: if page.exists() and (page.namespace() == 6) and (not page.isRedirectPage()): imagepage = wikipedia.ImagePage(page.site(), page.title()) #imagepage.get() wikipedia.output(u'Working on ' + imagepage.title()); currentCats = getCurrentCats(imagepage) - commonshelperCats = getCommonshelperCats(imagepage) + if(onlyfilter): + commonshelperCats = [] + else: + commonshelperCats = getCommonshelperCats(imagepage) newcats = filterBlacklist(commonshelperCats+currentCats) newcats = filterDisambiguation(newcats) newcats = followRedirects(newcats) - #newcats = filterCountries(newcats) + newcats = filterCountries(newcats) newcats = filterParents(newcats) - if len(newcats) > 0: + if (len(newcats) > 0 and not(set(currentCats)==set(newcats))): for cat in newcats: wikipedia.output(u' Found new cat: ' + cat); - saveImagePage(imagepage, newcats) + saveImagePage(imagepage, newcats, onlyfilter)
def getCurrentCats(imagepage): @@ -54,7 +72,7 @@
def getCommonshelperCats(imagepage): ''' - Get category suggestions from commonshelper. Parse them and return a list of suggestions. + Get category suggestions from CommonSense. Parse them and return a list of suggestions. ''' result = [] parameters = urllib.urlencode({'i' : imagepage.titleWithoutNamespace().encode('utf-8'), 'r' : 'on', 'go-clean' : 'Find+Categories', 'cl' : 'li'}) @@ -82,6 +100,9 @@
def filterBlacklist(categories): + ''' + Filter out categories which are on the blacklist. + ''' result = [] for cat in categories: if (cat not in category_blacklist): @@ -90,6 +111,9 @@
def filterDisambiguation(categories): + ''' + Filter out disambiguation categories. + ''' result = [] for cat in categories: if(not wikipedia.Page(wikipedia.getSite(), u'Category:' + cat).isDisambig()): @@ -97,6 +121,9 @@ return result
def followRedirects(categories): + ''' + If a category is a redirect, replace the category with the target. + ''' result = [] for cat in categories: categoryPage = wikipedia.Page(wikipedia.getSite(), u'Category:' + cat) @@ -110,13 +137,40 @@
def filterCountries(categories): - result = [] - return result + ''' + Try to filter out ...by country categories. + First make a list of any ...by country categories and try to find some countries. + If a by country category has a subcategory containing one of the countries found, add it. + The ...by country categories remain in the set and should be filtered out by filterParents. + ''' + result = categories + listByCountry = [] + listCountries = [] + for cat in categories: + if (cat.endswith(u'by country')): + listByCountry.append(cat) + + #If cat contains 'by country' add it to the list + #If cat contains the name of a country add it to the list + else: + for country in countries: + if not(cat.find(country)==-1): + listCountries.append(country) + + if(len(listByCountry) > 0): + for bc in listByCountry: + category = catlib.Category(wikipedia.getSite(), u'Category:' + bc) + for subcategory in category.subcategories(): + for country in listCountries: + if (subcategory.titleWithoutNamespace().endswith(country)): + result.append(subcategory.titleWithoutNamespace()) + + return list(set(result))
def filterParents(categories): ''' - Remove the current categories from the suggestions and remove blacklisted cats. + Remove all parent categories from the set to prevent overcategorization. ''' result = [] toFilter = u'' @@ -135,18 +189,29 @@ return result
-def saveImagePage(imagepage, newcats): +def saveImagePage(imagepage, newcats, onlyfilter): + ''' + Remove the old categories and add the new categories to the image. + ''' newtext = wikipedia.removeCategoryLinks(imagepage.get(), imagepage.site()) newtext = removeTemplates(newtext) + u'{{subst:chc}}\n' for category in newcats: newtext = newtext + u'[[Category:' + category + u']]\n' + + if(onlyfilter): + comment = u'Filtering categories' + else: + comment = u'Image is categorized by a bot using data from [[Commons:Tools#CommonSense|CommonSense]]'
wikipedia.showDiff(imagepage.get(), newtext) - imagepage.put(newtext, u'Image is categorized by a bot using data from [[Commons:Tools#CommonSense|CommonSense]]') + imagepage.put(newtext, comment) return
def removeTemplates(oldtext = u''): + ''' + Remove {{Uncategorized}} and {{Check categories}} templates + ''' result = u'' result = re.sub(u'{{\s*([Uu]ncat(egori[sz]ed( image)?)?|[Nn]ocat|[Nn]eedscategory)[^}]*}}', u'', oldtext) result = re.sub(u'<!-- Remove this line once you have added categories -->', u'', result) @@ -158,8 +223,8 @@ ''' Main loop. Get a generator. Set up the 3 threads and the 2 queue's and fire everything up. ''' - generator = None; - + generator = None + onlyfilter = False genFactory = pagegenerators.GeneratorFactory()
site = wikipedia.getSite(u'commons', u'commons') @@ -170,12 +235,15 @@ generator = [wikipedia.Page(site, wikipedia.input(u'What page do you want to use?'))] else: generator = [wikipedia.Page(site, arg[6:])] + elif arg == '-onlyfilter': + onlyfilter = True else: generator = genFactory.handleArg(arg) if not generator: generator = pagegenerators.CategorizedPageGenerator(catlib.Category(site, u'Category:Media needing categories'), recurse=True) - - categorizeImages(generator) + global countries + countries = getCountries() + categorizeImages(generator, onlyfilter)
wikipedia.output(u'All done')