Revision: 6259 Author: russblau Date: 2009-01-14 21:01:07 +0000 (Wed, 14 Jan 2009)
Log Message: ----------- Implement category redirect detection; category pages containing a listed redirect template will be treated as redirect pages (e.g., .isRedirectPage() will return True)
Modified Paths: -------------- trunk/pywikipedia/families/commons_family.py trunk/pywikipedia/families/wikipedia_family.py trunk/pywikipedia/family.py trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/families/commons_family.py =================================================================== --- trunk/pywikipedia/families/commons_family.py 2009-01-14 18:22:16 UTC (rev 6258) +++ trunk/pywikipedia/families/commons_family.py 2009-01-14 21:01:07 UTC (rev 6259) @@ -28,10 +28,16 @@ }
self.interwiki_forward = 'wikipedia' + + self.category_redirect_templates = { + 'commons': ('Category redirect',), + } + self.disambiguationTemplates = { 'commons': [u'Disambig', u'Disambiguation', u'Razločitev', u'Begriffsklärung'] } + self.disambcatname = { 'commons': u'Disambiguation' }
Modified: trunk/pywikipedia/families/wikipedia_family.py =================================================================== --- trunk/pywikipedia/families/wikipedia_family.py 2009-01-14 18:22:16 UTC (rev 6258) +++ trunk/pywikipedia/families/wikipedia_family.py 2009-01-14 21:01:07 UTC (rev 6259) @@ -540,6 +540,48 @@ 'als': u'Nochricht Diskussion', }
+ self.category_redirect_templates = { + '_default': (), + 'ar': (u"تحويل تصنيف",), + 'arz': (u'تحويل تصنيف',), + 'cs': (u'Zastaralá kategorie',), + 'da': (u'Kategoriomdirigering',), + 'de': (u'Kategorieweiterleitung',), + 'en': (u"Category redirect", + u"Category redirect3", + ), + 'es': (u'Categoría redirigida',), + 'eu': (u'Kategoria redirect',), + 'fa': (u'رده بهتر', + u'انتقال رده', + u'فیلمهای امریکایی'), + 'fr': (u'Redirection de catégorie',), + 'hi': (u'श्रेणीअनुप्रेषित',), + 'id': (u'Alih kategori',), + # 'it' has removed its template + # 'ja' is discussing to remove this template + 'ja': (u"Category redirect",), + 'ko': (u'분류 넘겨주기',), + 'mk': (u'Премести категорија',), + 'ms': (u'Pengalihan kategori',), + 'mt': (u'Redirect kategorija',), + # 'nl' has removed its template + 'no': (u"Kategoriomdirigering",), + 'pl': (u'Przekierowanie kategorii',), + 'pt': (u'Redirecionamento de categoria',), + 'ro': (u'Redirect categorie',), + 'ru': (u'Переименованная категория',), + 'simple': (u"Category redirect",), + 'sq': (u'Kategori e zhvendosur',), + 'tl': (u'Category redirect',), + 'tr': (u'Kategori yönlendirme',), + 'uk': (u'Categoryredirect',), + 'vi': (u'Đổi hướng thể loại',), + 'yi': (u'קאטעגאריע אריבערפירן',), + 'zh': (u'分类重定向',), + 'zh-yue': (u'分類彈去',), + } + self.disambiguationTemplates = { # set value to None, instead of a list, to retrieve names from # the live wiki ([[MediaWiki:Disambiguationspage]]
Modified: trunk/pywikipedia/family.py =================================================================== --- trunk/pywikipedia/family.py 2009-01-14 18:22:16 UTC (rev 6258) +++ trunk/pywikipedia/family.py 2009-01-14 21:01:07 UTC (rev 6259) @@ -2851,6 +2851,12 @@ 'zzz wiki': 'zzz wiki', }
+ # A list of category redirect template names in different languages + # Note: It is *not* necessary to list template redirects here + self.category_redirect_templates = { + '_default': [] + } + # A list of disambiguation template names in different languages self.disambiguationTemplates = { '_default': [] @@ -3123,6 +3129,16 @@ # give up return None
+ def category_redirects(self, code, fallback="_default"): + if code in self.category_redirect_templates: + return self.category_redirect_templates[code] + elif fallback: + return self.category_redirect_templates[fallback] + else: + raise KeyError( +"ERROR: title for category redirect template in language '%s' unknown" + % code) + def disambig(self, code, fallback = '_default'): if self.disambiguationTemplates.has_key(code): return self.disambiguationTemplates[code]
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2009-01-14 18:22:16 UTC (rev 6258) +++ trunk/pywikipedia/wikipedia.py 2009-01-14 21:01:07 UTC (rev 6259) @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- """ Library to get and put pages on a MediaWiki.
@@ -834,6 +834,10 @@ self._redirarg = redirtarget else: raise IsRedirectPage(redirtarget) + elif self.is_category_redirect(pagetext): # sets _redirarg + if not get_redirect: + self._getexception = IsRedirectPage + raise IsRedirectPage(self._redirarg) if self.section(): # TODO: What the hell is this? Docu please. m = re.search(".3D_*(.27.27+)?(.5B.5B)?_*%s_*(.5B.5B)?(.27.27+)?_*.3D" % re.escape(self.section()), sectionencode(text,self.site().encoding())) @@ -977,6 +981,40 @@ return False return False
+ def is_category_redirect(self, text=None): + """Return True if this is a category redirect. + + Category redirects are identified by the presence of any of the + templates found in self.site().category_redirects(), including + redirects to any of those templates, in the page text. + + """ + if not self.isCategory(): + return False + if not hasattr(self, "_catredirect"): + redir_list = [Page(self.site(), name, defaultNamespace=10) + for name in self.site().category_redirects()] + try: + templates_and_params = self.templatesWithParams( + thistxt=text, + get_redirect=True) + except Error: # couldn't retrieve templates + self._catredirect = False + else: + for item in templates_and_params: + tempname = item[0] + template = Page(self.site(), tempname, defaultNamespace=10) + while template.isRedirectPage(): + template = template.getRedirectTarget() + if template in redir_list: + self._catredirect = True + self._redirarg = Page(self.site(), item[1][0], + defaultNamespace=14).title() + # treat first template arg as name of target category + else: + self._catredirect = False + return self._catredirect + def isEmpty(self): """Return True if the page text has less than 4 characters.
@@ -2962,6 +3000,8 @@ page2._revisionId = revisionId page2._editTime = timestamp section = page2.section() + # Store the content + page2._contents = text m = self.site.redirectRegex().match(text) if m: ## output(u"%s is a redirect" % page2.aslink()) @@ -2970,26 +3010,36 @@ redirectto = redirectto+"#"+section page2._getexception = IsRedirectPage page2._redirarg = redirectto + elif page2.is_category_redirect(): + page2._getexception = IsRedirectPage + # This is used for checking deletion conflict. # Use the data loading time. - page2._startTime = time.strftime('%Y%m%d%H%M%S', time.gmtime()) + page2._startTime = time.strftime('%Y%m%d%H%M%S', + time.gmtime()) if section: - m = re.search(".3D_*(.27.27+)?(.5B.5B)?_*%s_*(.5B.5B)?(.27.27+)?_*.3D" % re.escape(section), sectionencode(text,page2.site().encoding())) + # WHAT IS THIS? + m = re.search( + ".3D_*(.27.27+)?(.5B.5B)?_*%s_*(.5B.5B)?(.27.27+)?_*.3D" + % re.escape(section), + sectionencode(text, page2.site().encoding())) if not m: try: page2._getexception - output(u"WARNING: Section not found: %s" % page2.aslink(forceInterwiki = True)) + output(u"WARNING: Section not found: %s" + % page2.aslink(forceInterwiki = True)) except AttributeError: # There is no exception yet page2._getexception = SectionError - # Store the content - page2._contents = text successful = True # Note that there is no break here. The reason is that there # might be duplicates in the pages list. if not successful: - output(u"BUG>> title %s (%s) not found in list" % (title, page.aslink(forceInterwiki=True))) - output(u'Expected one of: %s' % u','.join([page2.aslink(forceInterwiki=True) for page2 in self.pages])) + output(u"BUG>> title %s (%s) not found in list" + % (title, page.aslink(forceInterwiki=True))) + output(u'Expected one of: %s' + % u','.join([page2.aslink(forceInterwiki=True) + for page2 in self.pages])) raise PageNotFound
def headerDone(self, header): @@ -6084,6 +6134,9 @@ """Return list of language codes that can be used in interwiki links.""" return self._validlanguages
+ def category_redirects(self): + return self.family.category_redirects(self.lang, fallback="_default") + def disambcategory(self): """Return Category in which disambig pages are listed.""" import catlib @@ -6807,6 +6860,7 @@ raise return data
+ class MyURLopener(urllib.FancyURLopener): version="PythonWikipediaBot/1.0"
@@ -6817,7 +6871,6 @@ return urllib.FancyURLopener.http_error_default(self, url, fp, errcode, errmsg, headers)
- # Special opener in case we are using a site with authentication if config.authenticate: import urllib2, cookielib