http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11195
Revision: 11195 Author: russblau Date: 2013-03-08 20:06:40 +0000 (Fri, 08 Mar 2013) Log Message: ----------- Fix Bug #3606570: Namespace alias error with preloading and page.exists(); this turned out to be a pervasive problem caused by an unannounced (IIRC) breaking change in the API. Formerly, the API always returned a page title that contained the site's canonical namespace prefix for pages outside namespace 0; now, it may return a title using a namespace alias instead, as in the case of the gender-specific namespaces identified in the bug report. Because it is conceivable that other cases of non-canonical namespaces may exist either now or in the future, this is a more general solution that checks all possible namespace aliases before rejecting a page title.
Modified Paths: -------------- branches/rewrite/pywikibot/site.py
Modified: branches/rewrite/pywikibot/site.py =================================================================== --- branches/rewrite/pywikibot/site.py 2013-03-08 08:12:45 UTC (rev 11194) +++ branches/rewrite/pywikibot/site.py 2013-03-08 20:06:40 UTC (rev 11195) @@ -392,6 +392,48 @@ % locals(), re.IGNORECASE | re.UNICODE | re.DOTALL)
+ def sametitle(self, title1, title2): + """Return True iff title1 and title2 identify the same wiki page.""" + # title1 and title2 may be unequal but still identify the same page, + # if they use different aliases for the same namespace + def valid_namespace(text, number): + """Return True iff text is a valid alias for namespace with given number.""" + for alias in self.namespace(number, all=True): + if text.lower() == alias.lower(): + return True + return False + if title1 == title2: + return True + # determine whether titles contain namespace prefixes + if ":" in title1: + ns1, name1 = title1.split(":", 1) + else: + ns1, name1 = 0, title1 + if ":" in title2: + ns2, name2 = title2.split(":", 1) + else: + ns2, name2 = 0, title2 + for space in self.namespaces(): # iterate over all valid namespaces + if type(ns1) is not int and valid_namespace(ns1, space): + ns1 = space + if type(ns2) is not int and valid_namespace(ns2, space): + ns2 = space + if type(ns1) is not int: + # no valid namespace prefix found, so the string followed by ":" + # must be part of the title + name1 = ns1 + ":" + name1 + ns1 = 0 + if type(ns2) is not int: + name2 = ns2 + ":" + name2 + ns2 = 0 + if ns1 != ns2: + # pages in different namespaces + return False + if self.case() == "first-letter": + name1 = name1[:1].upper() + name1[1:] + name2 = name2[:1].upper() + name2[1:] + return name1 == name2 + # namespace shortcuts for backwards-compatibility
def special_namespace(self): @@ -1132,19 +1174,10 @@ titles=title.encode(self.encoding()), inprop="protection") for pageitem in query: - if pageitem['title'] != title: - if pageitem['title'] in query.normalized \ - and query.normalized[pageitem['title']] == title: - # page title was normalized by api - # this should never happen because the Link() constructor - # normalizes the title - pywikibot.log( - u"loadpageinfo: Page title '%s' was normalized to '%s'" - % (title, pageitem['title'])) - else: - pywikibot.warning( - u"loadpageinfo: Query on %s returned data on '%s'" - % (page, pageitem['title'])) + if not self.sametitle(pageitem['title'], title): + pywikibot.warning( + u"loadpageinfo: Query on %s returned data on '%s'" + % (page, pageitem['title'])) continue api.update_page(page, pageitem)
@@ -1165,7 +1198,7 @@ "metadata", "archivename"], **args) for pageitem in query: - if pageitem['title'] != title: + if not self.sametitle(pageitem['title'], title): raise Error( u"loadimageinfo: Query on %s returned data on '%s'" % (page, pageitem['title'])) @@ -1236,7 +1269,7 @@ raise pywikibot.CircularRedirect(redirmap[title]) pagedata = result['query']['pages'].values()[0] # there should be only one value in 'pages', and it is the target - if pagedata['title'] == target_title: + if self.sametitle(pagedata['title'], target_title): target = pywikibot.Page(self, pagedata['title'], pagedata['ns']) api.update_page(target, pagedata) page._redirtarget = target @@ -1288,10 +1321,21 @@ pywikibot.debug(u"Preloading %s" % pagedata, _logger) try: if pagedata['title'] not in cache: - pywikibot.warning( - u"preloadpages: Query returned unexpected title '%s'" - % pagedata['title']) - continue +# API always returns a "normalized" title which is +# usually the same as the canonical form returned by +# page.title(), but sometimes not (e.g., +# gender-specific localizations of "User" namespace). +# This checks to see if there is a normalized title in +# the response that corresponds to the canonical form +# used in the query. 
+ if pagedata['title'] in rvgen.normalized \ + and rvgen.normalized[pagedata['title']] in cache: + cache[pagedata['title']] = cache[rvgen.normalized[pagedata['title']]] + else: + pywikibot.warning( + u"preloadpages: Query returned unexpected title '%s'" + % pagedata['title']) + continue except KeyError: pywikibot.debug(u"No 'title' in %s" % pagedata, _logger) pywikibot.debug(u"pageids=%s" % pageids, _logger) @@ -1314,7 +1358,7 @@ intoken=tokentype, site=self) for item in query: - if item['title'] != page.title(withSection=False): + if not self.sametitle(item['title'], page.title(withSection=False)): raise Error( u"token: Query on page %s returned data on page [[%s]]" % (page.title(withSection=False, asLink=True), @@ -1705,21 +1749,11 @@ rvgen.continuekey = "revisions" for pagedata in rvgen: if page is not None: - if pagedata['title'] != page.title(withSection=False): - ok = False - namespace = page.namespace() - # gender settings ? - if namespace in [2, 3]: - ns, title = pagedata['title'].split(':', 1) - if ns in page.site.namespace(namespace, all=True) and \ - title == page.title(withSection=False, - withNamespace=False): - - ok = True - if not ok: - raise Error( - u"loadrevisions: Query on %s returned data on '%s'" - % (page, pagedata['title'])) + if not self.sametitle(pagedata['title'], + page.title(withSection=False)): + raise Error( + u"loadrevisions: Query on %s returned data on '%s'" + % (page, pagedata['title'])) if "missing" in pagedata: raise NoPage(page) else: @@ -1739,7 +1773,7 @@ titles=lltitle.encode(self.encoding()), step=step, total=total) for pageitem in llquery: - if pageitem['title'] != lltitle: + if not self.sametitle(pageitem['title'], lltitle): raise Error( u"getlanglinks: Query on %s returned data on '%s'" % (page, pageitem['title'])) @@ -1757,7 +1791,7 @@ titles=eltitle.encode(self.encoding()), step=step, total=total) for pageitem in elquery: - if pageitem['title'] != eltitle: + if not self.sametitle(pageitem['title'], eltitle): raise 
RuntimeError( "getlanglinks: Query on %s returned data on '%s'" % (page, pageitem['title'])) @@ -1773,7 +1807,7 @@ type_arg="categoryinfo", titles=cititle.encode(self.encoding())) for pageitem in ciquery: - if pageitem['title'] != cititle: + if not self.sametitle(pageitem['title'], cititle): raise Error( u"categoryinfo: Query on %s returned data on '%s'" % (category, pageitem['title'])) @@ -2591,7 +2625,7 @@ req['captchaword'] = input(captcha["question"]) continue elif "url" in captcha: - webbrowser.open(url) + webbrowser.open(captcha["url"]) req['captchaword'] = cap_answerwikipedia.input( "Please view CAPTCHA in your browser, then type answer here:") continue @@ -3352,3 +3386,5 @@ f = open(fn) self._cookies[index] = '; '.join([x.strip() for x in f.readlines()]) f.close() + +