Revision: 8759 Author: purodha Date: 2010-12-05 21:09:39 +0000 (Sun, 05 Dec 2010) Log Message: ----------- Make getLanguageLinks() deal correctly with forwarded interlanguage links, such as on Meta, Commons, Incubator, Species, and the like.
Modified Paths: -------------- trunk/pywikipedia/pywikibot/textlib.py
Modified: trunk/pywikipedia/pywikibot/textlib.py =================================================================== --- trunk/pywikipedia/pywikibot/textlib.py 2010-12-05 18:46:44 UTC (rev 8758) +++ trunk/pywikipedia/pywikibot/textlib.py 2010-12-05 21:09:39 UTC (rev 8759) @@ -268,12 +268,24 @@ #------------------------------------------------- # Functions dealing with interwiki language links #------------------------------------------------- -# Note - MediaWiki supports two kinds of interwiki links; interlanguage and -# interproject. These functions only deal with links to a -# corresponding page in another language on the same project (e.g., -# Wikipedia, Wiktionary, etc.) in another language. They do not find -# or change links to a different project, or any that are formatted -# as in-line interwiki links (e.g., "[[:es:Articulo]]". (CONFIRM) +# Note - MediaWiki supports several kinds of interwiki links; two kinds are +# interlanguage links. We deal here with those kinds only. +# A family has by definition only one kind of interlanguage links: +# 1 - interlanguage links inside the own family. +# They go to a corresponding page in another language in the same +# family, such as from 'en.wikipedia' to 'pt.wikipedia', or from +# 'es.wiktionary' to 'arz.wiktionary'. +# Families with this kind have several language-specific sites. +# They have their interwiki_forward attribute set to None +# 2 - language links forwarding to another family. +# They go to a corresponding page in another family, such as from +# 'commons' to 'zh.wikipedia, or from 'incubator' to 'en.wikipedia'. +# Families having those have one member only, and do not have +# language-specific sites. The name of the target family of their +# interlanguage links is kept in their interwiki_forward attribute. +# These functions only deal with links of these two kinds only. They +# do not find or change links of other kinds, nor any that are formatted +# as in-line interwiki links (e.g., "[[:es:Articulo]]".
def getLanguageLinks(text, insite=None, pageLink="[[]]", template_subpage=False): """ @@ -286,6 +298,10 @@ """ if insite is None: insite = pywikibot.getSite() + fam = insite.family + # when interwiki links forward to another family, retrieve pages & other infos there + if fam.interwiki_forward: + fam = pywikibot.Family(fam.interwiki_forward) result = {} # Ignore interwiki links within nowiki tags, includeonly tags, pre tags, # and HTML comments @@ -298,19 +314,21 @@ # interwiki link. # NOTE: language codes are case-insensitive and only consist of basic latin # letters and hyphens. + #TODO: currently, we do not have any, but BCP 47 allows digits, and underscores. + #TODO: There is no semantic difference between hyphens and underscores -> fold them. interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]') for lang, pagetitle in interwikiR.findall(text): lang = lang.lower() # Check if it really is in fact an interwiki link to a known # language, or if it's e.g. a category tag or an internal link - if lang in insite.family.obsolete: - lang = insite.family.obsolete[lang] - if lang in insite.validLanguageLinks(): + if lang in fam.obsolete: + lang = fam.obsolete[lang] + if lang in fam.langs.keys(): if '|' in pagetitle: # ignore text after the pipe pagetitle = pagetitle[:pagetitle.index('|')] # we want the actual page objects rather than the titles - site = insite.getSite(code = lang) + site = pywikibot.getSite(code=lang, fam=fam) try: result[site] = pywikibot.Page(site, pagetitle, insite=insite) except pywikibot.InvalidTitle:
pywikipedia-svn@lists.wikimedia.org