http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11306
Revision: 11306
Author:   xqt
Date:     2013-03-30 16:21:10 +0000 (Sat, 30 Mar 2013)

Log Message:
-----------
PEP8 changes, update from trunk r11300, r11301, r11305, r10028
Modified Paths:
--------------
    branches/rewrite/pywikibot/textlib.py
Modified: branches/rewrite/pywikibot/textlib.py
===================================================================
--- branches/rewrite/pywikibot/textlib.py	2013-03-30 16:17:48 UTC (rev 11305)
+++ branches/rewrite/pywikibot/textlib.py	2013-03-30 16:21:10 UTC (rev 11306)
@@ -7,7 +7,7 @@
""" # -# (C) Pywikipedia bot team, 2008-2011 +# (C) Pywikipedia bot team, 2008-2013 # # Distributed under the terms of the MIT license. # @@ -16,11 +16,10 @@
 import pywikibot
 import re
-
-from pywikibot.i18n import translate
 from HTMLParser import HTMLParser
 import config2 as config
 
+
 def unescape(s):
     """Replace escaped HTML-special characters by their originals"""
     if '&' not in s:
@@ -29,12 +28,12 @@
     s = s.replace("&gt;", ">")
     s = s.replace("&apos;", "'")
     s = s.replace("&quot;", '"')
-    s = s.replace("&amp;", "&") # Must be last
+    s = s.replace("&amp;", "&")  # Must be last
     return s
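A quick standalone check of unescape() as patched above, restated verbatim so it runs without pywikibot (the input string is invented). It also shows why "&amp;" must be replaced last: replacing it first would make "&amp;lt;" decode twice.

    def unescape(s):
        """Replace escaped HTML-special characters by their originals"""
        if '&' not in s:
            return s
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        s = s.replace("&apos;", "'")
        s = s.replace("&quot;", '"')
        s = s.replace("&amp;", "&")  # Must be last
        return s

    print unescape(u'&lt;ref&gt;&amp;amp;&lt;/ref&gt;')  # <ref>&amp;</ref>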
 
 def replaceExcept(text, old, new, exceptions, caseInsensitive=False,
-                  allowoverlap=False, marker = '', site = None):
+                  allowoverlap=False, marker='', site=None):
     """
     Return text with 'old' replaced by 'new', ignoring specified types of text.
 
@@ -61,21 +60,21 @@
         site = pywikibot.getSite()
 
     exceptionRegexes = {
-        'comment':     re.compile(r'(?s)<!--.*?-->'),
+        'comment': re.compile(r'(?s)<!--.*?-->'),
         # section headers
-        'header':      re.compile(r'\r?\n=+.+=+ *\r?\n'),
+        'header': re.compile(r'\r?\n=+.+=+ *\r?\n'),
         # preformatted text
-        'pre':         re.compile(r'(?ism)<pre>.*?</pre>'),
-        'source':      re.compile(r'(?is)<source .*?</source>'),
+        'pre': re.compile(r'(?ism)<pre>.*?</pre>'),
+        'source': re.compile(r'(?is)<source .*?</source>'),
         # inline references
-        'ref':         re.compile(r'(?ism)<ref[ >].*?</ref>'),
+        'ref': re.compile(r'(?ism)<ref[ >].*?</ref>'),
         # lines that start with a space are shown in a monospace font and
         # have whitespace preserved.
-        'startspace':  re.compile(r'(?m)^ (.*?)$'),
+        'startspace': re.compile(r'(?m)^ (.*?)$'),
         # tables often have whitespace that is used to improve wiki
         # source code readability.
         # TODO: handle nested tables.
-        'table':       re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
+        'table': re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
         # templates with parameters often have whitespace that is used to
         # improve wiki source code readability.
         # 'template': re.compile(r'(?s){{.*?}}'),
@@ -83,22 +82,25 @@
         # templates cascaded up to level 2, but no deeper. For arbitrary
         # depth, we'd need recursion which can't be done in Python's re.
         # After all, the language of correct parenthesis words is not regular.
-        'template':    re.compile(r'(?s){{(({{.*?}})?.*?)*}}'),
-        'hyperlink':   compileLinkR(),
-        'gallery':     re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
+        'template': re.compile(r'(?s){{(({{.*?}})?.*?)*}}'),
+        'hyperlink': compileLinkR(),
+        'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
         # this matches internal wikilinks, but also interwiki, categories, and
         # images.
-        'link':        re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
+        'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
         # also finds links to foreign sites with preleading ":"
-        'interwiki':   re.compile(r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*'
-                                  % '|'.join(site.validLanguageLinks()
-                                             + site.family.obsolete.keys())
-                                  ),
+        'interwiki': re.compile(r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*'
+                                % '|'.join(site.validLanguageLinks()
+                                           + site.family.obsolete.keys())),
+        # Wikidata property inclusions
+        'property': re.compile(r'(?i){{\s*#property:\s*p\d+\s*}}'),
+        # Module invocations (currently only Lua)
+        'invoke': re.compile(r'(?i){{\s*#invoke:.*?}}'),
     }
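The two entries added above keep replaceExcept() away from Wikidata property calls and Lua module invocations. A minimal standalone check of just these regexes (sample wikitext invented for illustration):

    import re

    prop_re = re.compile(r'(?i){{\s*#property:\s*p\d+\s*}}')
    invoke_re = re.compile(r'(?i){{\s*#invoke:.*?}}')

    sample = u"Employer: {{#property:P108}}, cite via {{#invoke:Citation|main}}"
    print prop_re.findall(sample)    # [u'{{#property:P108}}']
    print invoke_re.findall(sample)  # [u'{{#invoke:Citation|main}}']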
 
     # if we got a string, compile it as a regular expression
-    if type(old) in [str, unicode]:
+    if isinstance(old, basestring):
         if caseInsensitive:
             old = re.compile(old, re.IGNORECASE | re.UNICODE)
         else:
@@ -175,9 +177,14 @@
                     break
                 groupID = groupMatch.group('name') or \
                           int(groupMatch.group('number'))
-                replacement = replacement[:groupMatch.start()] + \
-                              match.group(groupID) + \
-                              replacement[groupMatch.end():]
+                try:
+                    replacement = replacement[:groupMatch.start()] + \
+                                  match.group(groupID) + \
+                                  replacement[groupMatch.end():]
+                except IndexError:
+                    print '\nInvalid group reference:', groupID
+                    print 'Groups found:\n', match.groups()
+                    raise IndexError
             text = text[:match.start()] + replacement + text[match.end():]
 
             # continue the search on the remaining text
@@ -190,7 +197,7 @@
     return text
 
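The new try/except above reports which backreference was bad before re-raising. The underlying failure mode can be reproduced with plain re (pattern and text invented):

    import re

    match = re.search(r'(?P<year>\d{4})', 'born 1879')
    print match.group('year')   # 1879
    try:
        match.group('month')    # group never defined in the pattern
    except IndexError as e:
        print 'Invalid group reference:', e   # no such group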
-def removeDisabledParts(text, tags = ['*']):
+def removeDisabledParts(text, tags=['*']):
     """
     Return text without portions where wiki markup is disabled
 
@@ -205,12 +212,12 @@
""" regexes = { - 'comments' : r'<!--.*?-->', - 'includeonly': r'<includeonly>.*?</includeonly>', - 'nowiki': r'<nowiki>.*?</nowiki>', - 'pre': r'<pre>.*?</pre>', - 'source': r'<source .*?</source>', - 'syntaxhighlight': r'<syntaxhighlight .*?</syntaxhighlight>', + 'comments': r'<!--.*?-->', + 'includeonly': r'<includeonly>.*?</includeonly>', + 'nowiki': r'<nowiki>.*?</nowiki>', + 'pre': r'<pre>.*?</pre>', + 'source': r'<source .*?</source>', + 'syntaxhighlight': r'<syntaxhighlight .*?</syntaxhighlight>', } if '*' in tags: tags = regexes.keys() @@ -223,7 +230,7 @@ return toRemoveR.sub('', text)
-def removeHTMLParts(text, keeptags = ['tt', 'nowiki', 'small', 'sup']):
+def removeHTMLParts(text, keeptags=['tt', 'nowiki', 'small', 'sup']):
     """
     Return text without portions where HTML markup is disabled
 
@@ -232,9 +239,9 @@
     The exact set of parts which should NOT be removed can be passed as the
     'keeptags' parameter, which defaults to ['tt', 'nowiki', 'small', 'sup'].
+
     """
     # try to merge with 'removeDisabledParts()' above into one generic function
-    # thanks to http://www.hellboundhackers.org/articles/841-using-python-39;s-htmlparser-cl...
     parser = _GetDataHTML()
     parser.keeptags = keeptags
@@ -242,6 +249,7 @@
     parser.close()
     return parser.textdata
 
+
 # thanks to http://docs.python.org/library/htmlparser.html
 class _GetDataHTML(HTMLParser):
     textdata = u''
@@ -251,17 +259,19 @@
         self.textdata += data
 
     def handle_starttag(self, tag, attrs):
-        if tag in self.keeptags: self.textdata += u"<%s>" % tag
+        if tag in self.keeptags:
+            self.textdata += u"<%s>" % tag
 
     def handle_endtag(self, tag):
-        if tag in self.keeptags: self.textdata += u"</%s>" % tag
+        if tag in self.keeptags:
+            self.textdata += u"</%s>" % tag
 
 
-def isDisabled(text, index, tags = ['*']):
+def isDisabled(text, index, tags=['*']):
     """
     Return True if text[index] is disabled, e.g. by a comment or by nowiki tags.
+    For the tags parameter, see removeDisabledParts() above.
 
-    For the tags parameter, see removeDisabledParts() above.
     """
     # Find a marker that is not already in the text.
     marker = findmarker(text, '@@', '@')
@@ -270,9 +280,9 @@
     return (marker not in text)
 
 
-def findmarker(text, startwith = u'@', append = u'@'):
+def findmarker(text, startwith=u'@', append=None):
     # find a string which is not part of text
-    if len(append) <= 0:
+    if not append:
         append = u'@'
     mymarker = startwith
     while mymarker in text:
@@ -280,7 +290,7 @@
     return mymarker
 
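Behaviour is unchanged by the new default: the marker grows until it no longer occurs in the text. Restated so it runs standalone:

    def findmarker(text, startwith=u'@', append=None):
        # find a string which is not part of text
        if not append:
            append = u'@'
        mymarker = startwith
        while mymarker in text:
            mymarker += append
        return mymarker

    print findmarker(u'a @@ b')  # @@@ (u'@' and u'@@' already occur)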
 
-def expandmarker(text, marker = '', separator = ''):
+def expandmarker(text, marker='', separator=''):
     # set to remove any number of separator occurrences plus arbitrary
     # whitespace before, after, and between them,
     # by allowing to include them into marker.
@@ -292,8 +302,8 @@
     while firstinseparator > 0 and striploopcontinue:
         striploopcontinue = False
         if (firstinseparator >= lenseparator) and \
-           (separator == text[firstinseparator - \
-                              lenseparator : firstinseparator]):
+           (separator == text[firstinseparator -
+                              lenseparator:firstinseparator]):
             firstinseparator -= lenseparator
             striploopcontinue = True
         elif text[firstinseparator-1] < ' ':
@@ -302,6 +312,7 @@
     marker = text[firstinseparator:firstinmarker] + marker
     return marker
 
+
 #-------------------------------------------------
 # Functions dealing with interwiki language links
 #-------------------------------------------------
@@ -324,7 +335,8 @@
 # do not find or change links of other kinds, nor any that are formatted
 # as in-line interwiki links (e.g., "[[:es:Articulo]]".
 
-def getLanguageLinks(text, insite=None, pageLink="[[]]", template_subpage=False):
+def getLanguageLinks(text, insite=None, pageLink="[[]]",
+                     template_subpage=False):
     """
     Return a dict of interlanguage links found in text.
 
@@ -336,7 +348,8 @@
     if insite is None:
         insite = pywikibot.getSite()
     fam = insite.family
-    # when interwiki links forward to another family, retrieve pages & other infos there
+    # when interwiki links forward to another family, retrieve pages & other
+    # infos there
     if fam.interwiki_forward:
         fam = pywikibot.Family(fam.interwiki_forward)
     result = {}
@@ -351,8 +364,10 @@
     # interwiki link.
     # NOTE: language codes are case-insensitive and only consist of basic latin
     # letters and hyphens.
-    #TODO: currently, we do not have any, but BCP 47 allows digits, and underscores.
-    #TODO: There is no semantic difference between hyphens and underscores -> fold them.
+    # TODO: currently, we do not have any, but BCP 47 allows digits, and
+    # underscores.
+    # TODO: There is no semantic difference between hyphens and
+    # underscores -> fold them.
     interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
     for lang, pagetitle in interwikiR.findall(text):
         lang = lang.lower()
@@ -369,14 +384,14 @@
         try:
             result[site] = pywikibot.Page(site, pagetitle, insite=insite)
         except pywikibot.InvalidTitle:
-            pywikibot.output(
-                u"[getLanguageLinks] Text contains invalid interwiki link [[%s:%s]]."
-                % (lang, pagetitle))
+            pywikibot.output(u'[getLanguageLinks] Text contains invalid '
+                             u'interwiki link [[%s:%s]].'
+                             % (lang, pagetitle))
             continue
     return result
 
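The interwiki matcher used above can be exercised on its own; the language links below are invented:

    import re

    interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
    text = u"Body text.\n[[de:Beispiel]]\n[[fr:Exemple]]"
    print interwikiR.findall(text)
    # [(u'de', u'Beispiel'), (u'fr', u'Exemple')]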
-def removeLanguageLinks(text, site = None, marker = ''):
+def removeLanguageLinks(text, site=None, marker=''):
     """Return text with all interlanguage links removed.
 
     If a link to an unknown language is encountered, a warning is printed.
@@ -391,7 +406,8 @@
         return text
     # This regular expression will find every interwiki link, plus trailing
     # whitespace.
-    languages = '|'.join(site.validLanguageLinks() + site.family.obsolete.keys())
+    languages = '|'.join(site.validLanguageLinks()
+                         + site.family.obsolete.keys())
     interwikiR = re.compile(r'\[\[(%s)\s?:[^\[\]\n]*\]\][\s]*' % languages,
                             re.IGNORECASE)
     text = replaceExcept(text, interwikiR, '',
@@ -421,7 +437,7 @@
 
 def replaceLanguageLinks(oldtext, new, site=None, addOnly=False,
-                        template=False, template_subpage=False):
+                         template=False, template_subpage=False):
     """Replace interlanguage links in the text with a new set of links.
 
     'new' should be a dict with the Site objects as keys, and Page or Link
@@ -442,7 +458,7 @@
     else:
         s2 = removeLanguageLinksAndSeparator(oldtext, site=site, marker=marker,
                                              separator=separatorstripped)
-    s = interwikiFormat(new, insite = site)
+    s = interwikiFormat(new, insite=site)
     if s:
         if site.language() in site.family.interwiki_attop or \
            u'<!-- interwiki at top -->' in oldtext:
@@ -462,28 +478,28 @@
             if "</noinclude>" in s2[firstafter:]:
                 if separatorstripped:
                     s = separator + s
-                newtext = s2[:firstafter].replace(marker,'') + s \
-                          + s2[firstafter:]
+                newtext = s2[:firstafter].replace(marker, '') + s + \
+                          s2[firstafter:]
             elif site.language() in site.family.categories_last:
-                cats = getCategoryLinks(s2, site = site)
+                cats = getCategoryLinks(s2, site=site)
                 s2 = removeCategoryLinksAndSeparator(
-                    s2.replace(marker, cseparatorstripped).strip(),
-                    site) + separator + s
+                    s2.replace(marker, cseparatorstripped).strip(), site) + \
+                    separator + s
                 newtext = replaceCategoryLinks(s2, cats, site=site,
                                                addOnly=True)
             # for Wikitravel's language links position.
             # (not supported by rewrite - no API)
             elif site.family.name == 'wikitravel':
                 s = separator + s + separator
-                newtext = s2[:firstafter].replace(marker,'') + s + \
+                newtext = s2[:firstafter].replace(marker, '') + s + \
                           s2[firstafter:]
             else:
                 if template or template_subpage:
                     if template_subpage:
-                        includeOn  = '<includeonly>'
+                        includeOn = '<includeonly>'
                         includeOff = '</includeonly>'
                     else:
-                        includeOn  = '<noinclude>'
+                        includeOn = '<noinclude>'
                         includeOff = '</noinclude>'
                     separator = ''
                     # Do we have a noinclude at the end of the template?
@@ -495,16 +511,16 @@
                     newtext = regexp.sub(s + includeOff, s2)
                 else:
                     # Put the langlinks at the end, inside noinclude's
-                    newtext = s2.replace(marker,'').strip() + separator + \
+                    newtext = s2.replace(marker, '').strip() + separator + \
                               u'%s\n%s%s\n' % (includeOn, s, includeOff)
             else:
-                newtext = s2.replace(marker,'').strip() + separator + s
+                newtext = s2.replace(marker, '').strip() + separator + s
         else:
-            newtext = s2.replace(marker,'')
+            newtext = s2.replace(marker, '')
     return newtext
 
-def interwikiFormat(links, insite = None):
+def interwikiFormat(links, insite=None):
     """Convert interwiki link dict into a wikitext string.
 
     'links' should be a dict with the Site objects as keys, and Page
@@ -512,6 +528,7 @@
     Return a unicode string that is formatted for inclusion in insite
     (defaulting to the current site).
+
     """
     if insite is None:
         insite = pywikibot.getSite()
@@ -530,16 +547,16 @@
         sep = u' '
     else:
         sep = config.line_separator
-    s=sep.join(s) + config.line_separator
+    s = sep.join(s) + config.line_separator
     return s
 
 
 # Sort sites according to local interwiki sort logic
-def interwikiSort(sites, insite = None):
+def interwikiSort(sites, insite=None):
+    if not sites:
+        return []
     if insite is None:
-        insite = pywikibot.getSite()
-    if not sites:
-        return []
+        insite = pywikibot.getSite()
 
     sites.sort()
     putfirst = insite.interwiki_putfirst()
@@ -547,11 +564,8 @@
         #In this case I might have to change the order
         firstsites = []
         for code in putfirst:
-            # The code may not exist in this family?
-##            if code in insite.family.obsolete:
-##                code = insite.family.obsolete[code]
             if code in insite.validLanguageLinks():
-                site = insite.getSite(code = code)
+                site = insite.getSite(code=code)
                 if site in sites:
                     del sites[sites.index(site)]
                     firstsites = firstsites + [site]
@@ -561,6 +575,7 @@
         sites = insite.interwiki_putfirst_doubled(sites) + sites
     return sites
 
+
 #---------------------------------------
 # Functions dealing with category links
 #---------------------------------------
@@ -588,7 +603,7 @@
                                     '%s:%s' % (match.group('namespace'),
                                                match.group('catName')),
                                     site),
-                                sortKey = match.group('sortKey'))
+                                sortKey=match.group('sortKey'))
             result.append(cat)
     return result
 
@@ -658,9 +673,10 @@
     # spaces and underscores in page titles are interchangeable and collapsible
     title = title.replace(r"\ ", "[ _]+").replace(r"_", "[ _]+")
     categoryR = re.compile(r'\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^\]]+)?\]\])'
-                            % (catNamespace, title), re.I)
-    categoryRN = re.compile(r'^[^\S\n]*\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^\]]+)?\]\])[^\S\n]*\n'
-                            % (catNamespace, title), re.I | re.M)
+                           % (catNamespace, title), re.I)
+    categoryRN = re.compile(
+        r'^[^\S\n]*\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^\]]+)?\]\])[^\S\n]*\n'
+        % (catNamespace, title), re.I | re.M)
     if newcat is None:
         """ First go through and try the more restrictive regex that removes
         an entire line, if the category is the only thing on that line (this
@@ -679,7 +695,7 @@
     return text
 
-def replaceCategoryLinks(oldtext, new, site = None, addOnly = False):
+def replaceCategoryLinks(oldtext, new, site=None, addOnly=False):
     """
     Replace the category links given in the wikitext given in oldtext by the
     new links given in new.
@@ -692,7 +708,7 @@
 
""" # Find a marker that is not already in the text. - marker = findmarker( oldtext, u'@@') + marker = findmarker(oldtext, u'@@') if site is None: site = pywikibot.getSite() if site.sitename() == 'wikipedia:de' and "{{Personendaten" in oldtext: @@ -711,7 +727,7 @@ else: s2 = removeCategoryLinksAndSeparator(oldtext, site=site, marker=marker, separator=separatorstripped) - s = categoryFormat(new, insite = site) + s = categoryFormat(new, insite=site) if s: if site.language() in site.family.category_attop: newtext = s + separator + s2 @@ -730,7 +746,7 @@ newtext = s2[:firstafter].replace(marker, '') + s + \ s2[firstafter:] elif site.language() in site.family.categories_last: - newtext = s2.replace(marker,'').strip() + separator + s + newtext = s2.replace(marker, '').strip() + separator + s else: interwiki = getLanguageLinks(s2) s2 = removeLanguageLinksAndSeparator(s2.replace(marker, ''), @@ -740,11 +756,11 @@ newtext = replaceLanguageLinks(s2, interwiki, site=site, addOnly=True) else: - newtext = s2.replace(marker,'') + newtext = s2.replace(marker, '') return newtext.strip()
-def categoryFormat(categories, insite = None):
+def categoryFormat(categories, insite=None):
     """Return a string containing links to all categories in a list.
 
     'categories' should be a list of Category objects or strings
@@ -758,13 +774,14 @@
     if insite is None:
         insite = pywikibot.getSite()
 
-    if isinstance(categories[0],basestring):
+    if isinstance(categories[0], basestring):
         if categories[0][0] == '[':
             catLinks = categories
         else:
             catLinks = ['[[Category:'+category+']]' for category in categories]
     else:
-        catLinks = [category.aslink(noInterwiki=True) for category in categories]
+        catLinks = [category.aslink(noInterwiki=True)
+                    for category in categories]
 
     if insite.category_on_one_line():
         sep = ' '
@@ -774,6 +791,7 @@
     #catLinks.sort()
     return sep.join(catLinks) + config.line_separator
 
+
 #---------------------------------------
 # Functions dealing with external links
 #---------------------------------------
@@ -796,9 +814,9 @@
     # not allowed inside links. For example, in this wiki text:
     #       ''Please see http://www.example.org.''
     # .'' shouldn't be considered as part of the link.
-    regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd \
-            + '](?=[' + notAtEnd+ ']*\'\')|http[s]?://[^' + notInside \
-            + ']*[^' + notAtEnd + '])'
+    regex = r'(?P<url>http[s]?://[^%(notInside)s]*?[^%(notAtEnd)s]' \
+            r'(?=[%(notAtEnd)s]*\'\')|http[s]?://[^%(notInside)s]*' \
+            r'[^%(notAtEnd)s])' % {'notInside': notInside, 'notAtEnd': notAtEnd}
 
     if withoutBracketed:
         regex = r'(?<!\[)' + regex
@@ -807,6 +825,7 @@
     linkR = re.compile(regex)
     return linkR
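The reworked %-formatting builds the same pattern as the old concatenation. A standalone check, with notInside/notAtEnd copied from textlib (assumed unchanged by this revision), reproducing the comment's example of the trailing dot being excluded:

    import re

    notInside = '\]\s<>"'
    notAtEnd = '\]\s\.:;,<>"\|\)'
    regex = r'(?P<url>http[s]?://[^%(notInside)s]*?[^%(notAtEnd)s]' \
            r'(?=[%(notAtEnd)s]*\'\')|http[s]?://[^%(notInside)s]*' \
            r'[^%(notAtEnd)s])' % {'notInside': notInside, 'notAtEnd': notAtEnd}
    linkR = re.compile(regex)
    m = linkR.search(u"''Please see http://www.example.org.''")
    print m.group('url')  # http://www.example.org (no trailing dot)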
 
+
 #----------------------------------
 # Functions dealing with templates
 #----------------------------------
@@ -845,7 +864,7 @@
     inside = {}
     count = 0
     Rtemplate = re.compile(
-    ur'{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
+        ur'{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
     Rmath = re.compile(ur'<math>[^<]+</math>')
     Rmarker = re.compile(ur'%s(\d+)%s' % (marker, marker))
     Rmarker2 = re.compile(ur'%s(\d+)%s' % (marker2, marker2))
@@ -892,8 +911,8 @@
             for m2 in pywikibot.link_regex.finditer(paramString):
                 count2 += 1
                 text = m2.group(0)
-                paramString = paramString.replace(text,
-                              '%s%d%s' % (marker2, count2, marker2))
+                paramString = paramString.replace(
+                    text, '%s%d%s' % (marker2, count2, marker2))
                 links[count2] = text
             # Parse string
             markedParams = paramString.split('|')
@@ -927,22 +946,21 @@
     You can use items from extract_templates_and_params here to get
     an equivalent template wiki text (it may happen that the order
     of the params changes).
+
     """
     (template, params) = template_and_params
-
     text = u''
     for item in params:
-      text += u'|%s=%s\n' % (item, params[item])
+        text += u'|%s=%s\n' % (item, params[item])
 
     return u'{{%s\n%s}}' % (template, text)
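glue_template_and_params() round-trips what extract_templates_and_params() produces; a hand-built tuple is enough to see the output format (parameter order may differ, as the docstring warns; the infobox data is invented):

    def glue_template_and_params(template_and_params):
        (template, params) = template_and_params
        text = u''
        for item in params:
            text += u'|%s=%s\n' % (item, params[item])
        return u'{{%s\n%s}}' % (template, text)

    print glue_template_and_params((u'Infobox person',
                                    {u'name': u'Ada Lovelace'}))
    # {{Infobox person
    # |name=Ada Lovelace
    # }}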
 
+
 #----------------------------------
 # Page parsing functionality
 #----------------------------------
 
 def does_text_contain_section(pagetext, section):
-    """ Determines whether the page text contains the given
-        section title.
-    """
+    """Determines whether the page text contains the given section title."""
     m = re.search("=+[ ']*%s[ ']*=+" % re.escape(section), pagetext)
     return bool(m)
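The condensed docstring changes nothing in behaviour; a standalone check of the matcher (sample text invented):

    import re

    def does_text_contain_section(pagetext, section):
        m = re.search("=+[ ']*%s[ ']*=+" % re.escape(section), pagetext)
        return bool(m)

    print does_text_contain_section(u"== History ==\nBody.", u"History")  # True
    print does_text_contain_section(u"== History ==\nBody.", u"Sources")  # False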