http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11306
Revision: 11306
Author:   xqt
Date:     2013-03-30 16:21:10 +0000 (Sat, 30 Mar 2013)

Log Message:
-----------
PEP8 changes, update from trunk r11300, r11301, r11305, r10028
Modified Paths:
--------------
    branches/rewrite/pywikibot/textlib.py
Modified: branches/rewrite/pywikibot/textlib.py
===================================================================
--- branches/rewrite/pywikibot/textlib.py	2013-03-30 16:17:48 UTC (rev 11305)
+++ branches/rewrite/pywikibot/textlib.py	2013-03-30 16:21:10 UTC (rev 11306)
@@ -7,7 +7,7 @@
""" # -# (C) Pywikipedia bot team, 2008-2011 +# (C) Pywikipedia bot team, 2008-2013 # # Distributed under the terms of the MIT license. # @@ -16,11 +16,10 @@
 import pywikibot
 import re
-
-from pywikibot.i18n import translate
 from HTMLParser import HTMLParser
 import config2 as config
 
+
 def unescape(s):
     """Replace escaped HTML-special characters by their originals"""
     if '&' not in s:
@@ -29,12 +28,12 @@
     s = s.replace("&gt;", ">")
     s = s.replace("&apos;", "'")
     s = s.replace("&quot;", '"')
-    s = s.replace("&amp;", "&") # Must be last
+    s = s.replace("&amp;", "&")  # Must be last
     return s
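A quick standalone check of unescape() as patched above, restated verbatim so it runs without pywikibot (the input string is invented). It also shows why "&amp;" must be replaced last: replacing it first would make "&amp;lt;" decode twice.

    def unescape(s):
        """Replace escaped HTML-special characters by their originals"""
        if '&' not in s:
            return s
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        s = s.replace("&apos;", "'")
        s = s.replace("&quot;", '"')
        s = s.replace("&amp;", "&")  # Must be last
        return s

    print unescape(u'&lt;ref&gt;&amp;amp;&lt;/ref&gt;')  # <ref>&amp;</ref>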
 
 def replaceExcept(text, old, new, exceptions, caseInsensitive=False,
-                  allowoverlap=False, marker = '', site = None):
+                  allowoverlap=False, marker='', site=None):
     """
     Return text with 'old' replaced by 'new', ignoring specified types of text.
 
@@ -61,21 +60,21 @@
         site = pywikibot.getSite()
 
     exceptionRegexes = {
-        'comment':     re.compile(r'(?s)<!--.*?-->'),
+        'comment': re.compile(r'(?s)<!--.*?-->'),
         # section headers
-        'header':      re.compile(r'\r?\n=+.+=+ *\r?\n'),
+        'header': re.compile(r'\r?\n=+.+=+ *\r?\n'),
         # preformatted text
-        'pre':         re.compile(r'(?ism)<pre>.*?</pre>'),
-        'source':      re.compile(r'(?is)<source .*?</source>'),
+        'pre': re.compile(r'(?ism)<pre>.*?</pre>'),
+        'source': re.compile(r'(?is)<source .*?</source>'),
         # inline references
-        'ref':         re.compile(r'(?ism)<ref[ >].*?</ref>'),
+        'ref': re.compile(r'(?ism)<ref[ >].*?</ref>'),
         # lines that start with a space are shown in a monospace font and
         # have whitespace preserved.
-        'startspace':  re.compile(r'(?m)^ (.*?)$'),
+        'startspace': re.compile(r'(?m)^ (.*?)$'),
         # tables often have whitespace that is used to improve wiki
         # source code readability.
         # TODO: handle nested tables.
-        'table':       re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
+        'table': re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
         # templates with parameters often have whitespace that is used to
         # improve wiki source code readability.
         # 'template': re.compile(r'(?s){{.*?}}'),
@@ -83,22 +82,25 @@
         # templates cascaded up to level 2, but no deeper. For arbitrary
         # depth, we'd need recursion which can't be done in Python's re.
         # After all, the language of correct parenthesis words is not regular.
-        'template':    re.compile(r'(?s){{(({{.*?}})?.*?)*}}'),
-        'hyperlink':   compileLinkR(),
-        'gallery':     re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
+        'template': re.compile(r'(?s){{(({{.*?}})?.*?)*}}'),
+        'hyperlink': compileLinkR(),
+        'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
         # this matches internal wikilinks, but also interwiki, categories, and
         # images.
-        'link':        re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
+        'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
         # also finds links to foreign sites with preleading ":"
-        'interwiki':   re.compile(r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*'
-                                  % '|'.join(site.validLanguageLinks()
-                                             + site.family.obsolete.keys())
-                                  ),
+        'interwiki': re.compile(r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*'
+                                % '|'.join(site.validLanguageLinks()
+                                           + site.family.obsolete.keys())),
+        # Wikidata property inclusions
+        'property': re.compile(r'(?i){{\s*#property:\s*p\d+\s*}}'),
+        # Module invocations (currently only Lua)
+        'invoke': re.compile(r'(?i){{\s*#invoke:.*?}}'),
     }
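The two entries added above keep replaceExcept() away from Wikidata property calls and Lua module invocations. A minimal standalone check of just these regexes (sample wikitext invented for illustration):

    import re

    prop_re = re.compile(r'(?i){{\s*#property:\s*p\d+\s*}}')
    invoke_re = re.compile(r'(?i){{\s*#invoke:.*?}}')

    sample = u"Employer: {{#property:P108}}, cite via {{#invoke:Citation|main}}"
    print prop_re.findall(sample)    # [u'{{#property:P108}}']
    print invoke_re.findall(sample)  # [u'{{#invoke:Citation|main}}']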
 
     # if we got a string, compile it as a regular expression
-    if type(old) in [str, unicode]:
+    if isinstance(old, basestring):
         if caseInsensitive:
             old = re.compile(old, re.IGNORECASE | re.UNICODE)
         else:
@@ -175,9 +177,14 @@
                     break
                 groupID = groupMatch.group('name') or \
                           int(groupMatch.group('number'))
-                replacement = replacement[:groupMatch.start()] + \
-                              match.group(groupID) + \
-                              replacement[groupMatch.end():]
+                try:
+                    replacement = replacement[:groupMatch.start()] + \
+                                  match.group(groupID) + \
+                                  replacement[groupMatch.end():]
+                except IndexError:
+                    print '\nInvalid group reference:', groupID
+                    print 'Groups found:\n', match.groups()
+                    raise IndexError
             text = text[:match.start()] + replacement + text[match.end():]
 
             # continue the search on the remaining text
@@ -190,7 +197,7 @@
     return text
 
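The new try/except above reports which backreference was bad before re-raising. The underlying failure mode can be reproduced with plain re (pattern and text invented):

    import re

    match = re.search(r'(?P<year>\d{4})', 'born 1879')
    print match.group('year')   # 1879
    try:
        match.group('month')    # group never defined in the pattern
    except IndexError as e:
        print 'Invalid group reference:', e   # no such group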
-def removeDisabledParts(text, tags = ['*']):
+def removeDisabledParts(text, tags=['*']):
     """
     Return text without portions where wiki markup is disabled
 
@@ -205,12 +212,12 @@
""" regexes = { - 'comments' : r'<!--.*?-->', - 'includeonly': r'<includeonly>.*?</includeonly>', - 'nowiki': r'<nowiki>.*?</nowiki>', - 'pre': r'<pre>.*?</pre>', - 'source': r'<source .*?</source>', - 'syntaxhighlight': r'<syntaxhighlight .*?</syntaxhighlight>', + 'comments': r'<!--.*?-->', + 'includeonly': r'<includeonly>.*?</includeonly>', + 'nowiki': r'<nowiki>.*?</nowiki>', + 'pre': r'<pre>.*?</pre>', + 'source': r'<source .*?</source>', + 'syntaxhighlight': r'<syntaxhighlight .*?</syntaxhighlight>', } if '*' in tags: tags = regexes.keys() @@ -223,7 +230,7 @@ return toRemoveR.sub('', text)
-def removeHTMLParts(text, keeptags = ['tt', 'nowiki', 'small', 'sup']):
+def removeHTMLParts(text, keeptags=['tt', 'nowiki', 'small', 'sup']):
     """
     Return text without portions where HTML markup is disabled
 
@@ -232,9 +239,9 @@
     The exact set of parts which should NOT be removed can be passed as the
     'keeptags' parameter, which defaults to ['tt', 'nowiki', 'small', 'sup'].
+
     """
     # try to merge with 'removeDisabledParts()' above into one generic function
-    # thanks to http://www.hellboundhackers.org/articles/841-using-python-39;s-htmlparser-cl...
     parser = _GetDataHTML()
     parser.keeptags = keeptags
@@ -242,6 +249,7 @@
     parser.close()
     return parser.textdata
 
+
 # thanks to http://docs.python.org/library/htmlparser.html
 class _GetDataHTML(HTMLParser):
     textdata = u''
@@ -251,17 +259,19 @@
         self.textdata += data
 
     def handle_starttag(self, tag, attrs):
-        if tag in self.keeptags: self.textdata += u"<%s>" % tag
+        if tag in self.keeptags:
+            self.textdata += u"<%s>" % tag
 
     def handle_endtag(self, tag):
-        if tag in self.keeptags: self.textdata += u"</%s>" % tag
+        if tag in self.keeptags:
+            self.textdata += u"</%s>" % tag
 
 
-def isDisabled(text, index, tags = ['*']):
+def isDisabled(text, index, tags=['*']):
     """
     Return True if text[index] is disabled, e.g. by a comment or by nowiki tags.
+    For the tags parameter, see removeDisabledParts() above.
 
-    For the tags parameter, see removeDisabledParts() above.
     """
     # Find a marker that is not already in the text.
     marker = findmarker(text, '@@', '@')
@@ -270,9 +280,9 @@
     return (marker not in text)
 
 
-def findmarker(text, startwith = u'@', append = u'@'):
+def findmarker(text, startwith=u'@', append=None):
     # find a string which is not part of text
-    if len(append) <= 0:
+    if not append:
         append = u'@'
     mymarker = startwith
     while mymarker in text:
@@ -280,7 +290,7 @@
     return mymarker
 
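Behaviour is unchanged by the new default: the marker grows until it no longer occurs in the text. Restated so it runs standalone:

    def findmarker(text, startwith=u'@', append=None):
        # find a string which is not part of text
        if not append:
            append = u'@'
        mymarker = startwith
        while mymarker in text:
            mymarker += append
        return mymarker

    print findmarker(u'a @@ b')  # @@@ (u'@' and u'@@' already occur)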
 
-def expandmarker(text, marker = '', separator = ''):
+def expandmarker(text, marker='', separator=''):
     # set to remove any number of separator occurrences plus arbitrary
     # whitespace before, after, and between them,
     # by allowing to include them into marker.
@@ -292,8 +302,8 @@
     while firstinseparator > 0 and striploopcontinue:
         striploopcontinue = False
         if (firstinseparator >= lenseparator) and \
-           (separator == text[firstinseparator - \
-                              lenseparator : firstinseparator]):
+           (separator == text[firstinseparator -
+                              lenseparator:firstinseparator]):
             firstinseparator -= lenseparator
             striploopcontinue = True
         elif text[firstinseparator-1] < ' ':
@@ -302,6 +312,7 @@
     marker = text[firstinseparator:firstinmarker] + marker
     return marker
 
+
 #-------------------------------------------------
 # Functions dealing with interwiki language links
 #-------------------------------------------------
@@ -324,7 +335,8 @@
 # do not find or change links of other kinds, nor any that are formatted
 # as in-line interwiki links (e.g., "[[:es:Articulo]]".
 
-def getLanguageLinks(text, insite=None, pageLink="[[]]", template_subpage=False):
+def getLanguageLinks(text, insite=None, pageLink="[[]]",
+                     template_subpage=False):
     """
     Return a dict of interlanguage links found in text.
 
@@ -336,7 +348,8 @@
     if insite is None:
         insite = pywikibot.getSite()
     fam = insite.family
-    # when interwiki links forward to another family, retrieve pages & other infos there
+    # when interwiki links forward to another family, retrieve pages & other
+    # infos there
     if fam.interwiki_forward:
         fam = pywikibot.Family(fam.interwiki_forward)
     result = {}
@@ -351,8 +364,10 @@
     # interwiki link.
     # NOTE: language codes are case-insensitive and only consist of basic latin
     # letters and hyphens.
-    #TODO: currently, we do not have any, but BCP 47 allows digits, and underscores.
-    #TODO: There is no semantic difference between hyphens and underscores -> fold them.
+    # TODO: currently, we do not have any, but BCP 47 allows digits, and
+    # underscores.
+    # TODO: There is no semantic difference between hyphens and
+    # underscores -> fold them.
     interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
     for lang, pagetitle in interwikiR.findall(text):
         lang = lang.lower()
@@ -369,14 +384,14 @@
         try:
             result[site] = pywikibot.Page(site, pagetitle, insite=insite)
         except pywikibot.InvalidTitle:
-            pywikibot.output(
-                u"[getLanguageLinks] Text contains invalid interwiki link [[%s:%s]]."
-                % (lang, pagetitle))
+            pywikibot.output(u'[getLanguageLinks] Text contains invalid '
+                             u'interwiki link [[%s:%s]].'
+                             % (lang, pagetitle))
             continue
     return result
 
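The interwiki matcher used above can be exercised on its own; the language links below are invented:

    import re

    interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
    text = u"Body text.\n[[de:Beispiel]]\n[[fr:Exemple]]"
    print interwikiR.findall(text)
    # [(u'de', u'Beispiel'), (u'fr', u'Exemple')]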
-def removeLanguageLinks(text, site = None, marker = ''):
+def removeLanguageLinks(text, site=None, marker=''):
     """Return text with all interlanguage links removed.
 
     If a link to an unknown language is encountered, a warning is printed.
@@ -391,7 +406,8 @@
         return text
     # This regular expression will find every interwiki link, plus trailing
     # whitespace.
-    languages = '|'.join(site.validLanguageLinks() + site.family.obsolete.keys())
+    languages = '|'.join(site.validLanguageLinks()
+                         + site.family.obsolete.keys())
     interwikiR = re.compile(r'\[\[(%s)\s?:[^\[\]\n]*\]\][\s]*' % languages,
                             re.IGNORECASE)
     text = replaceExcept(text, interwikiR, '',
@@ -421,7 +437,7 @@
 
 def replaceLanguageLinks(oldtext, new, site=None, addOnly=False,
-                        template=False, template_subpage=False):
+                         template=False, template_subpage=False):
     """Replace interlanguage links in the text with a new set of links.
 
     'new' should be a dict with the Site objects as keys, and Page or Link
@@ -442,7 +458,7 @@
     else:
         s2 = removeLanguageLinksAndSeparator(oldtext, site=site, marker=marker,
                                              separator=separatorstripped)
-    s = interwikiFormat(new, insite = site)
+    s = interwikiFormat(new, insite=site)
     if s:
         if site.language() in site.family.interwiki_attop or \
            u'<!-- interwiki at top -->' in oldtext:
@@ -462,28 +478,28 @@
             if "</noinclude>" in s2[firstafter:]:
                 if separatorstripped:
                     s = separator + s
-                newtext = s2[:firstafter].replace(marker,'') + s \
-                          + s2[firstafter:]
+                newtext = s2[:firstafter].replace(marker, '') + s + \
+                          s2[firstafter:]
             elif site.language() in site.family.categories_last:
-                cats = getCategoryLinks(s2, site = site)
+                cats = getCategoryLinks(s2, site=site)
                 s2 = removeCategoryLinksAndSeparator(
-                    s2.replace(marker, cseparatorstripped).strip(),
-                    site) + separator + s
+                    s2.replace(marker, cseparatorstripped).strip(), site) + \
+                    separator + s
                 newtext = replaceCategoryLinks(s2, cats, site=site,
                                                addOnly=True)
             # for Wikitravel's language links position.
             # (not supported by rewrite - no API)
             elif site.family.name == 'wikitravel':
                 s = separator + s + separator
-                newtext = s2[:firstafter].replace(marker,'') + s + \
+                newtext = s2[:firstafter].replace(marker, '') + s + \
                           s2[firstafter:]
             else:
                 if template or template_subpage:
                     if template_subpage:
-                        includeOn  = '<includeonly>'
+                        includeOn = '<includeonly>'
                         includeOff = '</includeonly>'
                     else:
-                        includeOn  = '<noinclude>'
+                        includeOn = '<noinclude>'
                         includeOff = '</noinclude>'
                     separator = ''
                     # Do we have a noinclude at the end of the template?
@@ -495,16 +511,16 @@
                     newtext = regexp.sub(s + includeOff, s2)
                 else:
                     # Put the langlinks at the end, inside noinclude's
-                    newtext = s2.replace(marker,'').strip() + separator + \
+                    newtext = s2.replace(marker, '').strip() + separator + \
                               u'%s\n%s%s\n' % (includeOn, s, includeOff)
             else:
-                newtext = s2.replace(marker,'').strip() + separator + s
+                newtext = s2.replace(marker, '').strip() + separator + s
         else:
-            newtext = s2.replace(marker,'')
+            newtext = s2.replace(marker, '')
     return newtext
 
-def interwikiFormat(links, insite = None):
+def interwikiFormat(links, insite=None):
     """Convert interwiki link dict into a wikitext string.
 
     'links' should be a dict with the Site objects as keys, and Page
@@ -512,6 +528,7 @@
     Return a unicode string that is formatted for inclusion in insite
     (defaulting to the current site).
+
     """
     if insite is None:
         insite = pywikibot.getSite()
@@ -530,16 +547,16 @@
         sep = u' '
     else:
         sep = config.line_separator
-    s=sep.join(s) + config.line_separator
+    s = sep.join(s) + config.line_separator
     return s
 
 
 # Sort sites according to local interwiki sort logic
-def interwikiSort(sites, insite = None):
+def interwikiSort(sites, insite=None):
+    if not sites:
+        return []
     if insite is None:
-        insite = pywikibot.getSite()
-    if not sites:
-        return []
+        insite = pywikibot.getSite()
 
     sites.sort()
     putfirst = insite.interwiki_putfirst()
@@ -547,11 +564,8 @@
         #In this case I might have to change the order
         firstsites = []
         for code in putfirst:
-            # The code may not exist in this family?
-##            if code in insite.family.obsolete:
-##                code = insite.family.obsolete[code]
             if code in insite.validLanguageLinks():
-                site = insite.getSite(code = code)
+                site = insite.getSite(code=code)
                 if site in sites:
                     del sites[sites.index(site)]
                     firstsites = firstsites + [site]
@@ -561,6 +575,7 @@
         sites = insite.interwiki_putfirst_doubled(sites) + sites
     return sites
 
+
 #---------------------------------------
 # Functions dealing with category links
 #---------------------------------------
@@ -588,7 +603,7 @@
                                     '%s:%s' % (match.group('namespace'),
                                                match.group('catName')),
                                     site),
-                                sortKey = match.group('sortKey'))
+                                sortKey=match.group('sortKey'))
             result.append(cat)
     return result
 
@@ -658,9 +673,10 @@
     # spaces and underscores in page titles are interchangeable and collapsible
     title = title.replace(r"\ ", "[ _]+").replace(r"_", "[ _]+")
     categoryR = re.compile(r'\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^\]]+)?\]\])'
-                            % (catNamespace, title), re.I)
-    categoryRN = re.compile(r'^[^\S\n]*\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^\]]+)?\]\])[^\S\n]*\n'
-                            % (catNamespace, title), re.I | re.M)
+                           % (catNamespace, title), re.I)
+    categoryRN = re.compile(
+        r'^[^\S\n]*\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^\]]+)?\]\])[^\S\n]*\n'
+        % (catNamespace, title), re.I | re.M)
     if newcat is None:
         """ First go through and try the more restrictive regex that removes
         an entire line, if the category is the only thing on that line (this
@@ -679,7 +695,7 @@
     return text
 
-def replaceCategoryLinks(oldtext, new, site = None, addOnly = False):
+def replaceCategoryLinks(oldtext, new, site=None, addOnly=False):
     """
     Replace the category links given in the wikitext given in oldtext by the
     new links given in new.
@@ -692,7 +708,7 @@
 
""" # Find a marker that is not already in the text. - marker = findmarker( oldtext, u'@@') + marker = findmarker(oldtext, u'@@') if site is None: site = pywikibot.getSite() if site.sitename() == 'wikipedia:de' and "{{Personendaten" in oldtext: @@ -711,7 +727,7 @@ else: s2 = removeCategoryLinksAndSeparator(oldtext, site=site, marker=marker, separator=separatorstripped) - s = categoryFormat(new, insite = site) + s = categoryFormat(new, insite=site) if s: if site.language() in site.family.category_attop: newtext = s + separator + s2 @@ -730,7 +746,7 @@ newtext = s2[:firstafter].replace(marker, '') + s + \ s2[firstafter:] elif site.language() in site.family.categories_last: - newtext = s2.replace(marker,'').strip() + separator + s + newtext = s2.replace(marker, '').strip() + separator + s else: interwiki = getLanguageLinks(s2) s2 = removeLanguageLinksAndSeparator(s2.replace(marker, ''), @@ -740,11 +756,11 @@ newtext = replaceLanguageLinks(s2, interwiki, site=site, addOnly=True) else: - newtext = s2.replace(marker,'') + newtext = s2.replace(marker, '') return newtext.strip()
-def categoryFormat(categories, insite = None):
+def categoryFormat(categories, insite=None):
     """Return a string containing links to all categories in a list.
 
     'categories' should be a list of Category objects or strings
@@ -758,13 +774,14 @@
     if insite is None:
         insite = pywikibot.getSite()
 
-    if isinstance(categories[0],basestring):
+    if isinstance(categories[0], basestring):
         if categories[0][0] == '[':
             catLinks = categories
         else:
             catLinks = ['[[Category:'+category+']]' for category in categories]
     else:
-        catLinks = [category.aslink(noInterwiki=True) for category in categories]
+        catLinks = [category.aslink(noInterwiki=True)
+                    for category in categories]
 
     if insite.category_on_one_line():
         sep = ' '
@@ -774,6 +791,7 @@
     #catLinks.sort()
     return sep.join(catLinks) + config.line_separator
 
+
 #---------------------------------------
 # Functions dealing with external links
 #---------------------------------------
@@ -796,9 +814,9 @@
     # not allowed inside links. For example, in this wiki text:
     #       ''Please see http://www.example.org.''
     # .'' shouldn't be considered as part of the link.
-    regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd \
-            + '](?=[' + notAtEnd+ ']*\'\')|http[s]?://[^' + notInside \
-            + ']*[^' + notAtEnd + '])'
+    regex = r'(?P<url>http[s]?://[^%(notInside)s]*?[^%(notAtEnd)s]' \
+            r'(?=[%(notAtEnd)s]*\'\')|http[s]?://[^%(notInside)s]*' \
+            r'[^%(notAtEnd)s])' % {'notInside': notInside, 'notAtEnd': notAtEnd}
 
     if withoutBracketed:
         regex = r'(?<!\[)' + regex
@@ -807,6 +825,7 @@
     linkR = re.compile(regex)
     return linkR
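The reworked %-formatting builds the same pattern as the old concatenation. A standalone check, with notInside/notAtEnd copied from textlib (assumed unchanged by this revision), reproducing the comment's example of the trailing dot being excluded:

    import re

    notInside = '\]\s<>"'
    notAtEnd = '\]\s\.:;,<>"\|\)'
    regex = r'(?P<url>http[s]?://[^%(notInside)s]*?[^%(notAtEnd)s]' \
            r'(?=[%(notAtEnd)s]*\'\')|http[s]?://[^%(notInside)s]*' \
            r'[^%(notAtEnd)s])' % {'notInside': notInside, 'notAtEnd': notAtEnd}
    linkR = re.compile(regex)
    m = linkR.search(u"''Please see http://www.example.org.''")
    print m.group('url')  # http://www.example.org (no trailing dot)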
 
+
 #----------------------------------
 # Functions dealing with templates
 #----------------------------------
@@ -845,7 +864,7 @@
     inside = {}
     count = 0
     Rtemplate = re.compile(
-    ur'{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
+        ur'{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
     Rmath = re.compile(ur'<math>[^<]+</math>')
     Rmarker = re.compile(ur'%s(\d+)%s' % (marker, marker))
     Rmarker2 = re.compile(ur'%s(\d+)%s' % (marker2, marker2))
@@ -892,8 +911,8 @@
             for m2 in pywikibot.link_regex.finditer(paramString):
                 count2 += 1
                 text = m2.group(0)
-                paramString = paramString.replace(text,
-                              '%s%d%s' % (marker2, count2, marker2))
+                paramString = paramString.replace(
+                    text, '%s%d%s' % (marker2, count2, marker2))
                 links[count2] = text
             # Parse string
             markedParams = paramString.split('|')
@@ -927,22 +946,21 @@
     You can use items from extract_templates_and_params here to get
     an equivalent template wiki text (it may happen that the order
     of the params changes).
+
     """
     (template, params) = template_and_params
-
     text = u''
     for item in params:
-      text += u'|%s=%s\n' % (item, params[item])
+        text += u'|%s=%s\n' % (item, params[item])
 
     return u'{{%s\n%s}}' % (template, text)
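glue_template_and_params() round-trips what extract_templates_and_params() produces; a hand-built tuple is enough to see the output format (parameter order may differ, as the docstring warns; the infobox data is invented):

    def glue_template_and_params(template_and_params):
        (template, params) = template_and_params
        text = u''
        for item in params:
            text += u'|%s=%s\n' % (item, params[item])
        return u'{{%s\n%s}}' % (template, text)

    print glue_template_and_params((u'Infobox person',
                                    {u'name': u'Ada Lovelace'}))
    # {{Infobox person
    # |name=Ada Lovelace
    # }}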
 
+
 #----------------------------------
 # Page parsing functionality
 #----------------------------------
 
 def does_text_contain_section(pagetext, section):
-    """ Determines whether the page text contains the given
-        section title.
-    """
+    """Determines whether the page text contains the given section title."""
     m = re.search("=+[ ']*%s[ ']*=+" % re.escape(section), pagetext)
     return bool(m)
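The condensed docstring changes nothing in behaviour; a standalone check of the matcher (sample text invented):

    import re

    def does_text_contain_section(pagetext, section):
        m = re.search("=+[ ']*%s[ ']*=+" % re.escape(section), pagetext)
        return bool(m)

    print does_text_contain_section(u"== History ==\nBody.", u"History")  # True
    print does_text_contain_section(u"== History ==\nBody.", u"Sources")  # False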