http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10025
Revision: 10025
Author: valhallasw
Date: 2012-03-18 17:23:48 +0000 (Sun, 18 Mar 2012)
Log Message:
-----------
(somewhat blunt) merge of trunk/pywikibot/textlib.py
Modified Paths:
--------------
branches/rewrite/pywikibot/textlib.py
Modified: branches/rewrite/pywikibot/textlib.py
===================================================================
--- branches/rewrite/pywikibot/textlib.py 2012-03-18 17:08:41 UTC (rev 10024)
+++ branches/rewrite/pywikibot/textlib.py 2012-03-18 17:23:48 UTC (rev 10025)
@@ -18,6 +18,7 @@
import re
from pywikibot.i18n import translate
+from HTMLParser import HTMLParser
def unescape(s):
"""Replace escaped HTML-special characters by their
originals"""
@@ -221,6 +222,40 @@
return toRemoveR.sub('', text)
+def removeHTMLParts(text, keeptags = ['tt', 'nowiki', 'small',
'sup']):
+ """
+ Return text without portions where HTML markup is disabled
+
+ Parts that can/will be removed are --
+ * HTML and all wiki tags
+
+ The exact set of parts which should NOT be removed can be passed as the
+ 'keeptags' parameter, which defaults to ['tt', 'nowiki',
'small', 'sup'].
+ """
+ # try to merge with 'removeDisabledParts()' above into one generic function
+
+ # thanks to
http://www.hellboundhackers.org/articles/841-using-python-39;s-htmlparser-c…
+ parser = _GetDataHTML()
+ parser.keeptags = keeptags
+ parser.feed(text)
+ parser.close()
+ return parser.textdata
+
+# thanks to
http://docs.python.org/library/htmlparser.html
+class _GetDataHTML(HTMLParser):
+ textdata = u''
+ keeptags = []
+
+ def handle_data(self, data):
+ self.textdata += data
+
+ def handle_starttag(self, tag, attrs):
+ if tag in self.keeptags: self.textdata += u"<%s>" % tag
+
+ def handle_endtag(self, tag):
+ if tag in self.keeptags: self.textdata += u"</%s>" % tag
+
+
def isDisabled(text, index, tags = ['*']):
"""
Return True if text[index] is disabled, e.g. by a comment or by nowiki tags.
@@ -269,12 +304,24 @@
#-------------------------------------------------
# Functions dealing with interwiki language links
#-------------------------------------------------
-# Note - MediaWiki supports two kinds of interwiki links; interlanguage and
-# interproject. These functions only deal with links to a
-# corresponding page in another language on the same project (e.g.,
-# Wikipedia, Wiktionary, etc.) in another language. They do not find
-# or change links to a different project, or any that are formatted
-# as in-line interwiki links (e.g., "[[:es:Articulo]]". (CONFIRM)
+# Note - MediaWiki supports several kinds of interwiki links; two kinds are
+# interlanguage links. We deal here with those kinds only.
+# A family has by definition only one kind of interlanguage links:
+# 1 - interlanguage links inside the own family.
+# They go to a corresponding page in another language in the same
+# family, such as from 'en.wikipedia' to 'pt.wikipedia', or
from
+# 'es.wiktionary' to 'arz.wiktionary'.
+# Families with this kind have several language-specific sites.
+# They have their interwiki_forward attribute set to None
+# 2 - language links forwarding to another family.
+# They go to a corresponding page in another family, such as from
+#            'commons' to 'zh.wikipedia', or from 'incubator' to
'en.wikipedia'.
+# Families having those have one member only, and do not have
+# language-specific sites. The name of the target family of their
+# interlanguage links is kept in their interwiki_forward attribute.
+# These functions deal with links of these two kinds only. They
+# do not find or change links of other kinds, nor any that are formatted
+# as in-line interwiki links (e.g., "[[:es:Articulo]]").
def getLanguageLinks(text, insite=None, pageLink="[[]]",
template_subpage=False):
"""
@@ -287,6 +334,10 @@
"""
if insite is None:
insite = pywikibot.getSite()
+ fam = insite.family
+ # when interwiki links forward to another family, retrieve pages & other info
there
+ if fam.interwiki_forward:
+ fam = pywikibot.Family(fam.interwiki_forward)
result = {}
# Ignore interwiki links within nowiki tags, includeonly tags, pre tags,
# and HTML comments
@@ -299,20 +350,28 @@
# interwiki link.
# NOTE: language codes are case-insensitive and only consist of basic latin
# letters and hyphens.
+ #TODO: currently, we do not have any, but BCP 47 allows digits and underscores.
+ #TODO: There is no semantic difference between hyphens and underscores -> fold
them.
interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
for lang, pagetitle in interwikiR.findall(text):
lang = lang.lower()
# Check if it really is in fact an interwiki link to a known
# language, or if it's e.g. a category tag or an internal link
- if lang in insite.family.obsolete:
- lang = insite.family.obsolete[lang]
- if lang in insite.validLanguageLinks():
+ if lang in fam.obsolete:
+ lang = fam.obsolete[lang]
+ if lang in fam.langs.keys():
if '|' in pagetitle:
# ignore text after the pipe
pagetitle = pagetitle[:pagetitle.index('|')]
# we want the actual page objects rather than the titles
- site = insite.getSite(code = lang)
- result[site] = pywikibot.Page(pywikibot.Link(pagetitle, site))
+ site = pywikibot.getSite(code=lang, fam=fam)
+ try:
+ result[site] = pywikibot.Page(site, pagetitle, insite=insite)
+ except pywikibot.InvalidTitle:
+ pywikibot.output(
+ u"[getLanguageLinks] Text contains invalid interwiki link [[%s:%s]]."
+ % (lang, pagetitle))
+ continue
return result
@@ -386,7 +445,11 @@
if s:
if site.language() in site.family.interwiki_attop or \
u'<!-- interwiki at top -->' in oldtext:
- newtext = s + separator + s2.replace(marker,'').strip()
+ #do not add separator if interwiki links are on one line
+ newtext = s + \
+ [separator, u''][site.language() in
+ site.family.interwiki_on_one_line] + \
+ s2.replace(marker, '').strip()
else:
# calculate what was after the language links on the page
firstafter = s2.find(marker)
@@ -407,6 +470,12 @@
site) + separator + s
newtext = replaceCategoryLinks(s2, cats, site=site,
addOnly=True)
+ # for Wikitravel's language links position.
+ # (not supported by rewrite - no API)
+ elif site.family.name == 'wikitravel':
+ s = separator + s + separator
+ newtext = s2[:firstafter].replace(marker,'') + s + \
+ s2[firstafter:]
else:
if template or template_subpage:
if template_subpage:
@@ -451,13 +520,11 @@
ar = interwikiSort(links.keys(), insite)
s = []
for site in ar:
- obj = links[site]
- if isinstance(obj, pywikibot.Link):
- link = obj.astext(insite)
- else:
- # Page
- link = obj.title(asLink=True, forceInterwiki=True)
- s.append(link)
+ try:
+ link = unicode(links[site]).replace('[[:', '[[')
+ s.append(link)
+ except AttributeError:
+ s.append(getSite(site).linkto(links[site], othersite=insite))
if insite.lang in insite.family.interwiki_on_one_line:
sep = u' '
else:
@@ -497,7 +564,8 @@
# Functions dealing with category links
#---------------------------------------
-def getCategoryLinks(text, site):
+def getCategoryLinks(text, site=None):
+ import catlib
"""Return a list of category links found in text.
List contains Category objects.
@@ -505,6 +573,8 @@
"""
result = []
+ if site is None:
+ site = pywikibot.getSite()
# Ignore category links within nowiki tags, pre tags, includeonly tags,
# and HTML comments
text = removeDisabledParts(text)
@@ -522,7 +592,7 @@
return result
-def removeCategoryLinks(text, site, marker=''):
+def removeCategoryLinks(text, site=None, marker=''):
"""Return text with all category links removed.
Put the string marker after the last replacement (at the end of the text
@@ -533,6 +603,8 @@
# interwiki link, plus trailing whitespace. The language code is grouped.
# NOTE: This assumes that language codes only consist of non-capital
# ASCII letters and hyphens.
+ if site is None:
+ site = pywikibot.getSite()
catNamespace = '|'.join(site.category_namespaces())
categoryR = re.compile(r'\[\[\s*(%s)\s*:.*?\]\]\s*' % catNamespace, re.I)
text = replaceExcept(text, categoryR, '',
@@ -554,6 +626,8 @@
if there is no replacement).
"""
+ if site is None:
+ site = pywikibot.getSite()
if separator:
mymarker = findmarker(text, u'@C@')
newtext = removeCategoryLinks(text, site, mymarker)
@@ -609,7 +683,8 @@
Replace the category links given in the wikitext given
in oldtext by the new links given in new.
- 'new' should be a list of Category objects.
+ 'new' should be a list of Category objects or strings
+ which can be either the raw name or [[Category:..]].
If addOnly is True, the old category won't be deleted and the
category(s) given will be added (and so they won't replace anything).
@@ -671,7 +746,8 @@
def categoryFormat(categories, insite = None):
"""Return a string containing links to all categories in a list.
- 'categories' should be a list of Category objects.
+ 'categories' should be a list of Category objects or strings
+ which can be either the raw name or [[Category:..]].
The string is formatted for inclusion in insite.
@@ -680,7 +756,15 @@
return ''
if insite is None:
insite = pywikibot.getSite()
- catLinks = [category.aslink() for category in categories]
+
+ if isinstance(categories[0],basestring):
+ if categories[0][0] == '[':
+ catLinks = categories
+ else:
+ catLinks = ['[[Category:'+category+']]' for category in
categories]
+ else:
+ catLinks = [category.aslink(noInterwiki=True) for category in categories]
+
if insite.category_on_one_line():
sep = ' '
else:
@@ -834,3 +918,19 @@
# Add it to the result
result.append((name, params))
return result
+
+
+def glue_template_and_params(template_and_params):
+ """Return wiki text of template glued from params.
+
+ You can use items from extract_templates_and_params here to get
+ an equivalent template wiki text (it may happen that the order
+ of the params changes).
+ """
+ (template, params) = template_and_params
+
+ text = u''
+ for item in params:
+ text += u'|%s=%s\n' % (item, params[item])
+
+ return u'{{%s\n%s}}' % (template, text)