[Pywikipedia-l] SVN: [6104] branches/rewrite/pywikibot/textlib.py

russblau at svn.wikimedia.org russblau at svn.wikimedia.org
Mon Nov 17 19:48:45 UTC 2008


Revision: 6104
Author:   russblau
Date:     2008-11-17 19:48:45 +0000 (Mon, 17 Nov 2008)

Log Message:
-----------
port of templates_with_parameters method from wikipedia.py

Modified Paths:
--------------
    branches/rewrite/pywikibot/textlib.py

Modified: branches/rewrite/pywikibot/textlib.py
===================================================================
--- branches/rewrite/pywikibot/textlib.py	2008-11-16 11:53:14 UTC (rev 6103)
+++ branches/rewrite/pywikibot/textlib.py	2008-11-17 19:48:45 UTC (rev 6104)
@@ -564,3 +564,110 @@
     linkR = re.compile(regex)
     return linkR
 
+def extract_templates_and_params(text, get_redirect=False):
+    """Return list of template calls found in text.
+
+    Return value is a list of tuples. There is one tuple for each use of a
+    template in the page, with the template title as the first entry and a
+    dict of parameters as the second entry.  Positional parameters are
+    indexed by an int (from 1), named parameters by a str.  Calls whose name
+    holds a nested template or math are skipped; get_redirect is unused.
+
+    """
+    # remove commented-out stuff etc.
+    thistxt = removeDisabledParts(text)
+
+    # marker for inside templates or parameters
+    marker = u'@@'
+    while marker in thistxt:
+        marker += u'@'
+
+    # marker for links
+    marker2 = u'##'
+    while marker2 in thistxt:
+        marker2 += u'#'
+
+    # marker for math; lengthen marker3 itself until it is absent from text
+    marker3 = u'%%'
+    while marker3 in thistxt:
+        marker3 += u'%'
+
+    result = []
+    inside = {}
+    Rtemplate = re.compile(
+                ur'{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
+    Rlink = re.compile(ur'\[\[[^\]]+\]\]')
+    Rmath = re.compile(ur'<math>[^<]+</math>')
+    Rmarker = re.compile(ur'%s(\d+)%s' % (marker, marker))
+    Rmarker2 = re.compile(ur'%s(\d+)%s' % (marker2, marker2))
+    Rmarker3 = re.compile(ur'%s(\d+)%s' % (marker3, marker3))
+
+    # Replace math with markers
+    maths = {}
+    count = 0
+    for m in Rmath.finditer(thistxt):
+        count += 1
+        found = m.group()
+        thistxt = thistxt.replace(found, '%s%d%s' % (marker3, count, marker3))
+        maths[count] = found
+    # Innermost calls match first ([^{] bars nesting); loop until none remain
+    while Rtemplate.search(thistxt) is not None:
+        for m in Rtemplate.finditer(thistxt):
+            # Make sure it is not detected again
+            count += 1
+            call = m.group()
+            thistxt = thistxt.replace(call,
+                                      '%s%d%s' % (marker, count, marker))
+            # Make sure stored templates don't contain markers
+            for m2 in Rmarker.finditer(call):
+                call = call.replace(m2.group(), inside[int(m2.group(1))])
+            for m2 in Rmarker3.finditer(call):
+                call = call.replace(m2.group(), maths[int(m2.group(1))])
+            inside[count] = call
+
+            # Name
+            name = m.group('name').strip()
+            m2 = Rmarker.search(name) or Rmarker3.search(name)
+            if m2 is not None:
+                # Doesn't detect templates whose name changes, or whose name
+                # contains math (already replaced by marker3 at this point)
+                continue
+            # Parameters
+            paramString = m.group('params')
+            params = {}
+            numbered_param = 1
+            if paramString:
+                # Replace links to markers
+                links = {}
+                count2 = 0
+                for m2 in Rlink.finditer(paramString):
+                    count2 += 1
+                    link = m2.group()
+                    paramString = paramString.replace(link,
+                                    '%s%d%s' % (marker2, count2, marker2))
+                    links[count2] = link
+                # Parse string
+                markedParams = paramString.split('|')
+                # Replace markers
+                for param in markedParams:
+                    if "=" in param:
+                        param_name, param_val = param.split("=", 1)
+                    else:
+                        param_name = numbered_param
+                        param_val = param
+                        numbered_param += 1
+                    for m2 in Rmarker.finditer(param_val):
+                        param_val = param_val.replace(m2.group(),
+                                                      inside[int(m2.group(1))])
+                    for m2 in Rmarker2.finditer(param_val):
+                        param_val = param_val.replace(m2.group(),
+                                                      links[int(m2.group(1))])
+                    for m2 in Rmarker3.finditer(param_val):
+                        param_val = param_val.replace(m2.group(),
+                                                      maths[int(m2.group(1))])
+                    params[param_name] = param_val
+
+            # Add it to the result
+            result.append((name, params))
+    return result
+





More information about the Pywikipedia-l mailing list