[Pywikipedia-l] SVN: [5750] trunk/pywikipedia/reflinks.py

nicdumz at svn.wikimedia.org nicdumz at svn.wikimedia.org
Thu Jul 24 12:50:39 UTC 2008


Revision: 5750
Author:   nicdumz
Date:     2008-07-24 12:50:38 +0000 (Thu, 24 Jul 2008)

Log Message:
-----------
* Several regex fixes (handle unquoted reference names)
* Better variable names to make the script easier to understand
* Adding support for quoted/unquoted ref names. Will not add quotes when there were no quotes wrapping the ref name
* Detect when a reference name is used with several different references, and remember it to eventually keep one reference per name

Modified Paths:
--------------
    trunk/pywikipedia/reflinks.py

Modified: trunk/pywikipedia/reflinks.py
===================================================================
--- trunk/pywikipedia/reflinks.py	2008-07-23 16:59:16 UTC (rev 5749)
+++ trunk/pywikipedia/reflinks.py	2008-07-24 12:50:38 UTC (rev 5750)
@@ -229,18 +229,18 @@
     """
     def __init__(self):
         # Match references
-        self.REFS = re.compile(u'(?i)<ref(?P<name>[^>/]*)>(?P<content>.*?)</ref>')
-        self.NAMES = re.compile(u'(?i).*name\s*=\s*(?P<quote>"?)\s*(?P<name>.*?)\s*(?P=quote).*')
-        self.GROUPS = re.compile(u'(?i).*group\s*=\s*(?P<quote>"?)\s*(?P<group>.*?)\s*(?P=quote).*')
+        self.REFS = re.compile(u'(?i)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>')
+        self.NAMES = re.compile(u'(?i).*name\s*=\s*(?P<quote>"?)\s*(?P<name>.+)\s*(?P=quote).*')
+        self.GROUPS = re.compile(u'(?i).*group\s*=\s*(?P<quote>"?)\s*(?P<group>.+)\s*(?P=quote).*')
 
     def process(self, text):
         # keys are ref groups
         # values are a dict where :
         #   keys are ref content
-        #   values are [name, [list of full ref matches]]
+        #   values are [name, [list of full ref matches], quoted, need_to_change]
         foundRefs = {}
-        foundRefNames = []
-        # Replace key by value
+        foundRefNames = {}
+        # Replace key by [value, quoted]
         namedRepl = {}
 
         for match in self.REFS.finditer(text):
@@ -248,8 +248,8 @@
             if not content.strip():
                 continue
 
-            name = match.group('name')
-            group = self.GROUPS.match(name)
+            params = match.group('params')
+            group = self.GROUPS.match(params)
             if not foundRefs.has_key(group):
                 foundRefs[group] = {}
 
@@ -258,19 +258,34 @@
                 v = groupdict[content]
                 v[1].append(match.group())
             else:
-                v = [None, [match.group()]]
-            n = self.NAMES.match(name)
-            if n:
-                n = n.group('name')
+                v = [None, [match.group()], False, False]
+            name = self.NAMES.match(params)
+            if name:
+                quoted = name.group('quote') == '"'
+                name = name.group('name')
                 if v[0]:
-                    namedRepl[n] = v[0]
+                    if v[0] != name:
+                        namedRepl[name] = [v[0], v[2]]
                 else:
-                    v[0] = n
-                foundRefNames.append(n)
+                    #First name associated with this content
+
+                    if name == 'population':
+                        wikipedia.output(content)
+                    if not foundRefNames.has_key(name):
+                        # first time ever we meet this name
+                        if name == 'population':
+                            print "in"
+                        v[2] = quoted
+                        v[0] = name
+                    else:
+                        # if has_key, means that this name is used
+                        # with another content. We'll need to change it
+                        v[3] = True
+                foundRefNames[name] = 1
             groupdict[content] = v
     
         id = 1
-        while 'autogenerated%s' % id in foundRefNames:
+        while foundRefNames.has_key('autogenerated%s' % id):
             id += 1
         for (g, d) in foundRefs.iteritems():
             if g:
@@ -279,13 +294,15 @@
                 group = ""
 
             for (k, v) in d.iteritems():
-                if len(v[1]) == 1:
+                if len(v[1]) == 1 and not v[3]:
                     continue
                 name = v[0]
                 if not name:
                     name = 'autogenerated%s' % id
                     id += 1
-                named = u'<ref %sname="%s">%s</ref>' % (group, name, k)
+                elif v[2]:
+                    name = u'"%s"' % name
+                named = u'<ref %sname=%s>%s</ref>' % (group, name, k)
                 text = text.replace(v[1][0], named, 1)
     
                 # make sure that the first (named ref) is not
@@ -294,14 +311,17 @@
                 header = text[:pos]
                 end = text[pos:]
     
-                unnamed = u'<ref %sname="%s" />' % (group, name)
+                unnamed = u'<ref %sname=%s />' % (group, name)
                 for ref in v[1][1:]:
                     end = end.replace(ref, unnamed)
                 text = header + end 
         
         for (k,v) in namedRepl.iteritems():
             # TODO : Support ref groups
-            text = re.sub(u'<ref name\s*=\s*(?P<quote>"?)\s*%s\s*(?P=quote)\s*/>' % k, u'<ref name="%s" />' % v, text)
+            name = v[0]
+            if v[1]:
+                name = u'"%s"' % name
+            text = re.sub(u'<ref name\s*=\s*(?P<quote>"?)\s*%s\s*(?P=quote)\s*/>' % k, u'<ref name=%s />' % name, text)
         return text        
 
 class ReferencesRobot:





More information about the Pywikipedia-l mailing list