[Pywikipedia-l] SVN: [5705] trunk/pywikipedia/reflinks.py

nicdumz at svn.wikimedia.org
Thu Jul 10 05:43:04 UTC 2008


Revision: 5705
Author:   nicdumz
Date:     2008-07-10 05:43:04 +0000 (Thu, 10 Jul 2008)

Log Message:
-----------
* Adding experimental feature: Duplicate references check
* Minor fixes

Modified Paths:
--------------
    trunk/pywikipedia/reflinks.py

Modified: trunk/pywikipedia/reflinks.py
===================================================================
--- trunk/pywikipedia/reflinks.py	2008-07-10 05:18:43 UTC (rev 5704)
+++ trunk/pywikipedia/reflinks.py	2008-07-10 05:43:04 UTC (rev 5705)
@@ -125,14 +125,13 @@
     """Xml generator that yiels pages containing bare references"""
 
     def __init__(self, xmlFilename, xmlStart, namespaces):
-        self.xmlFilename = xmlFilename
         self.xmlStart = xmlStart
         self.namespaces = namespaces
         self.skipping = bool(xmlStart)
         self.site = wikipedia.getSite()
 
         import xmlreader
-        dump = xmlreader.XmlDump(self.xmlFilename)
+        dump = xmlreader.XmlDump(xmlFilename)
         self.parser = dump.parse()
 
     def __iter__(self):
@@ -223,6 +222,63 @@
         if float(nb_upper)/(nb_letter+1) > .70:
             self.title = self.title.title()
 
+class DuplicateReferences:
+    """
+    When some references are duplicated in an article,
+    name the first, and remove the content of the others
+    """
+    def __init__(self):
+        # Match references
+        self.REFS = re.compile(ur'(?i)<ref(?P<name>[^>]*)>(?P<content>.*?)</ref>')
+        self.NAMES = re.compile(ur'(?i)\s*name\s*=\s*(?P<quote>"?)\s*(?P<name>.*?)\s*(?P=quote)\s*')
+
+    def process(self, text):
+        # keys are ref contents
+        # values are [name, [list of full ref matches]]
+        foundRefs = {}
+        foundRefNames = []
+
+        for match in self.REFS.finditer(text):
+            content = match.group('content')
+            name = match.group('name')
+            if foundRefs.has_key(content):
+                v = foundRefs[content]
+                v[1].append(match.group())
+            else:
+                v = [None, [match.group()]]
+            if not v[0]:
+                n = self.NAMES.match(name)
+                if n:
+                    n = n.group('name')
+                    v[0] = n
+                    foundRefNames.append(n)
+            foundRefs[content] = v
+
+        id = 1
+        while 'autogenerated%s' % id in foundRefNames:
+            id += 1
+        for (k, v) in foundRefs.iteritems():
+            if len(v[1]) == 1:
+                continue
+            name = v[0]
+            if not name:
+                name = 'autogenerated%s' % id
+                id += 1
+            named = u'<ref name="%s">%s</ref>' % (name, k)
+            text = text.replace(v[1][0], named, 1)
+
+            # make sure that the first (now named) ref is not
+            # itself replaced below:
+            pos = text.index(named) + len(named)
+            header = text[:pos]
+            end = text[pos:]
+
+            unnamed = u'<ref name="%s" />' % name
+            for ref in v[1][1:]:
+                end = end.replace(ref, unnamed)
+            text = header + end
+        return text
+
 class ReferencesRobot:
     def __init__(self, generator, acceptall = False, limit = None, ignorepdf = False ):
         """
@@ -243,6 +299,8 @@
                                  re.I | re.S | re.X)
         self.norefbot = noreferences.NoReferencesBot(None)
 
+        self.deduplicator = DuplicateReferences()
+
         try :
             self.stopPageRevId = wikipedia.Page(self.site, 
                                                 self.stopPage).latestRevision()
@@ -262,8 +320,7 @@
         self.NON_HTML = re.compile(ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>')
 
         # Authorized mime types for HTML pages
-        str = ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml'
-        self.MIME = re.compile(str)
+        self.MIME = re.compile(ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
  
     def put_page(self, page, new):
         """
@@ -534,15 +591,18 @@
                 repl = ref.refTitle()
                 new_text = new_text.replace(match.group(), repl)
 
+            # Add <references/> when needed, but ignore templates!
+            if page.namespace() != 10:
+                if self.norefbot.lacksReferences(new_text, verbose=False):
+                    new_text = self.norefbot.addReferences(new_text)
+
+            new_text = self.deduplicator.process(new_text)
+
             if new_text == page.get():
                 wikipedia.output('No changes were necessary in %s' 
                                  % page.aslink())
                 continue
 
-            # Add <references/> when needed, but ignore templates !
-            if page.namespace != 10:
-                if self.norefbot.lacksReferences(new_text, verbose=False):
-                    new_text = self.norefbot.addReferences(new_text)
             editedpages += 1
             self.put_page(page, new_text)
 
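A quick illustration of what the new DuplicateReferences pass does, for
anyone who wants to try it in isolation. This is only a sketch: it assumes
the framework is configured so that reflinks imports cleanly, and the
sample wikitext is made up.

    # -*- coding: utf-8 -*-
    # Collapse duplicated <ref>s: the first occurrence keeps (or gets)
    # a name, later identical ones become self-closing references.
    from reflinks import DuplicateReferences

    text = (u'Foo.<ref>Smith 2007, p. 1</ref> '
            u'Bar.<ref name="smith">Smith 2007, p. 1</ref> '
            u'Baz.<ref>Smith 2007, p. 1</ref>')

    print DuplicateReferences().process(text)

This should print something like the following (the existing "smith" name
is reused; if none of the duplicates were named, an unused "autogeneratedN"
name would be minted instead):

    Foo.<ref name="smith">Smith 2007, p. 1</ref> Bar.<ref name="smith" /> Baz.<ref name="smith" />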

More information about the Pywikipedia-l mailing list