[Pywikipedia-l] SVN: [5705] trunk/pywikipedia/reflinks.py
nicdumz at svn.wikimedia.org
Thu Jul 10 05:43:04 UTC 2008
Revision: 5705
Author: nicdumz
Date: 2008-07-10 05:43:04 +0000 (Thu, 10 Jul 2008)
Log Message:
-----------
* Adding experimental feature: Duplicate references check
* Minor fixes
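
In short, the new check gives the first of several identical <ref> tags a name
(reusing an existing name when one is present, else an autogenerated one) and
collapses the later duplicates into self-closing named refs. A minimal sketch of
the intended transformation, with invented article text:

    text = u'Foo.<ref>http://example.com</ref> Bar.<ref>http://example.com</ref>'
    # After DuplicateReferences().process(text), roughly:
    # u'Foo.<ref name="autogenerated1">http://example.com</ref> Bar.<ref name="autogenerated1" />'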
Modified Paths:
--------------
trunk/pywikipedia/reflinks.py
Modified: trunk/pywikipedia/reflinks.py
===================================================================
--- trunk/pywikipedia/reflinks.py 2008-07-10 05:18:43 UTC (rev 5704)
+++ trunk/pywikipedia/reflinks.py 2008-07-10 05:43:04 UTC (rev 5705)
@@ -125,14 +125,13 @@
"""Xml generator that yiels pages containing bare references"""
def __init__(self, xmlFilename, xmlStart, namespaces):
- self.xmlFilename = xmlFilename
self.xmlStart = xmlStart
self.namespaces = namespaces
self.skipping = bool(xmlStart)
self.site = wikipedia.getSite()
import xmlreader
- dump = xmlreader.XmlDump(self.xmlFilename)
+ dump = xmlreader.XmlDump(xmlFilename)
self.parser = dump.parse()
def __iter__(self):
@@ -223,6 +222,63 @@
if float(nb_upper)/(nb_letter+1) > .70:
self.title = self.title.title()
+class DuplicateReferences:
+ """
+ When some references are duplicated in an article,
+ name the first, and remove the content of the others
+ """
+ def __init__(self):
+ # Match references
+ self.REFS = re.compile(u'(?i)<ref(?P<name>[^>]*)>(?P<content>.*?)</ref>')
+ self.NAMES = re.compile(u'(?i)\s*name\s*=\s*(?P<quote>"?)\s*(?P<name>.*?)\s*(?P=quote)\s*')
+
+ def process(self, text):
+ # keys are ref contents
+ # values are [name, [list of full ref matches]]
+ foundRefs = {}
+ foundRefNames = []
+
+ for match in self.REFS.finditer(text):
+ content = match.group('content')
+ name = match.group('name')
+ if foundRefs.has_key(content):
+ v = foundRefs[content]
+ v[1].append(match.group())
+ else:
+ v = [None, [match.group()]]
+ if not v[0]:
+ n = self.NAMES.match(name)
+ if n:
+ n = n.group('name')
+ v[0] = n
+ foundRefNames.append(n)
+ foundRefs[content] = v
+
+ id = 1
+ while 'autogenerated%s' % id in foundRefNames:
+ id += 1
+ for (k, v) in foundRefs.iteritems():
+ if len(v[1]) == 1:
+ continue
+ name = v[0]
+ if not name:
+ name = 'autogenerated%s' % id
+ id += 1
+ named = u'<ref name="%s">%s</ref>' % (name, k)
+ text = text.replace(v[1][0], named, 1)
+
+ # make sure that the first (named) ref is not
+ # removed later:
+ pos = text.index(named) + len(named)
+ header = text[:pos]
+ end = text[pos:]
+
+ unnamed = u'<ref name="%s" />' % name
+ for ref in v[1][1:]:
+ end = end.replace(ref, unnamed)
+ text = header + end
+ return text
+
class ReferencesRobot:
def __init__(self, generator, acceptall = False, limit = None, ignorepdf = False ):
"""
@@ -243,6 +299,8 @@
re.I | re.S | re.X)
self.norefbot = noreferences.NoReferencesBot(None)
+ self.deduplicator = DuplicateReferences()
+
try :
self.stopPageRevId = wikipedia.Page(self.site,
self.stopPage).latestRevision()
@@ -262,8 +320,7 @@
self.NON_HTML = re.compile(ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>')
# Authorized mime types for HTML pages
- str = ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml'
- self.MIME = re.compile(str)
+ self.MIME = re.compile(ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
def put_page(self, page, new):
"""
@@ -534,15 +591,18 @@
repl = ref.refTitle()
new_text = new_text.replace(match.group(), repl)
+ # Add <references/> when needed, but ignore templates!
+ if page.namespace != 10:
+ if self.norefbot.lacksReferences(new_text, verbose=False):
+ new_text = self.norefbot.addReferences(new_text)
+
+ new_text = self.deduplicator.process(new_text)
+
if new_text == page.get():
wikipedia.output('No changes were necessary in %s'
% page.aslink())
continue
- # Add <references/> when needed, but ignore templates !
- if page.namespace != 10:
- if self.norefbot.lacksReferences(new_text, verbose=False):
- new_text = self.norefbot.addReferences(new_text)
editedpages += 1
self.put_page(page, new_text)
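
To try the new class on its own, a quick sketch against this revision's code
(Python 2, as in the rest of the codebase; it assumes reflinks.py and the
pywikipedia framework it imports are on the path, and the sample wikitext is
invented):

    from reflinks import DuplicateReferences

    dedup = DuplicateReferences()
    sample = (u'A fact.<ref name="src">http://example.org</ref>\n'
              u'Same fact again.<ref>http://example.org</ref>')
    print dedup.process(sample)
    # The unnamed duplicate collapses into a self-closing ref
    # that reuses the existing name:
    # A fact.<ref name="src">http://example.org</ref>
    # Same fact again.<ref name="src" />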