Revision: 5705 Author: nicdumz Date: 2008-07-10 05:43:04 +0000 (Thu, 10 Jul 2008)
Log Message: ----------- * Adding experimental feature : Duplicate references check * Minor fixes
Modified Paths: -------------- trunk/pywikipedia/reflinks.py
Modified: trunk/pywikipedia/reflinks.py =================================================================== --- trunk/pywikipedia/reflinks.py 2008-07-10 05:18:43 UTC (rev 5704) +++ trunk/pywikipedia/reflinks.py 2008-07-10 05:43:04 UTC (rev 5705) @@ -125,14 +125,13 @@ """Xml generator that yields pages containing bare references"""
def __init__(self, xmlFilename, xmlStart, namespaces): - self.xmlFilename = xmlFilename self.xmlStart = xmlStart self.namespaces = namespaces self.skipping = bool(xmlStart) self.site = wikipedia.getSite()
import xmlreader - dump = xmlreader.XmlDump(self.xmlFilename) + dump = xmlreader.XmlDump(xmlFilename) self.parser = dump.parse()
def __iter__(self): @@ -223,6 +222,63 @@ if float(nb_upper)/(nb_letter+1) > .70: self.title = self.title.title()
+class DuplicateReferences: + """ + When some references are duplicated in an article, + name the first, and remove the content of the others + """ + def __init__(self): + # Match references + self.REFS = re.compile(u'(?i)<ref(?P<name>[^>]*)>(?P<content>.*?)</ref>') + self.NAMES = re.compile(u'(?i)\s*name\s*=\s*(?P<quote>"?)\s*(?P<name>.*?)\s*(?P=quote)\s*') + + def process(self, text): + # key are ref contents + # values are [name, [list of full ref matches]] + foundRefs = {} + foundRefNames = [] + + for match in self.REFS.finditer(text): + content = match.group('content') + name = match.group('name') + if foundRefs.has_key(content): + v = foundRefs[content] + v[1].append(match.group()) + else: + v = [None, [match.group()]] + if not v[0]: + n = self.NAMES.match(name) + if n: + n = n.group('name') + v[0] = n + foundRefNames.append(n) + foundRefs[content] = v + + id = 1 + while 'autogenerated%s' % id in foundRefNames: + id += 1 + for (k, v) in foundRefs.iteritems(): + if len(v[1]) == 1: + continue + name = v[0] + if not name: + name = 'autogenerated%s' % id + id += 1 + named = u'<ref name="%s">%s</ref>' % (name, k) + text = text.replace(v[1][0], named, 1) + + # make sure that the first (named ref) is not + # removed later : + pos = text.index(named) + len(named) + header = text[:pos] + end = text[pos:] + + unnamed = u'<ref name="%s" />' % name + for ref in v[1][1:]: + end = end.replace(ref, unnamed) + text = header + end + return text + class ReferencesRobot: def __init__(self, generator, acceptall = False, limit = None, ignorepdf = False ): """ @@ -243,6 +299,8 @@ re.I | re.S | re.X) self.norefbot = noreferences.NoReferencesBot(None)
+ self.deduplicator = DuplicateReferences() + try : self.stopPageRevId = wikipedia.Page(self.site, self.stopPage).latestRevision() @@ -262,8 +320,7 @@ self.NON_HTML = re.compile(ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<![CDATA[.*?]]>')
# Authorized mime types for HTML pages - str = ur'application/(?:xhtml+xml|xml)|text/(?:ht|x)ml' - self.MIME = re.compile(str) + self.MIME = re.compile(ur'application/(?:xhtml+xml|xml)|text/(?:ht|x)ml')
def put_page(self, page, new): """ @@ -534,15 +591,18 @@ repl = ref.refTitle() new_text = new_text.replace(match.group(), repl)
+ # Add <references/> when needed, but ignore templates ! + if page.namespace != 10: + if self.norefbot.lacksReferences(new_text, verbose=False): + new_text = self.norefbot.addReferences(new_text) + + new_text = self.deduplicator.process(new_text) + if new_text == page.get(): wikipedia.output('No changes were necessary in %s' % page.aslink()) continue
- # Add <references/> when needed, but ignore templates ! - if page.namespace != 10: - if self.norefbot.lacksReferences(new_text, verbose=False): - new_text = self.norefbot.addReferences(new_text) editedpages += 1 self.put_page(page, new_text)
pywikipedia-l@lists.wikimedia.org