Revision: 5750 Author: nicdumz Date: 2008-07-24 12:50:38 +0000 (Thu, 24 Jul 2008)
Log Message: ----------- * Several regex fixes (handle unquoted reference names) * Better variable names to make the script easier to understand * Added support for quoted/unquoted ref names: quotes are not added when the original ref name was not wrapped in quotes * Detect when a reference name is used with several different references, and remember it so that one reference per name can eventually be kept
Modified Paths: -------------- trunk/pywikipedia/reflinks.py
Modified: trunk/pywikipedia/reflinks.py =================================================================== --- trunk/pywikipedia/reflinks.py 2008-07-23 16:59:16 UTC (rev 5749) +++ trunk/pywikipedia/reflinks.py 2008-07-24 12:50:38 UTC (rev 5750) @@ -229,18 +229,18 @@ """ def __init__(self): # Match references - self.REFS = re.compile(u'(?i)<ref(?P<name>[^>/]*)>(?P<content>.*?)</ref>') - self.NAMES = re.compile(u'(?i).*name\s*=\s*(?P<quote>"?)\s*(?P<name>.*?)\s*(?P=quote).*') - self.GROUPS = re.compile(u'(?i).*group\s*=\s*(?P<quote>"?)\s*(?P<group>.*?)\s*(?P=quote).*') + self.REFS = re.compile(u'(?i)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>') + self.NAMES = re.compile(u'(?i).*name\s*=\s*(?P<quote>"?)\s*(?P<name>.+)\s*(?P=quote).*') + self.GROUPS = re.compile(u'(?i).*group\s*=\s*(?P<quote>"?)\s*(?P<group>.+)\s*(?P=quote).*')
def process(self, text): # keys are ref groups # values are a dict where : # keys are ref content - # values are [name, [list of full ref matches]] + # values are [name, [list of full ref matches], quoted, need_to_change] foundRefs = {} - foundRefNames = [] - # Replace key by value + foundRefNames = {} + # Replace key by [value, quoted] namedRepl = {}
for match in self.REFS.finditer(text): @@ -248,8 +248,8 @@ if not content.strip(): continue
- name = match.group('name') - group = self.GROUPS.match(name) + params = match.group('params') + group = self.GROUPS.match(params) if not foundRefs.has_key(group): foundRefs[group] = {}
@@ -258,19 +258,34 @@ v = groupdict[content] v[1].append(match.group()) else: - v = [None, [match.group()]] - n = self.NAMES.match(name) - if n: - n = n.group('name') + v = [None, [match.group()], False, False] + name = self.NAMES.match(params) + if name: + quoted = name.group('quote') == '"' + name = name.group('name') if v[0]: - namedRepl[n] = v[0] + if v[0] != name: + namedRepl[name] = [v[0], v[2]] else: - v[0] = n - foundRefNames.append(n) + #First name associated with this content + + if name == 'population': + wikipedia.output(content) + if not foundRefNames.has_key(name): + # first time ever we meet this name + if name == 'population': + print "in" + v[2] = quoted + v[0] = name + else: + # if has_key, means that this name is used + # with another content. We'll need to change it + v[3] = True + foundRefNames[name] = 1 groupdict[content] = v
id = 1 - while 'autogenerated%s' % id in foundRefNames: + while foundRefNames.has_key('autogenerated%s' % id): id += 1 for (g, d) in foundRefs.iteritems(): if g: @@ -279,13 +294,15 @@ group = ""
for (k, v) in d.iteritems(): - if len(v[1]) == 1: + if len(v[1]) == 1 and not v[3]: continue name = v[0] if not name: name = 'autogenerated%s' % id id += 1 - named = u'<ref %sname="%s">%s</ref>' % (group, name, k) + elif v[2]: + name = u'"%s"' % name + named = u'<ref %sname=%s>%s</ref>' % (group, name, k) text = text.replace(v[1][0], named, 1)
# make sure that the first (named ref) is not @@ -294,14 +311,17 @@ header = text[:pos] end = text[pos:]
- unnamed = u'<ref %sname="%s" />' % (group, name) + unnamed = u'<ref %sname=%s />' % (group, name) for ref in v[1][1:]: end = end.replace(ref, unnamed) text = header + end
for (k,v) in namedRepl.iteritems(): # TODO : Support ref groups - text = re.sub(u'<ref name\s*=\s*(?P<quote>"?)\s*%s\s*(?P=quote)\s*/>' % k, u'<ref name="%s" />' % v, text) + name = v[0] + if v[1]: + name = u'"%s"' % name + text = re.sub(u'<ref name\s*=\s*(?P<quote>"?)\s*%s\s*(?P=quote)\s*/>' % k, u'<ref name=%s />' % name, text) return text
class ReferencesRobot:
pywikipedia-l@lists.wikimedia.org