[Pywikipedia-l] SVN: [5750] trunk/pywikipedia/reflinks.py
nicdumz at svn.wikimedia.org
nicdumz at svn.wikimedia.org
Thu Jul 24 12:50:39 UTC 2008
Revision: 5750
Author: nicdumz
Date: 2008-07-24 12:50:38 +0000 (Thu, 24 Jul 2008)
Log Message:
-----------
* Several regex fixes (handle unquoted reference names)
* Better variable names to make the script easier to understand
* Add support for quoted/unquoted ref names. Quotes are not added when the original ref name was not wrapped in quotes
* Detect when a reference name is used with several different references, and remember it so that a single reference per name can be kept later
Modified Paths:
--------------
trunk/pywikipedia/reflinks.py
Modified: trunk/pywikipedia/reflinks.py
===================================================================
--- trunk/pywikipedia/reflinks.py 2008-07-23 16:59:16 UTC (rev 5749)
+++ trunk/pywikipedia/reflinks.py 2008-07-24 12:50:38 UTC (rev 5750)
@@ -229,18 +229,18 @@
"""
def __init__(self):
# Match references
- self.REFS = re.compile(u'(?i)<ref(?P<name>[^>/]*)>(?P<content>.*?)</ref>')
- self.NAMES = re.compile(u'(?i).*name\s*=\s*(?P<quote>"?)\s*(?P<name>.*?)\s*(?P=quote).*')
- self.GROUPS = re.compile(u'(?i).*group\s*=\s*(?P<quote>"?)\s*(?P<group>.*?)\s*(?P=quote).*')
+ self.REFS = re.compile(u'(?i)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>')
+ self.NAMES = re.compile(u'(?i).*name\s*=\s*(?P<quote>"?)\s*(?P<name>.+)\s*(?P=quote).*')
+ self.GROUPS = re.compile(u'(?i).*group\s*=\s*(?P<quote>"?)\s*(?P<group>.+)\s*(?P=quote).*')
def process(self, text):
# keys are ref groups
# values are a dict where :
# keys are ref content
- # values are [name, [list of full ref matches]]
+ # values are [name, [list of full ref matches], quoted, need_to_change]
foundRefs = {}
- foundRefNames = []
- # Replace key by value
+ foundRefNames = {}
+ # Replace key by [value, quoted]
namedRepl = {}
for match in self.REFS.finditer(text):
@@ -248,8 +248,8 @@
if not content.strip():
continue
- name = match.group('name')
- group = self.GROUPS.match(name)
+ params = match.group('params')
+ group = self.GROUPS.match(params)
if not foundRefs.has_key(group):
foundRefs[group] = {}
@@ -258,19 +258,34 @@
v = groupdict[content]
v[1].append(match.group())
else:
- v = [None, [match.group()]]
- n = self.NAMES.match(name)
- if n:
- n = n.group('name')
+ v = [None, [match.group()], False, False]
+ name = self.NAMES.match(params)
+ if name:
+ quoted = name.group('quote') == '"'
+ name = name.group('name')
if v[0]:
- namedRepl[n] = v[0]
+ if v[0] != name:
+ namedRepl[name] = [v[0], v[2]]
else:
- v[0] = n
- foundRefNames.append(n)
+ #First name associated with this content
+
+ if name == 'population':
+ wikipedia.output(content)
+ if not foundRefNames.has_key(name):
+ # first time ever we meet this name
+ if name == 'population':
+ print "in"
+ v[2] = quoted
+ v[0] = name
+ else:
+ # if has_key, means that this name is used
+ # with another content. We'll need to change it
+ v[3] = True
+ foundRefNames[name] = 1
groupdict[content] = v
id = 1
- while 'autogenerated%s' % id in foundRefNames:
+ while foundRefNames.has_key('autogenerated%s' % id):
id += 1
for (g, d) in foundRefs.iteritems():
if g:
@@ -279,13 +294,15 @@
group = ""
for (k, v) in d.iteritems():
- if len(v[1]) == 1:
+ if len(v[1]) == 1 and not v[3]:
continue
name = v[0]
if not name:
name = 'autogenerated%s' % id
id += 1
- named = u'<ref %sname="%s">%s</ref>' % (group, name, k)
+ elif v[2]:
+ name = u'"%s"' % name
+ named = u'<ref %sname=%s>%s</ref>' % (group, name, k)
text = text.replace(v[1][0], named, 1)
# make sure that the first (named ref) is not
@@ -294,14 +311,17 @@
header = text[:pos]
end = text[pos:]
- unnamed = u'<ref %sname="%s" />' % (group, name)
+ unnamed = u'<ref %sname=%s />' % (group, name)
for ref in v[1][1:]:
end = end.replace(ref, unnamed)
text = header + end
for (k,v) in namedRepl.iteritems():
# TODO : Support ref groups
- text = re.sub(u'<ref name\s*=\s*(?P<quote>"?)\s*%s\s*(?P=quote)\s*/>' % k, u'<ref name="%s" />' % v, text)
+ name = v[0]
+ if v[1]:
+ name = u'"%s"' % name
+ text = re.sub(u'<ref name\s*=\s*(?P<quote>"?)\s*%s\s*(?P=quote)\s*/>' % k, u'<ref name=%s />' % name, text)
return text
class ReferencesRobot:
More information about the Pywikipedia-l
mailing list