http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11335
Revision: 11335
Author: xqt
Date: 2013-04-04 07:29:29 +0000 (Thu, 04 Apr 2013)
Log Message:
-----------
update from trunk r11333
Modified Paths:
--------------
branches/rewrite/pywikibot/textlib.py
Modified: branches/rewrite/pywikibot/textlib.py
===================================================================
--- branches/rewrite/pywikibot/textlib.py 2013-04-04 06:30:35 UTC (rev 11334)
+++ branches/rewrite/pywikibot/textlib.py 2013-04-04 07:29:29 UTC (rev 11335)
@@ -19,6 +19,7 @@
from HTMLParser import HTMLParser
import config2 as config
+TEMP_REGEX = re.compile('{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
def unescape(s):
"""Replace escaped HTML-special characters by their originals"""
@@ -75,14 +76,6 @@
# source code readability.
# TODO: handle nested tables.
'table': re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
- # templates with parameters often have whitespace that is used to
- # improve wiki source code readability.
- # 'template': re.compile(r'(?s){{.*?}}'),
- # The regex above fails on nested templates. This regex can handle
- # templates cascaded up to level 2, but no deeper. For arbitrary
- # depth, we'd need recursion which can't be done in Python's re.
- # After all, the language of correct parenthesis words is not regular.
- 'template': re.compile(r'(?s){{(({{.*?}})?.*?)*}}'),
'hyperlink': compileLinkR(),
'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
# this matches internal wikilinks, but also interwiki, categories, and
@@ -107,12 +100,15 @@
old = re.compile(old)
dontTouchRegexes = []
+ except_templates = False
for exc in exceptions:
if isinstance(exc, basestring):
# assume it's a reference to the exceptionRegexes dictionary
# defined above.
if exc in exceptionRegexes:
dontTouchRegexes.append(exceptionRegexes[exc])
+ elif exc == 'template':
+ except_templates = True
else:
# nowiki, noinclude, includeonly, timeline, math ond other
# extensions
@@ -125,6 +121,35 @@
else:
# assume it's a regular expression
dontTouchRegexes.append(exc)
+
+ # mark templates
+ # don't care about mw variables and parser functions
+ if except_templates:
+ marker1 = findmarker(text)
+ marker2 = findmarker(text, u'##', u'#')
+ Rvalue = re.compile('{{{.+?}}}')
+ Rmarker1 = re.compile('%(mark)s(\d+)%(mark)s' % {'mark': marker1})
+ Rmarker2 = re.compile('%(mark)s(\d+)%(mark)s' % {'mark': marker2})
+ values = {}
+ count = 0
+ for m in Rvalue.finditer(text):
+ count += 1
+ item = m.group()
+ text = text.replace(item, '%s%d%s' % (marker2, count, marker2))
+ values[count] = item
+ inside = {}
+ count = 0
+ while TEMP_REGEX.search(text) is not None:
+ for m in TEMP_REGEX.finditer(text):
+ count += 1
+ item = m.group()
+ text = text.replace(item, '%s%d%s' % (marker1, count, marker1))
+
+ for m2 in Rmarker1.finditer(item):
+ item = item.replace(m2.group(), inside[int(m2.group(1))])
+ for m2 in Rmarker2.finditer(item):
+ item = item.replace(m2.group(), values[int(m2.group(1))])
+ inside[count] = item
index = 0
markerpos = len(text)
while True:
@@ -194,6 +219,12 @@
index = match.start() + len(replacement)
markerpos = match.start() + len(replacement)
text = text[:markerpos] + marker + text[markerpos:]
+
+ if except_templates: # restore templates from dict
+ for m2 in Rmarker1.finditer(text):
+ text = text.replace(m2.group(), inside[int(m2.group(1))])
+ for m2 in Rmarker2.finditer(text):
+ text = text.replace(m2.group(), values[int(m2.group(1))])
return text
@@ -831,7 +862,7 @@
#----------------------------------
def extract_templates_and_params(text):
- """Return list of template calls found in text.
+ """Return a list of templates found in text.
Return value is a list of tuples. There is one tuple for each use of a
template in the page, with the template title as the first entry and a
@@ -840,6 +871,8 @@
with an integer value corresponding to its position among the unnnamed
parameters, and if this results multiple parameters with the same name
only the last value provided will be returned.
+ @param text: The wikitext from which templates are extracted
+ @type text: unicode or string
"""
# remove commented-out stuff etc.
@@ -858,8 +891,6 @@
marker4 = findmarker(thistxt, u'§§', u'§')
result = []
- Rtemplate = re.compile(
- ur'{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
Rmath = re.compile(ur'<math>[^<]+</math>')
Rvalue = re.compile(r'{{{.+?}}}')
Rmarker = re.compile(ur'%s(\d+)%s' % (marker, marker))
@@ -886,8 +917,8 @@
inside = {}
count = 0
- while Rtemplate.search(thistxt) is not None:
- for m in Rtemplate.finditer(thistxt):
+ while TEMP_REGEX.search(thistxt) is not None:
+ for m in TEMP_REGEX.finditer(thistxt):
# Make sure it is not detected again
count += 1
text = m.group()
@@ -909,6 +940,35 @@
# Doesn't detect templates whose name changes,
# or templates whose name contains math tags
continue
+
+ # {{#if: }}
+ if name.startswith('#'):
+ continue
+
+## TODO: merged from wikipedia.py - implement the following
+## if self.site().isInterwikiLink(name):
+## continue
+## # {{DEFAULTSORT:...}}
+## defaultKeys = self.site().versionnumber() > 13 and \
+## self.site().getmagicwords('defaultsort')
+## # It seems some wikis does not have this magic key
+## if defaultKeys:
+## found = False
+## for key in defaultKeys:
+## if name.startswith(key):
+## found = True
+## break
+## if found: continue
+##
+## try:
+## name = Page(self.site(), name).title()
+## except InvalidTitle:
+## if name:
+## output(
+## u"Page %s contains invalid template name {{%s}}."
+## % (self.title(), name.strip()))
+## continue
+
# Parameters
paramString = m.group('params')
params = {}
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11333
Revision: 11333
Author: xqt
Date: 2013-04-04 05:54:15 +0000 (Thu, 04 Apr 2013)
Log Message:
-----------
enable nested templates handling for textlib.replaceExcept()
The old implementation could only handle templates cascaded up to level 2, and in some circumstances it fell into an infinite loop.
Now we use code similar to that of textlib.extract_templates_and_params() (or templatesWithParams(), respectively) to hide and restore the templates.
MediaWiki variables and parser functions are handled as templates.
Bugfix for bug #3603994, bug #2819291, bug #3158761
Modified Paths:
--------------
trunk/pywikipedia/pywikibot/textlib.py
Modified: trunk/pywikipedia/pywikibot/textlib.py
===================================================================
--- trunk/pywikipedia/pywikibot/textlib.py 2013-04-03 22:39:10 UTC (rev 11332)
+++ trunk/pywikipedia/pywikibot/textlib.py 2013-04-04 05:54:15 UTC (rev 11333)
@@ -19,6 +19,7 @@
from HTMLParser import HTMLParser
import config
+TEMP_REGEX = re.compile('{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
def unescape(s):
"""Replace escaped HTML-special characters by their originals"""
@@ -75,14 +76,6 @@
# source code readability.
# TODO: handle nested tables.
'table': re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
- # templates with parameters often have whitespace that is used to
- # improve wiki source code readability.
- # 'template': re.compile(r'(?s){{.*?}}'),
- # The regex above fails on nested templates. This regex can handle
- # templates cascaded up to level 2, but no deeper. For arbitrary
- # depth, we'd need recursion which can't be done in Python's re.
- # After all, the language of correct parenthesis words is not regular.
- 'template': re.compile(r'(?s){{(({{.*?}})?.*?)*}}'),
'hyperlink': compileLinkR(),
'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
# this matches internal wikilinks, but also interwiki, categories, and
@@ -107,12 +100,15 @@
old = re.compile(old)
dontTouchRegexes = []
+ except_templates = False
for exc in exceptions:
if isinstance(exc, basestring):
# assume it's a reference to the exceptionRegexes dictionary
# defined above.
if exc in exceptionRegexes:
dontTouchRegexes.append(exceptionRegexes[exc])
+ elif exc == 'template':
+ except_templates = True
else:
# nowiki, noinclude, includeonly, timeline, math ond other
# extensions
@@ -125,6 +121,35 @@
else:
# assume it's a regular expression
dontTouchRegexes.append(exc)
+
+ # mark templates
+ # don't care about mw variables and parser functions
+ if except_templates:
+ marker1 = findmarker(text)
+ marker2 = findmarker(text, u'##', u'#')
+ Rvalue = re.compile('{{{.+?}}}')
+ Rmarker1 = re.compile('%(mark)s(\d+)%(mark)s' % {'mark': marker1})
+ Rmarker2 = re.compile('%(mark)s(\d+)%(mark)s' % {'mark': marker2})
+ values = {}
+ count = 0
+ for m in Rvalue.finditer(text):
+ count += 1
+ item = m.group()
+ text = text.replace(item, '%s%d%s' % (marker2, count, marker2))
+ values[count] = item
+ inside = {}
+ count = 0
+ while TEMP_REGEX.search(text) is not None:
+ for m in TEMP_REGEX.finditer(text):
+ count += 1
+ item = m.group()
+ text = text.replace(item, '%s%d%s' % (marker1, count, marker1))
+
+ for m2 in Rmarker1.finditer(item):
+ item = item.replace(m2.group(), inside[int(m2.group(1))])
+ for m2 in Rmarker2.finditer(item):
+ item = item.replace(m2.group(), values[int(m2.group(1))])
+ inside[count] = item
index = 0
markerpos = len(text)
while True:
@@ -194,6 +219,12 @@
index = match.start() + len(replacement)
markerpos = match.start() + len(replacement)
text = text[:markerpos] + marker + text[markerpos:]
+
+ if except_templates: # restore templates from dict
+ for m2 in Rmarker1.finditer(text):
+ text = text.replace(m2.group(), inside[int(m2.group(1))])
+ for m2 in Rmarker2.finditer(text):
+ text = text.replace(m2.group(), values[int(m2.group(1))])
return text
@@ -863,8 +894,6 @@
marker4 = findmarker(thistxt, u'§§', u'§')
result = []
- Rtemplate = re.compile(
- ur'{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
Rmath = re.compile(ur'<math>[^<]+</math>')
Rvalue = re.compile(r'{{{.+?}}}')
Rmarker = re.compile(ur'%s(\d+)%s' % (marker, marker))
@@ -891,8 +920,8 @@
inside = {}
count = 0
- while Rtemplate.search(thistxt) is not None:
- for m in Rtemplate.finditer(thistxt):
+ while TEMP_REGEX.search(thistxt) is not None:
+ for m in TEMP_REGEX.finditer(thistxt):
# Make sure it is not detected again
count += 1
text = m.group()
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11331
Revision: 11331
Author: legoktm
Date: 2013-04-03 20:03:54 +0000 (Wed, 03 Apr 2013)
Log Message:
-----------
Check that the api_secret key is set before checking its value
Modified Paths:
--------------
trunk/pywikipedia/flickrripper.py
Modified: trunk/pywikipedia/flickrripper.py
===================================================================
--- trunk/pywikipedia/flickrripper.py 2013-04-03 20:01:45 UTC (rev 11330)
+++ trunk/pywikipedia/flickrripper.py 2013-04-03 20:03:54 UTC (rev 11331)
@@ -501,7 +501,7 @@
'Any flickr user can get a key at http://www.flickr.com/services/api/keys/apply/')
return
- if config.flickr['api_secret']:
+ if 'api_secret' in config.flickr and config.flickr['api_secret']:
flickr = flickrapi.FlickrAPI(config.flickr['api_key'], config.flickr['api_secret'])
(token, frob) = flickr.get_token_part_one(perms='read')
if not token: # The user still hasn't authorised this app yet, get_token_part_one() will have spawn a browser window
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11329
Revision: 11329
Author: xqt
Date: 2013-04-03 19:59:51 +0000 (Wed, 03 Apr 2013)
Log Message:
-----------
return dictionary as default for extract_templates_and_params()
Modified Paths:
--------------
trunk/pywikipedia/pywikibot/textlib.py
Modified: trunk/pywikipedia/pywikibot/textlib.py
===================================================================
--- trunk/pywikipedia/pywikibot/textlib.py 2013-04-03 19:53:22 UTC (rev 11328)
+++ trunk/pywikipedia/pywikibot/textlib.py 2013-04-03 19:59:51 UTC (rev 11329)
@@ -828,14 +828,14 @@
# Functions dealing with templates
#----------------------------------
-def extract_templates_and_params(text, asDict=False):
+def extract_templates_and_params(text, asList=False):
"""Return a list of templates found in text.
Return value is a list of tuples. There is one tuple for each use of a
template in the page, with the template title as the first entry and
either a list of parameters or a dict of parameters as the second entry
- which depends on asDict method parameter.
- If asDict is True the parameters is a dict, and they are indexed by strings;
+ which depends on asList method parameter.
+ If asList is False the parameters is a dict, and they are indexed by strings;
as in MediaWiki, an unnamed parameter is given a parameter name with an
integer value corresponding to its position among the unnamed parameters,
and if this results multiple parameters with the same name, only the last
@@ -843,8 +843,8 @@
@param text: The wikitext from which templates are extracted
@type text: unicode or string
- @param asDict: If True, return parameters as list, else as dict
- @type asDict: bool
+ @param asList: If True, return parameters as list, else as dict
+ @type asList: bool
"""
# remove commented-out stuff etc.
@@ -961,7 +961,7 @@
markedParams = paramString.split('|')
# Replace markers
for param in markedParams:
- if asDict and "=" in param:
+ if not asList and "=" in param:
param_name, param_val = param.split("=", 1)
else:
param_name = unicode(numbered_param)
@@ -982,10 +982,10 @@
params[param_name.strip()] = param_val.strip()
# Add it to the result
- if asDict:
+ if asList:
+ result.append((name, params.values()))
+ else:
result.append((name, params))
- else:
- result.append((name, params.values()))
return result