Revision: 8130
Author: xqt
Date: 2010-04-26 07:33:31 +0000 (Mon, 26 Apr 2010)
Log Message:
-----------
Fix for the case where getmagicwords('defaultsort') returns None
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2010-04-26 07:17:17 UTC (rev 8129)
+++ trunk/pywikipedia/wikipedia.py 2010-04-26 07:33:31 UTC (rev 8130)
@@ -2354,12 +2354,14 @@
continue
# {{DEFAULTSORT:...}}
defaultKeys = self.site().getmagicwords('defaultsort')
- found = False
- for key in defaultKeys:
- if name.startswith(key):
- found = True
- break
- if found: continue
+ # It seems some wikis does not have this magic key
+ if defaultKeys:
+ found = False
+ for key in defaultKeys:
+ if name.startswith(key):
+ found = True
+ break
+ if found: continue
try:
name = Page(self.site(), name).title()
Revision: 8123
Author: xqt
Date: 2010-04-23 13:31:31 +0000 (Fri, 23 Apr 2010)
Log Message:
-----------
revert r4245 (doesn't work for me) and handle nested templates up to level 2; update rewrite from trunk
Modified Paths:
--------------
branches/rewrite/pywikibot/textlib.py
trunk/pywikipedia/pywikibot/textlib.py
Modified: branches/rewrite/pywikibot/textlib.py
===================================================================
--- branches/rewrite/pywikibot/textlib.py 2010-04-23 10:04:06 UTC (rev 8122)
+++ branches/rewrite/pywikibot/textlib.py 2010-04-23 13:31:31 UTC (rev 8123)
@@ -61,17 +61,11 @@
'comment': re.compile(r'(?s)<!--.*?-->'),
# section headers
'header': re.compile(r'\r\n=+.+=+ *\r\n'),
- 'includeonly': re.compile(r'(?is)<includeonly>.*?</includeonly>'),
- 'math': re.compile(r'(?is)<math>.*?</math>'),
- 'noinclude': re.compile(r'(?is)<noinclude>.*?</noinclude>'),
- # wiki tags are ignored inside nowiki tags.
- 'nowiki': re.compile(r'(?is)<nowiki>.*?</nowiki>'),
# preformatted text
'pre': re.compile(r'(?ism)<pre>.*?</pre>'),
'source': re.compile(r'(?is)<source .*?</source>'),
# inline references
'ref': re.compile(r'(?ism)<ref[ >].*?</ref>'),
- 'timeline': re.compile(r'(?is)<timeline>.*?</timeline>'),
# lines that start with a space are shown in a monospace font and
# have whitespace preserved.
'startspace': re.compile(r'(?m)^ (.*?)$'),
@@ -83,10 +77,10 @@
# improve wiki source code readability.
# 'template': re.compile(r'(?s){{.*?}}'),
# The regex above fails on nested templates. This regex can handle
- # templates cascaded up to level 3, but no deeper. For arbitrary
+ # templates cascaded up to level 2, but no deeper. For arbitrary
# depth, we'd need recursion which can't be done in Python's re.
# After all, the language of correct parenthesis words is not regular.
- 'template': re.compile(r'(?s){{(({{(({{.*?}})|.)*}})|.)*}}'),
+ 'template': re.compile(r'(?s){{(({{.*?}})?.*?)*}}'),
'hyperlink': compileLinkR(),
'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
# this matches internal wikilinks, but also interwiki, categories, and
@@ -112,9 +106,11 @@
if isinstance(exc, str) or isinstance(exc, unicode):
# assume it's a reference to the exceptionRegexes dictionary
# defined above.
- if exc not in exceptionRegexes:
- raise ValueError("Unknown tag type: " + exc)
- dontTouchRegexes.append(exceptionRegexes[exc])
+ if exc in exceptionRegexes:
+ dontTouchRegexes.append(exceptionRegexes[exc])
+ else:
+ # nowiki, noinclude, includeonly, timeline, math ond other extensions
+ dontTouchRegexes.append(re.compile(r'(?is)<%s>.*?</%s>' % (exc, exc)))
# handle alias
if exc == 'source':
dontTouchRegexes.append(re.compile(r'(?is)<syntaxhighlight .*?</syntaxhighlight>'))
@@ -676,10 +672,10 @@
# RFC 2396 says that URLs may only contain certain characters.
# For this regex we also accept non-allowed characters, so that the bot
# will later show these links as broken ('Non-ASCII Characters in URL').
- # Note: While allowing parenthesis inside URLs, MediaWiki will regard
- # right parenthesis at the end of the URL as not part of that URL.
- # The same applies to dot, comma, colon and some other characters.
- notAtEnd = '\]\s\)\.:;,<>"\|'
+ # Note: While allowing dots inside URLs, MediaWiki will regard
+ # dots at the end of the URL as not part of that URL.
+ # The same applies to comma, colon and some other characters.
+ notAtEnd = '\]\s\.:;,<>"\|'
# So characters inside the URL can be anything except whitespace,
# closing squared brackets, quotation marks, greater than and less
# than, and the last character also can't be parenthesis or another
Modified: trunk/pywikipedia/pywikibot/textlib.py
===================================================================
--- trunk/pywikipedia/pywikibot/textlib.py 2010-04-23 10:04:06 UTC (rev 8122)
+++ trunk/pywikipedia/pywikibot/textlib.py 2010-04-23 13:31:31 UTC (rev 8123)
@@ -77,10 +77,10 @@
# improve wiki source code readability.
# 'template': re.compile(r'(?s){{.*?}}'),
# The regex above fails on nested templates. This regex can handle
- # templates cascaded up to level 3, but no deeper. For arbitrary
+ # templates cascaded up to level 2, but no deeper. For arbitrary
# depth, we'd need recursion which can't be done in Python's re.
# After all, the language of correct parenthesis words is not regular.
- 'template': re.compile(r'(?s){{(({{(({{.*?}})|.)*}})|.)*}}'),
+ 'template': re.compile(r'(?s){{(({{.*?}})?.*?)*}}'),
'hyperlink': compileLinkR(),
'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
# this matches internal wikilinks, but also interwiki, categories, and