Revision: 4191
Author: wikipedian
Date: 2007-09-03 23:48:49 +0000 (Mon, 03 Sep 2007)
Log Message:
-----------
Fixed bug [ 1787369 ] "Bug in wikipedia.py function getCategoryLinks()":
markup within <pre> tags is now ignored when searching for
categories, links, interwikis, etc.
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-09-03 17:57:38 UTC (rev 4190)
+++ trunk/pywikipedia/wikipedia.py 2007-09-03 23:48:49 UTC (rev 4191)
@@ -1384,7 +1384,7 @@
return []
thistxt = removeCategoryLinks(thistxt, self.site())
- # remove HTML comments, nowiki sections, and includeonly sections
+ # remove HTML comments, pre, nowiki, and includeonly sections
# from text before processing
thistxt = removeDisabledParts(thistxt)
@@ -2634,15 +2634,17 @@
Removes those parts of a wiki text where wiki markup is disabled, i.e.
* HTML comments
* nowiki tags
+ * pre tags
* includeonly tags
- The exact set of parts which are removed is passed as the 'parts' parameter
- and defaults to all.
+ The exact set of parts which should be removed can be passed as the
+ 'parts' parameter, which defaults to all.
"""
regexes = {
- 'nowiki' : r'<nowiki>.*?</nowiki>',
- 'comments' : r'<!--.*?-->',
- 'includeonly' : r'<includeonly>.*?</includeonly>',
+ 'comments' : r'<!--.*?-->',
+ 'includeonly': r'<includeonly>.*?</includeonly>',
+ 'nowiki': r'<nowiki>.*?</nowiki>',
+ 'pre': r'<pre>.*?</pre>',
}
if '*' in parts:
parts = regexes.keys()
@@ -2659,7 +2661,8 @@
if insite == None:
insite = getSite()
result = {}
- # Ignore interwiki links within nowiki tags, includeonly tags, and HTML comments
+ # Ignore interwiki links within nowiki tags, includeonly tags, pre tags,
+ # and HTML comments
text = removeDisabledParts(text)
# This regular expression will find every link that is possibly an
@@ -2816,7 +2819,8 @@
in the form {code:pagename}. Do not call this routine directly, use
Page objects instead"""
result = []
- # Ignore category links within nowiki tags, includeonly tags, and HTML comments
+ # Ignore category links within nowiki tags, pre tags, includeonly tags,
+ # and HTML comments
text = removeDisabledParts(text)
catNamespace = '|'.join(site.category_namespaces())
R = re.compile(r'\[\[\s*(?P<namespace>%s)\s*:\s*(?P<catName>.+?)(?:\|(?P<sortKey>.+?))?\s*\]\]' % catNamespace)