Revision: 7940
Author: xqt
Date: 2010-02-16 17:01:39 +0000 (Tue, 16 Feb 2010)
Log Message:
-----------
put weblinkchecker.compileLinkR() into wikipedia library (textlib update from rewrite)
Modified Paths:
--------------
trunk/pywikipedia/weblinkchecker.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================
--- trunk/pywikipedia/weblinkchecker.py 2010-02-16 16:02:34 UTC (rev 7939)
+++ trunk/pywikipedia/weblinkchecker.py 2010-02-16 17:01:39 UTC (rev 7940)
@@ -202,32 +202,6 @@
re.compile('.*[\./@]bodo\.kommune\.no(/.*)?'), # bot can't handle their redirects
]
-def compileLinkR(withoutBracketed = False, onlyBracketed = False):
- # RFC 2396 says that URLs may only contain certain characters.
- # For this regex we also accept non-allowed characters, so that the bot
- # will later show these links as broken ('Non-ASCII Characters in URL').
- # Note: While allowing parenthesis inside URLs, MediaWiki will regard
- # right parenthesis at the end of the URL as not part of that URL.
- # The same applies to dot, comma, colon and some other characters.
- notAtEnd = '\]\s\)\.:;,<>"'
- # So characters inside the URL can be anything except whitespace,
- # closing squared brackets, quotation marks, greater than and less
- # than, and the last character also can't be parenthesis or another
- # character disallowed by MediaWiki.
- notInside = '\]\s<>"'
- # The first half of this regular expression is required because '' is
- # not allowed inside links. For example, in this wiki text:
- #       ''Please see http://www.example.org.''
- # .'' shouldn't be considered as part of the link.
- regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd + '](?=[' + notAtEnd+ ']*\'\')|http[s]?://[^' + notInside + ']*[^' + notAtEnd + '])'
-
- if withoutBracketed:
- regex = r'(?<!\[)' + regex
- elif onlyBracketed:
- regex = r'\[' + regex
- linkR = re.compile(regex)
- return linkR
-
def weblinksIn(text, withoutBracketed = False, onlyBracketed = False):
text = wikipedia.removeDisabledParts(text)
@@ -245,7 +219,7 @@
while templateWithParamsR.search(text):
text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)
- linkR = compileLinkR(withoutBracketed, onlyBracketed)
+ linkR = wikipedia.compileLinkR(withoutBracketed, onlyBracketed)
# Remove HTML comments in URLs as well as URLs in HTML comments.
# Also remove text inside nowiki links etc.
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2010-02-16 16:02:34 UTC (rev 7939)
+++ trunk/pywikipedia/wikipedia.py 2010-02-16 17:01:39 UTC (rev 7940)
@@ -4963,6 +4963,35 @@
#catLinks.sort()
return sep.join(catLinks) + '\r\n'
+def compileLinkR(withoutBracketed=False, onlyBracketed=False):
+ """Return a regex that matches external links."""
+ # RFC 2396 says that URLs may only contain certain characters.
+ # For this regex we also accept non-allowed characters, so that the bot
+ # will later show these links as broken ('Non-ASCII Characters in URL').
+ # Note: While allowing parenthesis inside URLs, MediaWiki will regard
+ # right parenthesis at the end of the URL as not part of that URL.
+ # The same applies to dot, comma, colon and some other characters.
+ notAtEnd = '\]\s\)\.:;,<>"'
+ # So characters inside the URL can be anything except whitespace,
+ # closing squared brackets, quotation marks, greater than and less
+ # than, and the last character also can't be parenthesis or another
+ # character disallowed by MediaWiki.
+ notInside = '\]\s<>"'
+ # The first half of this regular expression is required because '' is
+ # not allowed inside links. For example, in this wiki text:
+ #       ''Please see http://www.example.org.''
+ # .'' shouldn't be considered as part of the link.
+ regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd \
+         + '](?=[' + notAtEnd+ ']*\'\')|http[s]?://[^' + notInside \
+         + ']*[^' + notAtEnd + '])'
+
+ if withoutBracketed:
+ regex = r'(?<!\[)' + regex
+ elif onlyBracketed:
+ regex = r'\[' + regex
+ linkR = re.compile(regex)
+ return linkR
+
# end of category specific code
def url2link(percentname, insite, site):
"""Convert urlname of a wiki page into interwiki link format.