Revision: 7940 Author: xqt Date: 2010-02-16 17:01:39 +0000 (Tue, 16 Feb 2010)
Log Message: ----------- put weblinkchecker.compileLinkR() into wikipedia library (textlib update from rewrite)
Modified Paths: -------------- trunk/pywikipedia/weblinkchecker.py trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/weblinkchecker.py =================================================================== --- trunk/pywikipedia/weblinkchecker.py 2010-02-16 16:02:34 UTC (rev 7939) +++ trunk/pywikipedia/weblinkchecker.py 2010-02-16 17:01:39 UTC (rev 7940) @@ -202,32 +202,6 @@ re.compile('.*[./@]bodo.kommune.no(/.*)?'), # bot can't handle their redirects ]
-def compileLinkR(withoutBracketed = False, onlyBracketed = False): - # RFC 2396 says that URLs may only contain certain characters. - # For this regex we also accept non-allowed characters, so that the bot - # will later show these links as broken ('Non-ASCII Characters in URL'). - # Note: While allowing parenthesis inside URLs, MediaWiki will regard - # right parenthesis at the end of the URL as not part of that URL. - # The same applies to dot, comma, colon and some other characters. - notAtEnd = ']\s).:;,<>"' - # So characters inside the URL can be anything except whitespace, - # closing squared brackets, quotation marks, greater than and less - # than, and the last character also can't be parenthesis or another - # character disallowed by MediaWiki. - notInside = ']\s<>"' - # The first half of this regular expression is required because '' is - # not allowed inside links. For example, in this wiki text: - # ''Please see http://www.example.org.'' - # .'' shouldn't be considered as part of the link. - regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd + '](?=[' + notAtEnd+ ']*'')|http[s]?://[^' + notInside + ']*[^' + notAtEnd + '])' - - if withoutBracketed: - regex = r'(?<![)' + regex - elif onlyBracketed: - regex = r'[' + regex - linkR = re.compile(regex) - return linkR - def weblinksIn(text, withoutBracketed = False, onlyBracketed = False): text = wikipedia.removeDisabledParts(text)
@@ -245,7 +219,7 @@ while templateWithParamsR.search(text): text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)
- linkR = compileLinkR(withoutBracketed, onlyBracketed) + linkR = wikipedia.compileLinkR(withoutBracketed, onlyBracketed)
# Remove HTML comments in URLs as well as URLs in HTML comments. # Also remove text inside nowiki links etc.
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2010-02-16 16:02:34 UTC (rev 7939) +++ trunk/pywikipedia/wikipedia.py 2010-02-16 17:01:39 UTC (rev 7940) @@ -4963,6 +4963,35 @@ #catLinks.sort() return sep.join(catLinks) + '\r\n'
def compileLinkR(withoutBracketed=False, onlyBracketed=False):
    """Return a compiled regex that matches external http/https links.

    The returned pattern exposes the matched link in a named group 'url'.

    @param withoutBracketed: if True, don't match links that are preceded
        by a literal '[' (i.e. skip bracketed external links).
    @param onlyBracketed: if True, match only links preceded by '['.
    @return: compiled regular expression object.
    """
    # RFC 2396 says that URLs may only contain certain characters.
    # For this regex we also accept non-allowed characters, so that the bot
    # will later show these links as broken ('Non-ASCII Characters in URL').
    # Note: While allowing parenthesis inside URLs, MediaWiki will regard
    # right parenthesis at the end of the URL as not part of that URL.
    # The same applies to dot, comma, colon and some other characters.
    notAtEnd = r']\s).:;,<>"'
    # So characters inside the URL can be anything except whitespace,
    # closing squared brackets, quotation marks, greater than and less
    # than, and the last character also can't be parenthesis or another
    # character disallowed by MediaWiki.
    notInside = r']\s<>"'
    # The first alternative is required because '' (italics markup) is
    # not allowed inside links. For example, in this wiki text:
    #   ''Please see http://www.example.org.''
    # .'' shouldn't be considered as part of the link: the lookahead
    # (?=[notAtEnd]*'') accepts a shorter URL only when it is followed by
    # trailing punctuation and the literal '' italics terminator.
    regex = (r'(?P<url>http[s]?://[^' + notInside + r']*?[^' + notAtEnd
             + r"](?=[" + notAtEnd + r"]*'')"
             + r'|http[s]?://[^' + notInside + r']*[^' + notAtEnd + r'])')

    if withoutBracketed:
        # Negative lookbehind: reject links directly preceded by '['.
        regex = r'(?<!\[)' + regex
    elif onlyBracketed:
        # Require a literal '[' directly before the link.
        regex = r'\[' + regex
    linkR = re.compile(regex)
    return linkR