Revision: 4268
Author:   wikipedian
Date:     2007-09-13 13:24:42 +0000 (Thu, 13 Sep 2007)
Log Message:
-----------
extracted method; used removeDisabledParts() instead of code duplication
Modified Paths:
--------------
    trunk/pywikipedia/weblinkchecker.py
Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================
--- trunk/pywikipedia/weblinkchecker.py	2007-09-13 10:20:55 UTC (rev 4267)
+++ trunk/pywikipedia/weblinkchecker.py	2007-09-13 13:24:42 UTC (rev 4268)
@@ -149,17 +149,7 @@
     re.compile('.*[./@]bodo.kommune.no(/.*)?'), # bot can't handle their redirects
 ]
 
-def weblinksIn(text, withoutBracketed = False, onlyBracketed = False):
-    text = wikipedia.removeDisabledParts(text)
-
-    # MediaWiki parses templates before parsing external links. Thus, there
-    # might be a | or a } directly after a URL which does not belong to
-    # the URL itself.
-    # Blow up templates with spaces to avoid these problems.
-    templateWithParamsR = re.compile(r'{{([^}]*?[^ ])\|([^ ][^}]*?)}}', re.DOTALL)
-    while templateWithParamsR.search(text):
-        text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)
-
+def compileLinkR(withoutBracketed = False, onlyBracketed = False):
     # RFC 2396 says that URLs may only contain certain characters.
     # For this regex we also accept non-allowed characters, so that the bot
     # will later show these links as broken ('Non-ASCII Characters in URL').
@@ -183,9 +173,23 @@
     elif onlyBracketed:
         regex = r'\[' + regex
     linkR = re.compile(regex)
+
+def weblinksIn(text, withoutBracketed = False, onlyBracketed = False):
+    text = wikipedia.removeDisabledParts(text)
+
+    # MediaWiki parses templates before parsing external links. Thus, there
+    # might be a | or a } directly after a URL which does not belong to
+    # the URL itself.
+    # Blow up templates with spaces to avoid these problems.
+    templateWithParamsR = re.compile(r'{{([^}]*?[^ ])\|([^ ][^}]*?)}}', re.DOTALL)
+    while templateWithParamsR.search(text):
+        text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)
+
+    linkR = compileLinkR(withoutBracketed, onlyBracketed)
+
 
     # Remove HTML comments in URLs as well as URLs in HTML comments.
-    # Also remove text inside nowiki links
-    text = re.sub('(?s)<nowiki>.*?</nowiki>|<!--.*?-->', '', text)
+    # Also remove text inside nowiki links etc.
+    text = wikipedia.removeDisabledParts(text)
     for m in linkR.finditer(text):
         yield m.group('url')
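For reference, below is a minimal, self-contained sketch of the layout this commit produces, with three caveats: the URL regex is a simplified stand-in for the RFC 2396-based pattern that lies outside the hunk context, wikipedia.removeDisabledParts() is stubbed with the same re.sub the inline code used before this revision, and a return statement is assumed at the end of compileLinkR, since the committed hunk shows none (as committed, weblinksIn would receive None from the helper).

    import re

    def removeDisabledParts(text):
        # Stand-in for wikipedia.removeDisabledParts(): strips <nowiki>
        # blocks and HTML comments, the same job the old inline re.sub did.
        return re.sub('(?s)<nowiki>.*?</nowiki>|<!--.*?-->', '', text)

    def compileLinkR(withoutBracketed=False, onlyBracketed=False):
        # Simplified URL pattern; the real weblinkchecker.py pattern also
        # accepts characters outside RFC 2396 so that such links can later
        # be reported as broken ('Non-ASCII Characters in URL').
        regex = r'(?P<url>http[s]?://[^\]\s<>"]+)'
        if withoutBracketed:
            regex = r'(?<!\[)' + regex
        elif onlyBracketed:
            regex = r'\[' + regex
        linkR = re.compile(regex)
        return linkR  # assumed: the committed hunk shows no return statement

    def weblinksIn(text, withoutBracketed=False, onlyBracketed=False):
        text = removeDisabledParts(text)
        # MediaWiki parses templates before external links, so a | or }
        # directly after a URL may be template syntax rather than part of
        # the URL. Pad template separators with spaces until no unpadded
        # {{...|...}} remains.
        templateWithParamsR = re.compile(r'{{([^}]*?[^ ])\|([^ ][^}]*?)}}',
                                         re.DOTALL)
        while templateWithParamsR.search(text):
            text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)
        linkR = compileLinkR(withoutBracketed, onlyBracketed)
        # Strip disabled parts again so URLs inside comments or <nowiki>
        # are not yielded.
        text = removeDisabledParts(text)
        for m in linkR.finditer(text):
            yield m.group('url')

    if __name__ == '__main__':
        sample = ('{{cite|url=http://example.org|title=x}} '
                  '<!-- http://hidden.example/ -->')
        print(list(weblinksIn(sample)))
        # ['http://example.org'] -- the | after the URL is recognised as
        # template syntax, and the commented-out link is ignored.

The demo shows why the space-padding loop matters: without it, the | following http://example.org would be swallowed into the URL match, and the link checker would test a URL that never existed on the page.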