Revision: 4268
Author:   wikipedian
Date:     2007-09-13 13:24:42 +0000 (Thu, 13 Sep 2007)
Log Message:
-----------
extracted method; used removeDisabledParts() instead of code duplication
Modified Paths:
--------------
    trunk/pywikipedia/weblinkchecker.py
Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================
--- trunk/pywikipedia/weblinkchecker.py	2007-09-13 10:20:55 UTC (rev 4267)
+++ trunk/pywikipedia/weblinkchecker.py	2007-09-13 13:24:42 UTC (rev 4268)
@@ -149,17 +149,7 @@
     re.compile('.*[./@]bodo.kommune.no(/.*)?'), # bot can't handle their redirects
 ]
 
-def weblinksIn(text, withoutBracketed = False, onlyBracketed = False):
-    text = wikipedia.removeDisabledParts(text)
-
-    # MediaWiki parses templates before parsing external links. Thus, there
-    # might be a | or a } directly after a URL which does not belong to
-    # the URL itself.
-    # Blow up templates with spaces to avoid these problems.
-    templateWithParamsR = re.compile(r'{{([^}]*?[^ ])\|([^ ][^}]*?)}}', re.DOTALL)
-    while templateWithParamsR.search(text):
-        text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)
-
+def compileLinkR(withoutBracketed = False, onlyBracketed = False):
     # RFC 2396 says that URLs may only contain certain characters.
     # For this regex we also accept non-allowed characters, so that the bot
     # will later show these links as broken ('Non-ASCII Characters in URL').
@@ -183,9 +173,23 @@
     elif onlyBracketed:
         regex = r'\[' + regex
     linkR = re.compile(regex)
+
+def weblinksIn(text, withoutBracketed = False, onlyBracketed = False):
+    text = wikipedia.removeDisabledParts(text)
+
+    # MediaWiki parses templates before parsing external links. Thus, there
+    # might be a | or a } directly after a URL which does not belong to
+    # the URL itself.
+    # Blow up templates with spaces to avoid these problems.
+    templateWithParamsR = re.compile(r'{{([^}]*?[^ ])\|([^ ][^}]*?)}}', re.DOTALL)
+    while templateWithParamsR.search(text):
+        text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)
+
+    linkR = compileLinkR(withoutBracketed, onlyBracketed)
+
 
     # Remove HTML comments in URLs as well as URLs in HTML comments.
-    # Also remove text inside nowiki links
-    text = re.sub('(?s)<nowiki>.*?</nowiki>|<!--.*?-->', '', text)
+    # Also remove text inside nowiki links etc.
+    text = wikipedia.removeDisabledParts(text)
     for m in linkR.finditer(text):
         yield m.group('url')
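For reference, below is a minimal, self-contained sketch of the layout this commit produces, with three caveats: the URL regex is a simplified stand-in for the RFC 2396-based pattern that lies outside the hunk context, wikipedia.removeDisabledParts() is stubbed with the same re.sub the inline code used before this revision, and a return statement is assumed at the end of compileLinkR, since the committed hunk shows none (as committed, weblinksIn would receive None from the helper).

    import re

    def removeDisabledParts(text):
        # Stand-in for wikipedia.removeDisabledParts(): strips <nowiki>
        # blocks and HTML comments, the same job the old inline re.sub did.
        return re.sub('(?s)<nowiki>.*?</nowiki>|<!--.*?-->', '', text)

    def compileLinkR(withoutBracketed=False, onlyBracketed=False):
        # Simplified URL pattern; the real weblinkchecker.py pattern also
        # accepts characters outside RFC 2396 so that such links can later
        # be reported as broken ('Non-ASCII Characters in URL').
        regex = r'(?P<url>http[s]?://[^\]\s<>"]+)'
        if withoutBracketed:
            regex = r'(?<!\[)' + regex
        elif onlyBracketed:
            regex = r'\[' + regex
        linkR = re.compile(regex)
        return linkR  # assumed: the committed hunk shows no return statement

    def weblinksIn(text, withoutBracketed=False, onlyBracketed=False):
        text = removeDisabledParts(text)
        # MediaWiki parses templates before external links, so a | or }
        # directly after a URL may be template syntax rather than part of
        # the URL. Pad template separators with spaces until no unpadded
        # {{...|...}} remains.
        templateWithParamsR = re.compile(r'{{([^}]*?[^ ])\|([^ ][^}]*?)}}',
                                         re.DOTALL)
        while templateWithParamsR.search(text):
            text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)
        linkR = compileLinkR(withoutBracketed, onlyBracketed)
        # Strip disabled parts again so URLs inside comments or <nowiki>
        # are not yielded.
        text = removeDisabledParts(text)
        for m in linkR.finditer(text):
            yield m.group('url')

    if __name__ == '__main__':
        sample = ('{{cite|url=http://example.org|title=x}} '
                  '<!-- http://hidden.example/ -->')
        print(list(weblinksIn(sample)))
        # ['http://example.org'] -- the | after the URL is recognised as
        # template syntax, and the commented-out link is ignored.

The demo shows why the space-padding loop matters: without it, the | following http://example.org would be swallowed into the URL match, and the link checker would test a URL that never existed on the page.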