Revision: 7940
Author: xqt
Date: 2010-02-16 17:01:39 +0000 (Tue, 16 Feb 2010)
Log Message:
-----------
put weblinkchecker.compileLinkR() into wikipedia library (textlib update from rewrite)
Modified Paths:
--------------
trunk/pywikipedia/weblinkchecker.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================
--- trunk/pywikipedia/weblinkchecker.py 2010-02-16 16:02:34 UTC (rev 7939)
+++ trunk/pywikipedia/weblinkchecker.py 2010-02-16 17:01:39 UTC (rev 7940)
@@ -202,32 +202,6 @@
re.compile('.*[\./@]bodo\.kommune\.no(/.*)?'), # bot can't handle their redirects
]
-def compileLinkR(withoutBracketed = False, onlyBracketed = False):
- # RFC 2396 says that URLs may only contain certain characters.
- # For this regex we also accept non-allowed characters, so that the bot
- # will later show these links as broken ('Non-ASCII Characters in URL').
- # Note: While allowing parenthesis inside URLs, MediaWiki will regard
- # right parenthesis at the end of the URL as not part of that URL.
- # The same applies to dot, comma, colon and some other characters.
- notAtEnd = '\]\s\)\.:;,<>"'
- # So characters inside the URL can be anything except whitespace,
- # closing squared brackets, quotation marks, greater than and less
- # than, and the last character also can't be parenthesis or another
- # character disallowed by MediaWiki.
- notInside = '\]\s<>"'
- # The first half of this regular expression is required because '' is
- # not allowed inside links. For example, in this wiki text:
- #       ''Please see http://www.example.org.''
- # .'' shouldn't be considered as part of the link.
- regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd + '](?=[' + notAtEnd+ ']*\'\')|http[s]?://[^' + notInside + ']*[^' + notAtEnd + '])'
-
- if withoutBracketed:
- regex = r'(?<!\[)' + regex
- elif onlyBracketed:
- regex = r'\[' + regex
- linkR = re.compile(regex)
- return linkR
-
def weblinksIn(text, withoutBracketed = False, onlyBracketed = False):
text = wikipedia.removeDisabledParts(text)
@@ -245,7 +219,7 @@
while templateWithParamsR.search(text):
text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)
- linkR = compileLinkR(withoutBracketed, onlyBracketed)
+ linkR = wikipedia.compileLinkR(withoutBracketed, onlyBracketed)
# Remove HTML comments in URLs as well as URLs in HTML comments.
# Also remove text inside nowiki links etc.
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2010-02-16 16:02:34 UTC (rev 7939)
+++ trunk/pywikipedia/wikipedia.py 2010-02-16 17:01:39 UTC (rev 7940)
@@ -4963,6 +4963,35 @@
#catLinks.sort()
return sep.join(catLinks) + '\r\n'
+def compileLinkR(withoutBracketed=False, onlyBracketed=False):
+ """Return a regex that matches external links."""
+ # RFC 2396 says that URLs may only contain certain characters.
+ # For this regex we also accept non-allowed characters, so that the bot
+ # will later show these links as broken ('Non-ASCII Characters in URL').
+ # Note: While allowing parenthesis inside URLs, MediaWiki will regard
+ # right parenthesis at the end of the URL as not part of that URL.
+ # The same applies to dot, comma, colon and some other characters.
+ notAtEnd = '\]\s\)\.:;,<>"'
+ # So characters inside the URL can be anything except whitespace,
+ # closing squared brackets, quotation marks, greater than and less
+ # than, and the last character also can't be parenthesis or another
+ # character disallowed by MediaWiki.
+ notInside = '\]\s<>"'
+ # The first half of this regular expression is required because '' is
+ # not allowed inside links. For example, in this wiki text:
+ #       ''Please see http://www.example.org.''
+ # .'' shouldn't be considered as part of the link.
+ regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd \
+         + '](?=[' + notAtEnd+ ']*\'\')|http[s]?://[^' + notInside \
+         + ']*[^' + notAtEnd + '])'
+
+ if withoutBracketed:
+ regex = r'(?<!\[)' + regex
+ elif onlyBracketed:
+ regex = r'\[' + regex
+ linkR = re.compile(regex)
+ return linkR
+
# end of category specific code
def url2link(percentname, insite, site):
"""Convert urlname of a wiki page into interwiki link format.