Revision: 7940 Author: xqt Date: 2010-02-16 17:01:39 +0000 (Tue, 16 Feb 2010)
Log Message: ----------- put weblinkchecker.compileLinkR() into wikipedia library (textlib update from rewrite)
Modified Paths: -------------- trunk/pywikipedia/weblinkchecker.py trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/weblinkchecker.py =================================================================== --- trunk/pywikipedia/weblinkchecker.py 2010-02-16 16:02:34 UTC (rev 7939) +++ trunk/pywikipedia/weblinkchecker.py 2010-02-16 17:01:39 UTC (rev 7940) @@ -202,32 +202,6 @@ re.compile('.*[./@]bodo.kommune.no(/.*)?'), # bot can't handle their redirects ]
-def compileLinkR(withoutBracketed = False, onlyBracketed = False): - # RFC 2396 says that URLs may only contain certain characters. - # For this regex we also accept non-allowed characters, so that the bot - # will later show these links as broken ('Non-ASCII Characters in URL'). - # Note: While allowing parenthesis inside URLs, MediaWiki will regard - # right parenthesis at the end of the URL as not part of that URL. - # The same applies to dot, comma, colon and some other characters. - notAtEnd = ']\s).:;,<>"' - # So characters inside the URL can be anything except whitespace, - # closing squared brackets, quotation marks, greater than and less - # than, and the last character also can't be parenthesis or another - # character disallowed by MediaWiki. - notInside = ']\s<>"' - # The first half of this regular expression is required because '' is - # not allowed inside links. For example, in this wiki text: - # ''Please see http://www.example.org.'' - # .'' shouldn't be considered as part of the link. - regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd + '](?=[' + notAtEnd+ ']*'')|http[s]?://[^' + notInside + ']*[^' + notAtEnd + '])' - - if withoutBracketed: - regex = r'(?<![)' + regex - elif onlyBracketed: - regex = r'[' + regex - linkR = re.compile(regex) - return linkR - def weblinksIn(text, withoutBracketed = False, onlyBracketed = False): text = wikipedia.removeDisabledParts(text)
@@ -245,7 +219,7 @@ while templateWithParamsR.search(text): text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)
- linkR = compileLinkR(withoutBracketed, onlyBracketed) + linkR = wikipedia.compileLinkR(withoutBracketed, onlyBracketed)
# Remove HTML comments in URLs as well as URLs in HTML comments. # Also remove text inside nowiki links etc.
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2010-02-16 16:02:34 UTC (rev 7939) +++ trunk/pywikipedia/wikipedia.py 2010-02-16 17:01:39 UTC (rev 7940) @@ -4963,6 +4963,35 @@ #catLinks.sort() return sep.join(catLinks) + '\r\n'
def compileLinkR(withoutBracketed=False, onlyBracketed=False):
    """Return a compiled regex that matches external http/https links.

    The returned pattern exposes the matched link in a named group 'url'.

    @param withoutBracketed: if True, don't match links that are preceded
        by a literal '[' (i.e. skip bracketed external links).
    @param onlyBracketed: if True, match only links preceded by '['.
    @return: compiled regular expression object.
    """
    # RFC 2396 says that URLs may only contain certain characters.
    # For this regex we also accept non-allowed characters, so that the bot
    # will later show these links as broken ('Non-ASCII Characters in URL').
    # Note: While allowing parenthesis inside URLs, MediaWiki will regard
    # right parenthesis at the end of the URL as not part of that URL.
    # The same applies to dot, comma, colon and some other characters.
    notAtEnd = r']\s).:;,<>"'
    # So characters inside the URL can be anything except whitespace,
    # closing squared brackets, quotation marks, greater than and less
    # than, and the last character also can't be parenthesis or another
    # character disallowed by MediaWiki.
    notInside = r']\s<>"'
    # The first alternative is required because '' (italics markup) is
    # not allowed inside links. For example, in this wiki text:
    #   ''Please see http://www.example.org.''
    # .'' shouldn't be considered as part of the link: the lookahead
    # (?=[notAtEnd]*'') accepts a shorter URL only when it is followed by
    # trailing punctuation and the literal '' italics terminator.
    regex = (r'(?P<url>http[s]?://[^' + notInside + r']*?[^' + notAtEnd
             + r"](?=[" + notAtEnd + r"]*'')"
             + r'|http[s]?://[^' + notInside + r']*[^' + notAtEnd + r'])')

    if withoutBracketed:
        # Negative lookbehind: reject links directly preceded by '['.
        regex = r'(?<!\[)' + regex
    elif onlyBracketed:
        # Require a literal '[' directly before the link.
        regex = r'\[' + regex
    linkR = re.compile(regex)
    return linkR