[Pywikipedia-l] SVN: [5481] trunk/pywikipedia/reflinks.py

nicdumz at svn.wikimedia.org nicdumz at svn.wikimedia.org
Sat May 31 10:54:25 UTC 2008


Revision: 5481
Author:   nicdumz
Date:     2008-05-31 10:54:25 +0000 (Sat, 31 May 2008)

Log Message:
-----------
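Improve bad-title detection: test the title blacklist with re.match() instead of re.search(), and pad the "anywhere" and "ends with" patterns with a leading ".*" so they still match under match()'s start-of-string anchoring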

Modified Paths:
--------------
    trunk/pywikipedia/reflinks.py

Modified: trunk/pywikipedia/reflinks.py
===================================================================
--- trunk/pywikipedia/reflinks.py	2008-05-31 10:14:01 UTC (rev 5480)
+++ trunk/pywikipedia/reflinks.py	2008-05-31 10:54:25 UTC (rev 5481)
@@ -76,21 +76,21 @@
                 # is
                 ur'(?is)(test|'
                 # starts with
-                +'^\W*(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on|untitled *(document|page|$))'
+                +'^\W*(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on|untitled *(document|page|$)).*'
                 # anywhere
-                +'|(404|page|file).*not( *be)? *found'
+                +'|.*((404|page|file).*not( *be)? *found).*'
                 # ends with
-                +'|(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on)\W*$'
+                +'|.*(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on)\W*$'
                 +')',
             'fr':
                 #is
                 ur'(?is)(test|'
                 # starts with
-                + ur'^\W*(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on|untitled *(document|page|$))'
+                + ur'^\W*(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on|untitled *(document|page|$)).*'
                 # anywhere
-                +'|(404|page|file|site).*(not *found|en +travaux)'
+                +'|.*((404|page|file|site).*(not *found|en +travaux)).*'
                 # ends with
-                +'|(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on)\W*$'
+                +'|.*(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on)\W*$'
                 +')'}
 
 
@@ -472,7 +472,7 @@
                     wikipedia.output(u'%s : Hybrid encoding...' % ref.link)
                     continue
                
-                if self.titleBlackList.search(ref.title):
+                if self.titleBlackList.match(ref.title):
                     repl = ref.refLink()
                     new_text = new_text.replace(match.group(), repl)
                     wikipedia.output(u'\03{lightred}WARNING\03{default} %s : Blacklisted title (%s)' % (ref.link, ref.title))





More information about the Pywikipedia-l mailing list