[Pywikipedia-l] SVN: [5481] trunk/pywikipedia/reflinks.py
nicdumz at svn.wikimedia.org
nicdumz at svn.wikimedia.org
Sat May 31 10:54:25 UTC 2008
Revision: 5481
Author: nicdumz
Date: 2008-05-31 10:54:25 +0000 (Sat, 31 May 2008)
Log Message:
-----------
Improve bad-title detection: check the title blacklist with re.match() instead of re.search(), and pad the patterns with .* accordingly
Modified Paths:
--------------
trunk/pywikipedia/reflinks.py
Modified: trunk/pywikipedia/reflinks.py
===================================================================
--- trunk/pywikipedia/reflinks.py 2008-05-31 10:14:01 UTC (rev 5480)
+++ trunk/pywikipedia/reflinks.py 2008-05-31 10:54:25 UTC (rev 5481)
@@ -76,21 +76,21 @@
# is
ur'(?is)(test|'
# starts with
- +'^\W*(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on|untitled *(document|page|$))'
+ +'^\W*(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on|untitled *(document|page|$)).*'
# anywhere
- +'|(404|page|file).*not( *be)? *found'
+ +'|.*((404|page|file).*not( *be)? *found).*'
# ends with
- +'|(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on)\W*$'
+ +'|.*(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on)\W*$'
+')',
'fr':
#is
ur'(?is)(test|'
# starts with
- + ur'^\W*(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on|untitled *(document|page|$))'
+ + ur'^\W*(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on|untitled *(document|page|$)).*'
# anywhere
- +'|(404|page|file|site).*(not *found|en +travaux)'
+ +'|.*((404|page|file|site).*(not *found|en +travaux)).*'
# ends with
- +'|(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on)\W*$'
+ +'|.*(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on)\W*$'
+')'}
@@ -472,7 +472,7 @@
wikipedia.output(u'%s : Hybrid encoding...' % ref.link)
continue
- if self.titleBlackList.search(ref.title):
+ if self.titleBlackList.match(ref.title):
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
wikipedia.output(u'\03{lightred}WARNING\03{default} %s : Blacklisted title (%s)' % (ref.link, ref.title))
More information about the Pywikipedia-l mailing list