Revision: 5481 Author: nicdumz Date: 2008-05-31 10:54:25 +0000 (Sat, 31 May 2008)
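Log Message: ----------- Improve bad-title detection: check the title blacklist with re.match() instead of re.search(), and add a leading '.*' to the "anywhere" and "ends with" patterns so they can still match when anchored at the start of the title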
Modified Paths: -------------- trunk/pywikipedia/reflinks.py
Modified: trunk/pywikipedia/reflinks.py =================================================================== --- trunk/pywikipedia/reflinks.py 2008-05-31 10:14:01 UTC (rev 5480) +++ trunk/pywikipedia/reflinks.py 2008-05-31 10:54:25 UTC (rev 5481) @@ -76,21 +76,21 @@ # is ur'(?is)(test|' # starts with - +'^\W*(register|registration|(sign|log)[ -]?in|subscribe|sign[ -]?up|log[ -]?on|untitled *(document|page|$))' + +'^\W*(register|registration|(sign|log)[ -]?in|subscribe|sign[ -]?up|log[ -]?on|untitled *(document|page|$)).*' # anywhere - +'|(404|page|file).*not( *be)? *found' + +'|.*((404|page|file).*not( *be)? *found).*' # ends with - +'|(register|registration|(sign|log)[ -]?in|subscribe|sign[ -]?up|log[ -]?on)\W*$' + +'|.*(register|registration|(sign|log)[ -]?in|subscribe|sign[ -]?up|log[ -]?on)\W*$' +')', 'fr': #is ur'(?is)(test|' # starts with - + ur'^\W*(register|registration|(sign|log)[ -]?in|subscribe|sign[ -]?up|log[ -]?on|untitled *(document|page|$))' + + ur'^\W*(register|registration|(sign|log)[ -]?in|subscribe|sign[ -]?up|log[ -]?on|untitled *(document|page|$)).*' # anywhere - +'|(404|page|file|site).*(not *found|en +travaux)' + +'|.*((404|page|file|site).*(not *found|en +travaux)).*' # ends with - +'|(register|registration|(sign|log)[ -]?in|subscribe|sign[ -]?up|log[ -]?on)\W*$' + +'|.*(register|registration|(sign|log)[ -]?in|subscribe|sign[ -]?up|log[ -]?on)\W*$' +')'}
@@ -472,7 +472,7 @@ wikipedia.output(u'%s : Hybrid encoding...' % ref.link) continue
- if self.titleBlackList.search(ref.title): + if self.titleBlackList.match(ref.title): repl = ref.refLink() new_text = new_text.replace(match.group(), repl) wikipedia.output(u'\03{lightred}WARNING\03{default} %s : Blacklisted title (%s)' % (ref.link, ref.title))