Revision: 5488
Author: nicdumz
Date: 2008-06-01 09:21:54 +0000 (Sun, 01 Jun 2008)
Log Message:
-----------
Adjusting the maximum title length (250 -> 175 characters) and refactoring badtitles into a shared globalbadtitles regex plus per-language additions
Modified Paths:
--------------
trunk/pywikipedia/reflinks.py
Modified: trunk/pywikipedia/reflinks.py
===================================================================
--- trunk/pywikipedia/reflinks.py 2008-05-31 19:16:48 UTC (rev 5487)
+++ trunk/pywikipedia/reflinks.py 2008-06-01 09:21:54 UTC (rev 5488)
@@ -52,7 +52,7 @@
'de':u'Bot: Korrektes Referenzformat (siehe
[[:en:User:DumZiBoT/refLinks]])',
'hu':u'Robot: Forráshivatkozások kibővítése a hivatkozott oldal
címével',
'ko':u'봇: url만 있는 주석을 보강, (영문)[[:en:User:DumZiBoT/refLinks]]
참조',
- 'es':u'Formateando las referencias que no tuvieran títulos (Pruebas
por [[Wikipedia:Bot/Autorizaciones#DumZiBoT]] )',
+ 'es':u'Formateando las referencias que no tuvieran títulos (FAQ :
[[:en:User:DumZiBoT/refLinks]] )',
'en':u'Bot: Converting bare references, see
[[User:DumZiBoT/refLinks|FAQ]]'}
deadLinkTag = {'fr':u'[%s] {{lien mort}}',
@@ -72,28 +72,36 @@
soft404 =
re.compile(ur'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog',
re.IGNORECASE)
dirIndex =
re.compile(ur'^\w+://[^/]+/((default|index)\.(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?$',
re.IGNORECASE)
domain = re.compile(ur'^(\w+)://(?:www.|)([^/]+)')
-badtitles = {'en':
- # is
- ur'(?is)(test|'
- # starts with
- +'^\W*(register|registration|(sign|log)[ \-]?in|subscribe|sign[
\-]?up|log[ \-]?on|untitled *(document|page|$)).*'
- # anywhere
- +'|.*((404|page|file).*not( *be)? *found).*'
- # ends with
- +'|.*(register|registration|(sign|log)[ \-]?in|subscribe|sign[
\-]?up|log[ \-]?on)\W*$'
- +')',
- 'fr':
- #is
- ur'(?is)(test|'
- # starts with
- + ur'^\W*(register|registration|(sign|log)[ \-]?in|subscribe|sign[
\-]?up|log[ \-]?on|untitled *(document|page|$)).*'
- # anywhere
- +'|.*((404|page|file|site).*(not *found|en +travaux)).*'
- # ends with
- +'|.*(register|registration|(sign|log)[ \-]?in|subscribe|sign[
\-]?up|log[ \-]?on)\W*$'
- +')'}
+globalbadtitles = """
+# is
+(test|
+# starts with
+ ^\W*(
+ register
+ |registration
+ |(sign|log)[ \-]?in
+ |subscribe
+ |sign[ \-]?up
+ |log[ \-]?on
+ |untitled *(document|page|$)
+ ).*
+# anywhere
+ |.*(404|page|file).*not([ ]*be)?[ ]*found.*
+# ends with
+ |.*(
+ register
+ |registration
+ |(sign|log)[ \-]?in
+ |subscribe|sign[ \-]?up
+ |log[ \-]?on
+ )\W*$
+)
+"""
+badtitles = { 'en': '',
+ 'fr': '.*(404|page|site).*en +travaux.*',
+ 'es': '.*sitio.*no +disponible.*'
+ }
-
linksInRef = re.compile(
# bracketed URLs
ur'(?i)<ref(?P<name>[^>]*)>\s*\[?(?P<url>(?:http|https|ftp)://(?:'
+
@@ -218,7 +226,9 @@
self.NON_HTML =
re.compile(ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>')
str = ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml'
self.MIME = re.compile(str)
- self.titleBlackList = re.compile(wikipedia.translate(self.site, badtitles))
+ self.titleBlackList = re.compile(
+ '(' + globalbadtitles + '|' +
wikipedia.translate(self.site, badtitles) + ')',
+ re.I | re.S | re.X)
self.norefbot = noreferences.NoReferencesBot(None)
def put_page(self, page, new):
@@ -477,8 +487,8 @@
new_text = new_text.replace(match.group(), repl)
wikipedia.output(u'\03{lightred}WARNING\03{default} %s :
Blacklisted title (%s)' % (ref.link, ref.title))
continue
- if len(ref.title) > 250:
- ref.title = ref.title[:250] + "..."
+ if len(ref.title) > 175:
+ ref.title = ref.title[:175] + "..."
repl = ref.refTitle()
new_text = new_text.replace(match.group(), repl)
Show replies by date