Revision: 5653
Author:   nicdumz
Date:     2008-06-29 08:45:11 +0000 (Sun, 29 Jun 2008)
Log Message:
-----------
Doc commit, mostly. (+ one cleanup; + one minor fix when bailing out)
Modified Paths:
--------------
    trunk/pywikipedia/reflinks.py
Modified: trunk/pywikipedia/reflinks.py
===================================================================
--- trunk/pywikipedia/reflinks.py	2008-06-29 07:46:34 UTC (rev 5652)
+++ trunk/pywikipedia/reflinks.py	2008-06-29 08:45:11 UTC (rev 5653)
@@ -70,8 +70,11 @@
              'en':u'Bot generated title'}
 soft404 = re.compile(ur'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog', re.IGNORECASE)
+# Matches a URL at the index of a website
 dirIndex = re.compile(ur'^\w+://[^/]+/((default|index)\.(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?$', re.IGNORECASE)
+# Extracts the domain name
 domain = re.compile(ur'^(\w+)://(?:www.|)([^/]+)')
+
 globalbadtitles = """
 # is
 (test|
@@ -97,11 +100,13 @@
 )\W*$
 )
 """
+# Language-specific bad titles
 badtitles = { 'en': '',
               'fr': '.*(404|page|site).*en +travaux.*',
               'es': '.*sitio.*no +disponible.*'
             }
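
For illustration, a minimal sketch (not part of the commit) of what the domain regex captures; run() later compares domain.findall() on the original link and on the redirect target to detect redirects that stay on the same site. Note that the unescaped dot in (?:www.|) matches any character, not only a literal period.

    # Sketch only: what the domain regex extracts (Python 2, as in the bot).
    import re

    domain = re.compile(ur'^(\w+)://(?:www.|)([^/]+)')

    print domain.findall(u'http://www.example.org/page')  # [(u'http', u'example.org')]
    print domain.findall(u'https://example.org/other')    # [(u'https', u'example.org')]
    # Equal (scheme, host) pairs => a redirect stayed on the same website.
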
+# Regex that matches bare references
 linksInRef = re.compile(
     # bracketed URLs
     ur'(?i)<ref(?P<name>[^>]*)>\s*\[?(?P<url>(?:http|https|ftp)://(?:' +
@@ -109,10 +114,15 @@
     ur'^\[\]\s<>"]+\([^\[\]\s<>"]+[^\[\]\s\.:;\\,<>\?"]+|'+
     # unbracketed without ()
     ur'[^\[\]\s<>"]+[^\[\]\s\)\.:;\\,<>\?"]+|[^\[\]\s<>"]+))[!?,\s]*\]?\s*</ref>')
-#http://www.twoevils.org/files/wikipedia/404-links.txt.gz
+
+# Download this file :
+# http://www.twoevils.org/files/wikipedia/404-links.txt.gz
+# ( maintained by User:Dispenser )
 listof404pages = '404-links.txt'
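
The full expression above is hard to read; as a rough illustration only, a simplified stand-in shows what a "bare" reference looks like and what the two named groups yield (the real regex also handles ftp, square brackets and trailing punctuation):

    # Simplified stand-in for linksInRef, for illustration only.
    import re

    bare = re.compile(ur'(?i)<ref(?P<name>[^>]*)>\s*(?P<url>https?://[^\s<>"\]]+)\s*</ref>')

    m = bare.search(u'<ref name=foo>http://www.example.org/page.html</ref>')
    print m.group('name')  # u' name=foo'
    print m.group('url')   # u'http://www.example.org/page.html'
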
 class XmlDumpPageGenerator:
+    """Xml generator that yields pages containing bare references"""
+
     def __init__(self, xmlFilename, xmlStart, namespaces):
         self.xmlFilename = xmlFilename
         self.xmlStart = xmlStart
@@ -145,6 +155,8 @@
         return page
 class RefLink:
+    """Container to handle a single bare reference"""
+
     def __init__(self, link, name):
         self.refname = name
         self.link = link
@@ -154,16 +166,20 @@
         self.title = None
     def refTitle(self):
+        """Returns the <ref> with its new title"""
         return '<ref%s>[%s %s<!-- %s -->]</ref>' % (self.refname, self.link, self.title, self.linkComment)
     def refLink(self):
+        """No title has been found, return the unbracketed link"""
         return '<ref%s>%s</ref>' % (self.refname, self.link)
     def refDead(self):
+        """Dead link, tag it with a {{dead link}}"""
         tag = wikipedia.translate(self.site, deadLinkTag) % self.link
         return '<ref%s>%s</ref>' % (self.refname, tag)
     def transform(self, ispdf = False):
+        """Normalize the title"""
         #convert html entities
         if not ispdf:
             self.title = wikipedia.html2unicode(self.title)
@@ -185,6 +201,7 @@
         #prevent multiple quotes being interpreted as '' or '''
         self.title = self.title.replace("''", "'&#39;")
         self.title = wikipedia.unicode2html(self.title, self.site.encoding())
+        # TODO : remove HTML when both opening and closing tags are included
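
For illustration: two straight apostrophes in a row are wiki markup for italics, so a fetched title containing elisions would garble the rendering; replacing the second apostrophe with its HTML entity keeps the text literal. A sketch:

    # Sketch: consecutive apostrophes would be parsed as wiki italics/bold,
    # so the second one is replaced by its HTML entity.
    title = u"L''important c''est d''aimer"
    print title.replace(u"''", u"'&#39;")
    # L'&#39;important c'&#39;est d'&#39;aimer  -- no longer wiki markup
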
     def avoid_uppercase(self):
         """
@@ -207,12 +224,24 @@
 class ReferencesRobot:
     def __init__(self, generator, acceptall = False, limit = None, ignorepdf = False ):
+        """
+        - generator : Page generator
+        - acceptall : boolean, is -always on ?
+        - limit : int, stop after n modified pages
+        - ignorepdf : boolean
+        """
         self.generator = generator
         self.acceptall = acceptall
         self.limit = limit
         self.ignorepdf = ignorepdf
         self.site = wikipedia.getSite()
         self.stopPage = wikipedia.translate(self.site, stopPage)
+
+        self.titleBlackList = re.compile(
+            '(' + globalbadtitles + '|' + wikipedia.translate(self.site, badtitles) + ')',
+            re.I | re.S | re.X)
+        self.norefbot = noreferences.NoReferencesBot(None)
+
         try :
             self.stopPageRevId = wikipedia.Page(self.site,
                                                 self.stopPage).latestRevision()
@@ -220,16 +249,20 @@
             wikipedia.output(u'The stop page %s does not exist'
                              % self.stopPage.aslink())
             wikipedia.stopme()
+            sys.exit(1)
+
+        # Regex to grasp content-type meta HTML tag in HTML source
         self.META_CONTENT = re.compile(ur'(?i)<meta[^>]*content-type[^>]*>')
+        # Extract the encoding from a charset property (from content-type !)
         self.CHARSET = re.compile(ur'(?i)charset\s*=\s*(?P<enc>[^\'";>/]*)')
+        # Extract html title from page
         self.TITLE = re.compile(ur'(?is)(?<=<title>).*?(?=</title>)')
+        # Matches content inside <script>/<style>/HTML comments
         self.NON_HTML = re.compile(ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>')
+
+        # Authorized mime types for HTML pages
         str = ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml'
         self.MIME = re.compile(str)
-        self.titleBlackList = re.compile(
-            '(' + globalbadtitles + '|' + wikipedia.translate(self.site, badtitles) + ')',
-            re.I | re.S | re.X)
-        self.norefbot = noreferences.NoReferencesBot(None)
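
For illustration, the two commented regex attributes work in tandem: META_CONTENT grabs the whole content-type meta tag, and CHARSET then pulls the declared encoding out of it. A standalone sketch against a typical HTML 4 header:

    # Sketch only: how META_CONTENT and CHARSET cooperate on an HTML header.
    import re

    META_CONTENT = re.compile(ur'(?i)<meta[^>]*content-type[^>]*>')
    CHARSET = re.compile(ur'(?i)charset\s*=\s*(?P<enc>[^\'";>/]*)')

    html = u'<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">'
    tag = META_CONTENT.search(html)
    if tag:
        s = CHARSET.search(tag.group())
        if s:
            print s.group('enc').strip(u'"\' ').lower()  # iso-8859-1
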
     def put_page(self, page, new):
         """
@@ -270,6 +303,10 @@
                              toStdout = True)
     def getPDFTitle(self, ref, f):
+        """
+        Use pdfinfo to retrieve title from a PDF.
+        Unix-only, I'm afraid.
+        """
         wikipedia.output( u'PDF file.' )
         fd, infile = tempfile.mkstemp()
         urlobj = os.fdopen(fd, 'r+w')
@@ -293,13 +330,6 @@
         urlobj.close()
         os.unlink(infile)
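
For readers unfamiliar with the trick: getPDFTitle writes the downloaded PDF to a temporary file and shells out to the pdfinfo utility, then scans its "Key: value" output for the Title line. A rough standalone equivalent, not the bot's exact code, assuming xpdf/poppler's pdfinfo is on the PATH:

    # Rough standalone sketch of the pdfinfo approach (illustrative names).
    import os, tempfile

    def pdf_title(data):
        fd, infile = tempfile.mkstemp()
        try:
            os.write(fd, data)
            os.close(fd)
            # pdfinfo prints metadata as "Key: value" lines
            for line in os.popen('pdfinfo %s' % infile):
                if line.startswith('Title:'):
                    return line[len('Title:'):].strip()
        finally:
            os.unlink(infile)
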
-    def replace(self, sub, text, repl):
-        new_text = text
-        for match in re.finditer(re.escape(sub), text):
-            if wikipedia.isDisabled(text, match.start()):
-                continue
-            #blablah
-
     def run(self):
         """
         Runs the Bot
@@ -361,7 +391,7 @@
                             repl = ref.refLink()
                             new_text = new_text.replace(match.group(), repl)
                             continue
-                        # Test if the redirect was valid
+                        # Get the real url where we end (http redirects !)
                         redir = f.geturl()
                         if redir != ref.link and domain.findall(redir) == domain.findall(link):
                             if soft404.search(redir) and not soft404.search(ref.link):
@@ -383,19 +413,17 @@
                         wikipedia.output(u'HTTP error (%s) for %s on %s'
                                          % (e.code, ref.url, page.aslink()),
                                          toStdout = True)
-                        if e.code == 410: # 410 Gone, indicates that the resource has been purposely removed
+                        # 410 Gone, indicates that the resource has been purposely removed
+                        if e.code == 410 or (e.code == 404 and (u'\t%s\t' % ref.url in deadLinks)):
                             repl = ref.refDead()
                             new_text = new_text.replace(match.group(), repl)
-                        elif e.code == 404 and (u'\t%s\t' % ref.url in deadLinks):
-                            repl = ref.refDead()
-                            new_text = new_text.replace(match.group(), repl)
                         continue
                     except (urllib2.URLError, socket.error, IOError, httplib.error), e:
                     #except (urllib2.URLError, socket.timeout, ftplib.error, httplib.error, socket.error), e:
-                        wikipedia.output(u'Can\'t get page %s : %s' % (ref.url, e))
+                        wikipedia.output(u'Can\'t retrieve page %s : %s' % (ref.url, e))
                         continue
                     except ValueError:
                         #Known bug of httplib, google for :
@@ -412,19 +440,23 @@
                     enc = []
                     if meta_content:
                         tag = meta_content.group()
+                        # Prefer the contentType from the HTTP header :
                         if not contentType:
                             contentType = tag
                         s = self.CHARSET.search(tag)
                         if s:
                             tmp = s.group('enc').strip("\"' ").lower()
-                            enc.append(tmp)
-                            if tmp in ("gb 2312", "gb2312", "gb-2312", "gb_2312"):
+                            naked = re.sub('[ _-]', '', tmp)
+                            # Convert to python correct encoding names
+                            if naked == "gb2312":
                                 enc.append("gbk")
-                            if tmp in ("shift jis", "shiftjis", "shift-jis", "shift_jis"):
+                            elif naked == "shiftjis":
                                 enc.append("shift jis 2004")
                                 enc.append("cp932")
-                            if tmp in ("x euc jp", "x-euc-jp"):
+                            elif naked == "xeucjp":
                                 enc.append("euc-jp")
+                            else:
+                                enc.append(tmp)
                     if not contentType:
                         wikipedia.output(u'No content-type found for %s' % ref.link)
                         continue
@@ -434,6 +466,8 @@
                         new_text = new_text.replace(match.group(), repl)
                         continue
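
The rewritten charset handling first normalizes the declared name (spaces, underscores and hyphens removed) so that every spelling of one charset hits a single branch, then maps a few declarations to encodings Python actually implements. A sketch of that logic in isolation:

    # Sketch of the normalization logic introduced above, in isolation.
    import re

    def python_encodings(declared):
        tmp = declared.strip("\"' ").lower()
        naked = re.sub('[ _-]', '', tmp)
        if naked == "gb2312":
            # pages declared as GB 2312 often use GBK extensions
            return ["gbk"]
        elif naked == "shiftjis":
            return ["shift jis 2004", "cp932"]
        elif naked == "xeucjp":
            return ["euc-jp"]
        return [tmp]

    print python_encodings('GB_2312')    # ['gbk']
    print python_encodings('Shift-JIS')  # ['shift jis 2004', 'cp932']
    print python_encodings('utf-8')      # ['utf-8']
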
+                    # Ugly hacks to try to survive when both server and page return no encoding.
+                    # Uses most used encodings for each national suffix
                     if u'.ru' in ref.link or u'.su' in ref.link:
                         # see http://www.sci.aha.ru/ATL/ra13a.htm : no server encoding, no page encoding
                         enc = enc + ['koi8-r', 'windows-1251']
@@ -460,6 +494,7 @@
                         continue
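
The candidate list built here is apparently handed to BeautifulSoup's UnicodeDammit, which shows up below as u (u.unicode, u.originalEncoding) and tests the suggested encodings against the raw bytes. A minimal sketch, assuming BeautifulSoup 3 and an illustrative helper name:

    # Minimal sketch, assuming BeautifulSoup 3's UnicodeDammit; the helper
    # name decode_with_fallbacks is hypothetical, not from the commit.
    from BeautifulSoup import UnicodeDammit

    def decode_with_fallbacks(rawdata, link):
        enc = []
        if u'.ru' in link or u'.su' in link:
            # most common Cyrillic encodings for these suffixes
            enc += ['koi8-r', 'windows-1251']
        u = UnicodeDammit(rawdata, overrideEncodings=enc)
        return u.unicode, u.originalEncoding
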
+                    # Retrieves the first non-empty string inside <title> tags
                     for m in self.TITLE.finditer(u.unicode):
                         t = m.group()
                         if t:
@@ -474,8 +509,11 @@
                         wikipedia.output(u'%s : No title found...' % ref.link)
                         continue
                     if enc and u.originalEncoding not in enc:
+                        # BeautifulSoup thinks that the original encoding of our page
+                        # was not one of the encodings we specified. Output a warning.
                         wikipedia.output(u'\03{lightpurple}ENCODING\03{default} : %s (%s)'
                                          % (ref.link, ref.title))
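
For illustration, the lookbehind/lookahead pair in TITLE returns only the text between the tags, and the loop keeps the first non-empty match:

    # Sketch: TITLE yields only the inner text; the loop keeps the first
    # non-empty hit.
    import re

    TITLE = re.compile(ur'(?is)(?<=<title>).*?(?=</title>)')
    for m in TITLE.finditer(u'<title></title><title>Some page</title>'):
        t = m.group()
        if t:
            print t  # Some page
            break
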
+                    # XXX Ugly hack
                     if u'Ã©' in ref.title:
                         repl = ref.refLink()
                         new_text = new_text.replace(match.group(), repl)
@@ -487,6 +525,8 @@
                         new_text = new_text.replace(match.group(), repl)
                         wikipedia.output(u'\03{lightred}WARNING\03{default} %s : Blacklisted title (%s)'
                                          % (ref.link, ref.title))
                         continue
+
+                    # Truncate long titles. 175 is arbitrary
                     if len(ref.title) > 175:
                         ref.title = ref.title[:175] + "..."
@@ -498,15 +538,17 @@
                                      % page.aslink())
                     continue
-            # Do not add <references/> to templates !
+            # Add <references/> when needed, but ignore templates !
             if page.namespace != 10:
                 if self.norefbot.lacksReferences(new_text, verbose=False):
                     new_text = self.norefbot.addReferences(new_text)
             editedpages += 1
             self.put_page(page, new_text)
+
             if self.limit and editedpages >= self.limit:
                 wikipedia.output('Edited %s pages, stopping.' % self.limit)
                 return
+
             if editedpages % 20 == 0:
                 wikipedia.output('\03{lightgreen}Checking stop page...\03{default}')
                 actualRev = wikipedia.Page(self.site,