[Pywikipedia-l] SVN: [5298] trunk/pywikipedia/reflinks.py

nicdumz at svn.wikimedia.org
Sat May 3 07:57:27 UTC 2008


Revision: 5298
Author:   nicdumz
Date:     2008-05-03 07:57:27 +0000 (Sat, 03 May 2008)

Log Message:
-----------
Adding my reflinks.py to the repository.
It is stable now, even though some minor issues are still reported from time to time.

Added Paths:
-----------
    trunk/pywikipedia/reflinks.py

Added: trunk/pywikipedia/reflinks.py
===================================================================
--- trunk/pywikipedia/reflinks.py	                        (rev 0)
+++ trunk/pywikipedia/reflinks.py	2008-05-03 07:57:27 UTC (rev 5298)
@@ -0,0 +1,544 @@
+# -*- coding: utf-8 -*-
+"""
+This bot searches for references that consist only of a bare link without a
+title (e.g. <ref>[http://www.google.fr/]</ref> or
+<ref>http://www.google.fr/</ref>), fetches the HTML title of the linked page
+and uses it as the title of the wiki link in the reference, e.g.
+<ref>[http://www.google.fr/search?q=test test - Google Search]</ref>
+
+Every 20 edits, the bot checks its talk page and a special stop page: if
+either of them has been edited, it stops.
+
+DumZiBoT runs this script on en: and fr: for every new dump; running it on de: is not allowed anymore.
+
+See [[:en:User:DumZiBoT/refLinks]] for more information on the bot.
+
+&params;
+
+-limit:n                Stop the bot after n edits
+
+-xml:dump.xml           Work from an XML dump instead of fetching each page
+                        from the wiki, for performance and server-load
+                        reasons
+
+-xmlstart               Title of the dumped page to start from when using
+                        an XML dump
+
+All standard pagegenerators arguments (-page, etc...) are supported as well;
+see the example invocation below.
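+
+Example invocation (illustrative; the exact dump filename and arguments
+depend on your local setup):
+
+    python reflinks.py -xml:enwiki-pages-articles.xml -namespace:0 -limit:100
+    python reflinks.py -page:Example -always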
+"""
+# (C) 2008 - Nicolas Dumazet ( en:User:NicDumZ )
+#
+# Distributed under the terms of the GPL
+
+from BeautifulSoup import UnicodeDammit
+import sys, re, urllib2, httplib, socket, codecs, ftplib
+import wikipedia, pagegenerators, noreferences
+import subprocess, tempfile, os
+
+stopPage = {'fr':u'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper',
+            'de':u'Benutzer:DumZiBoT/EditThisPageToStopMe',
+            'it':u'Utente:DumZiBoT/EditThisPageToStopMe',
+            'en':u'User:DumZiBoT/EditThisPageToStopMe'}
+
+msg = { 'fr':u'Bot: Correction des refs. mal formatées (cf. [[Utilisateur:DumZiBoT/liensRefs|explications]])',
+        'de':u'Bot: Korrektes Referenzformat (siehe [[:en:User:DumZiBoT/refLinks]])',
+        'en':u'Bot: Converting bare references, see [[User:DumZiBoT/refLinks|FAQ]]'}
+
+deadLinkTag = {'fr':u'{{lien mort}}',
+               'de':u'',
+               'en':u'{{dead link}}'}
+
+comment = {'fr':u'Titre généré automatiquement',
+           'de':u'Automatisch generierter Titel',
+           'en':u'Bot generated title'}
+
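+# The three patterns below are heuristics. Illustrative examples:
+#   soft404  - URLs that look like an error page, e.g. ".../404.html" or ".../errdoc.php"
+#   dirIndex - a site root or bare directory index, e.g. "http://example.com/"
+#              or "http://example.com/index.html"
+#   domain   - captures scheme and host, e.g. ('http', 'example.com') for
+#              "http://www.example.com/page"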
+soft404 = re.compile(ur'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog', re.IGNORECASE)
+dirIndex = re.compile(ur'^\w+://[^/]+/((default|index)\.(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?$', re.IGNORECASE)
+domain = re.compile(ur'^(\w+)://(?:www\.|)([^/]+)')
+badtitles = {'en':
+                # is
+                ur'(?is) (test|'
+                # starts with
+                +'^\W*(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on|untitled *(document|page|$))'
+                # anywhere
+                +'|(404|page|file).*not( *be)? *found'
+                # ends with
+                +'|(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on)\W*$'
+                +')',
+            'fr':
+                #is
+                ur'(?is) (test|'
+                # starts with
+                ur'^\W*(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on|untitled *(document|page|$))'
+                # anywhere
+                +'|(404|page|file|site).*(not *found|en +travaux)'
+                # ends with
+                +'|(register|registration|(sign|log)[ \-]?in|subscribe|sign[ \-]?up|log[ \-]?on)\W*$'
+                +')'}
+
+
+linksInRef = re.compile(
+    # bracketed URLs
+    ur'(?i)<ref(?P<name>[^>]*)>\s*\[?(?P<url>(?:http|https|ftp)://(?:' +
+    # unbracketed URLs containing ()
+    ur'[^\[\]\s<>"]+\([^\[\]\s<>"]+[^\[\]\s\.:;\\,<>\?"]+|' +
+    # unbracketed URLs without ()
+    ur'[^\[\]\s<>"]+[^\[\]\s\)\.:;\\,<>\?"]+|[^\[\]\s<>"]+))[!?,\s]*\]?\s*</ref>')
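+# Illustrative examples of references the above pattern is meant to catch:
+#   <ref>http://www.example.com/page.html</ref>
+#   <ref name="foo">[http://www.example.com/page.html]</ref>
+# A reference that already carries a title, such as
+#   <ref>[http://www.example.com/page.html Example page]</ref>
+# is left alone.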
+listof404pages = 'http://www.twoevils.org/files/wikipedia/404-links.txt.gz'
+
+class XmlDumpPageGenerator:
+    def __init__(self, xmlFilename, xmlStart, namespaces):
+        self.xmlFilename = xmlFilename
+        self.xmlStart = xmlStart
+        self.namespaces = namespaces
+        self.skipping = bool(xmlStart)
+        self.site = wikipedia.getSite()
+
+        import xmlreader
+        dump = xmlreader.XmlDump(self.xmlFilename)
+        self.parser = dump.parse()
+
+    def __iter__(self):
+        return self
+    
+    def next(self):
+        while True:
+            try:
+                entry = self.parser.next()
+            except StopIteration:
+                raise
+            if self.skipping:
+                if entry.title != self.xmlStart:
+                    continue
+                self.skipping = False
+            page=wikipedia.Page(self.site, entry.title)
+            if self.namespaces and page.namespace() not in self.namespaces:
+                continue
+            if linksInRef.search(entry.text):
+                return page
+
+class RefLink:
+    def __init__(self, link, name):
+        self.refname = name
+        self.link = link
+        self.site = wikipedia.getSite()
+        self.linkComment = wikipedia.translate(self.site, comment)
+        self.url = re.sub(u'#.*', '', self.link)
+        self.title = None
+
+    def refTitle(self):
+        return '<ref%s>[%s %s<!-- %s -->]</ref>' % (self.refname, self.link, self.title, self.linkComment)
+
+    def refLink(self):
+        return '<ref%s>%s</ref>' % (self.refname, self.link)
+    
+    def refDead(self):
+        tag = wikipedia.translate(self.site, deadLinkTag)
+        return '<ref%s>[%s]%s</ref>' % (self.refname, self.link, tag)
+
+    def transform(self, ispdf = False):
+        #convert html entities
+        if not ispdf:
+            self.title = wikipedia.html2unicode(self.title)
+        self.title = re.sub(r'-+', '-', self.title)
+        #remove long runs of formatting characters, e.g. "----" or "===="
+        self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
+        #replace newlines, tabs and any Unicode whitespace with plain spaces
+        self.title = re.sub(r'(?u)\s', ' ', self.title)
+        self.title = re.sub(r'[\n\r\t]', ' ', self.title)
+        #collapse multiple spaces and strip leading/trailing "=.;,-+_" and spaces
+        self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))
+
+        self.avoid_uppercase()
+        #avoid closing the link before the end
+        self.title = self.title.replace(']', '&#93;')
+        #avoid multiple } being interpreted as a template inclusion
+        self.title = self.title.replace('}}', '}&#125;')
+        #prevent multiple quotes being interpreted as '' or '''
+        self.title = self.title.replace('\'\'', '\'&#39;')
+        self.title = wikipedia.unicode2html(self.title, self.site.encoding())
+
+    def avoid_uppercase(self):
+        """
+        If the title is longer than 6 characters and more than ~70% of its
+        letters are uppercase, convert it to title case; titles containing
+        digits are left unchanged
+        """
+        if len(self.title) <= 6:
+            return
+        nb_upper = 0
+        nb_letter = 0
+        for letter in self.title:
+            if letter.isupper():
+                nb_upper += 1
+            if letter.isalpha():
+                nb_letter += 1
+            if letter.isdigit():
+                return
+        if float(nb_upper)/(nb_letter+1) > .70:
+            self.title = self.title.title()
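+        # e.g. (illustrative) "EXAMPLE DOMAIN HOMEPAGE" becomes
+        # "Example Domain Homepage"; "RFC 2616" is left unchanged because
+        # it contains digits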
+
+class ReferencesRobot:
+    def __init__(self, generator, acceptall = False, limit = None):
+        self.generator = generator
+        self.acceptall = acceptall
+        self.limit = limit
+        self.site = wikipedia.getSite()
+        self.stopPage = wikipedia.translate(self.site, stopPage)
+        self.stopPageRevId = wikipedia.Page(self.site, 
+                                            self.stopPage).latestRevision()
+        self.META_CONTENT = re.compile(ur'(?i)<meta[^>]*content\-type[^>]*>')
+        self.CHARSET = re.compile(ur'(?i)charset\s*=\s*(?P<enc>[^\'";>/]*)')
+        self.TITLE = re.compile(ur'(?is)(?<=<title>).*?(?=</title>)')
+        self.NON_HTML = re.compile(ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>')
+        self.MIME = re.compile(ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
+        self.titleBlackList = re.compile(wikipedia.translate(self.site, badtitles))   
+        self.norefbot = noreferences.NoReferencesBot(None)
+ 
+    def put_page(self, page, new):
+        """
+        Print the diff between the original and the new text, then save the new text to the page
+        """
+        wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" 
+                         % page.title())
+        wikipedia.showDiff(page.get(), new)
+        if not self.acceptall:
+            choice = wikipedia.inputChoice(u'Do you want to accept ' +
+                                           u'these changes?', 
+                                           ['Yes', 'No', 'All'], 
+                                           ['y', 'N', 'a'], 'N')
+            if choice in ['a', 'A']:
+                self.acceptall = True
+            if choice in ['y', 'Y']:
+                page.put_async(new)
+        if self.acceptall:
+            try:
+                page.put(new)
+            except wikipedia.EditConflict:
+                wikipedia.output(u'Skipping %s because of edit conflict' 
+                                  % (page.title(),))
+            except wikipedia.SpamfilterError, e:
+                wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url))
+            except wikipedia.PageNotSaved, error:
+                wikipedia.output(u'Error putting page: %s' % (error.args,))
+            except wikipedia.LockedPage:
+                wikipedia.output(u'Skipping %s (locked page)' 
+                                  % (page.title(),))
+            except wikipedia.ServerError, e:
+                wikipedia.output(u'Server Error : %s' % e)
+
+    def httpError(self, err_num, link, pagetitleaslink):
+        """Log HTTP Error"""
+        wikipedia.output(u'HTTP error (%s) for %s on %s' 
+                          % (err_num, link, pagetitleaslink),
+                         toStdout = True)
+        f = codecs.open(
+                wikipedia.config.datafilepath(
+                    'reflinks-httpErrorLog', 
+                    'reflinks-%s-%s.txt' % (self.site.family.name, 
+                                            self.site.lang)),
+                'a', 'utf-8')
+        f.write(u'%s: %s from %s\n' % (err_num, link, pagetitleaslink))
+        f.close()
+
+    def getPDFTitle(self, ref, f): 
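+        # Write the downloaded PDF to a temporary file and feed it to the
+        # external "pdfinfo" utility (from xpdf/poppler, assumed to be
+        # installed), keeping the value of its "Title:" line as the title.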
+        wikipedia.output( u'PDF file.' )
+        fd, infile = tempfile.mkstemp()
+        urlobj = os.fdopen(fd, 'r+w')
+        urlobj.write(f.read())
+        try:
+            pdfinfo_out = subprocess.Popen([r"pdfinfo","/dev/stdin"], stdin=urlobj, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False).communicate()[0]
+            for aline in pdfinfo_out.splitlines():
+                if aline.lower().startswith('title'):
+                    ref.title = aline.split(None)[1:]
+                    ref.title = ' '.join(ref.title)
+                    if ref.title != '': wikipedia.output(u'title: ' +ref.title )
+            wikipedia.output( u'PDF done.' )
+        except ValueError:
+            wikipedia.output( u'pdfinfo value error.' )
+        except OSError:
+            wikipedia.output( u'pdfinfo OS error.' )
+        except:    # ignore any other error during PDF processing
+            wikipedia.output( u'PDF processing error.' )
+        finally:
+            urlobj.close()
+            os.unlink(infile)
+
+    def replace(self, sub, text, repl):
+        new_text = text
+        for match in re.finditer(re.escape(sub), text):
+            if wikipedia.isDisabled(text, match.start()):
+                continue
+            #TODO: replacement logic not implemented yet; this helper is currently unused
+
+    def run(self):
+        """
+        Runs the Bot
+        """
+        wikipedia.setAction(wikipedia.translate(self.site, msg))
+        deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read() 
+        socket.setdefaulttimeout(30)
+        editedpages = 0
+        for page in self.generator:
+            try:
+                # Load the page's text from the wiki
+                new_text = page.get()
+                if not page.canBeEdited():
+                    wikipedia.output(u"You can't edit page %s" 
+                                      % page.aslink())
+                    continue
+            except wikipedia.NoPage:
+                wikipedia.output(u'Page %s not found' % page.aslink())
+                continue
+            except wikipedia.IsRedirectPage:
+                wikipedia.output(u'Page %s is a redirect' % page.aslink())
+                continue
+
+            # process each bare external link found inside a <ref> tag
+            for match in linksInRef.finditer(wikipedia.removeDisabledParts(page.get())):
+                link = match.group(u'url')
+                #print link    # debugging
+                if u'jstor.org' in link:
+                    #TODO: Clean URL blacklist
+                    continue
+                
+                ref = RefLink(link, match.group('name'))
+                f = None
+                try:
+                    socket.setdefaulttimeout(20)
+                    f = urllib2.urlopen(ref.url)
+                    #Try to get Content-Type from server
+                    headers = f.info()
+                    contentType = headers.getheader('Content-Type')
+                    if contentType and not self.MIME.search(contentType):
+                        if ref.link.lower().endswith('.pdf'):
+                            # If file has a PDF suffix
+                            self.getPDFTitle(ref, f)
+                        else:
+                            wikipedia.output(u'\03{lightyellow}WARNING\03{default} : media : %s ' % ref.link)
+                        if ref.title:
+                            if not re.match('(?i) *microsoft (word|excel|visio)', ref.title):
+                                ref.transform(ispdf=True)
+                                repl = ref.refTitle()
+                            else:
+                                wikipedia.output('\03{lightyellow}WARNING\03{default} : PDF title blacklisted : %s ' % ref.title)
+                                repl = ref.refLink()
+                        else:
+                            repl = ref.refLink()
+                        new_text = new_text.replace(match.group(), repl)
+                        continue
+                    # Test if the redirect was valid
+                    redir = f.geturl()
+                    if redir != ref.link and domain.findall(redir) == domain.findall(link):
+                        if soft404.search(redir) and not soft404.search(ref.link):
+                            wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect 404 : %s ' % ref.link)
+                            continue
+                        if dirIndex.match(redir) and not dirIndex.match(ref.link):
+                            wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect to root : %s ' % ref.link)
+                            continue
+
+                    # Read the first 1,000,000 bytes (about 1 MB)
+                    linkedpagetext = f.read(1000000)
+                    socket.setdefaulttimeout(None)
+
+                except UnicodeError:
+                    #example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html in [[fr:Cyanure]]
+                    wikipedia.output(u'\03{lightred}Bad link\03{default} : %s in %s' % (ref.url, page.aslink()))
+                    continue
+                except urllib2.HTTPError, e:
+                    self.httpError(e.code, ref.url, page.aslink())
+                    if e.code == 410: # 410 Gone, indicates that the resource has been purposely removed
+                        repl = ref.refDead()
+                        new_text = new_text.replace(match.group(), repl)
+                    elif e.code == 404 and (u'\t%s\t' % ref.url in deadLinks):
+                        repl = ref.refDead()
+                        new_text = new_text.replace(match.group(), repl)
+                    continue
+                except (urllib2.URLError, 
+                        socket.error, 
+                        IOError, 
+                        httplib.error), e:
+                #except (urllib2.URLError, socket.timeout, ftplib.error, httplib.error, socket.error), e:
+                    wikipedia.output(u'Can\'t get page %s : %s' % (ref.url, e))
+                    continue
+                except ValueError:
+                    #Known bug of httplib, google for :
+                    #"httplib raises ValueError reading chunked content"
+                    continue
+                finally:
+                    if f:
+                        f.close()
+                
+                #remove <script>/<style>/comments/CDATA tags
+                linkedpagetext = self.NON_HTML.sub('', linkedpagetext)
+
+                meta_content = self.META_CONTENT.search(linkedpagetext)
+                enc = []
+                if meta_content:
+                    tag = meta_content.group()
+                    if not contentType: 
+                        contentType = tag
+                    s = self.CHARSET.search(tag)
+                    if s: 
+                        tmp = s.group('enc').strip("\"' ").lower()
+                        enc.append(tmp)
+                        if tmp in ("gb 2312", "gb2312", "gb-2312", "gb_2312"):
+                            enc.append("gbk")
+                        if tmp in ("shift jis", "shiftjis", "shift-jis", "shift_jis"):
+                            enc.append("shift jis 2004")
+                            enc.append("cp932")
+                        if tmp in ("x euc jp", "x-euc-jp"):
+                            enc.append("euc-jp")
+                if not contentType:
+                    wikipedia.output(u'No content-type found for %s' % ref.link)
+                    continue
+                elif not self.MIME.search(contentType):
+                    wikipedia.output(u'\03{lightyellow}WARNING\03{default} : media : %s ' % ref.link)
+                    repl = ref.refLink()
+                    new_text = new_text.replace(match.group(), repl)
+                    continue
+                            
+                if u'.ru' in ref.link or u'.su' in ref.link:
+                    # see http://www.sci.aha.ru/ATL/ra13a.htm : no server encoding, no page encoding
+                    enc = enc + ['koi8-r', 'windows-1251']
+                elif u'.jp' in ref.link:
+                    enc.append("shift jis 2004")
+                    enc.append("cp932")
+                elif u'.zh' in ref.link:
+                    enc.append("gbk")
+                
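+                # UnicodeDammit (from BeautifulSoup) tries the suggested
+                # encodings first, then falls back to auto-detection;
+                # u.unicode stays None if no encoding yields a clean decoding.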
+                #print(enc)
+                u = UnicodeDammit(linkedpagetext, overrideEncodings = enc)
+                #print(u.triedEncodings)
+                
+                
+                if not u.unicode:
+                    #Some pages mix utf-8 AND windows-1252 characters;
+                    #they can't easily be decoded (~1 page in 1000)
+                    repl = ref.refLink()
+                    new_text = new_text.replace(match.group(), repl)
+                    wikipedia.output('%s : Hybrid encoding...' % ref.link)
+                    continue
+                
+
+                for m in self.TITLE.finditer(u.unicode):
+                    t = m.group()
+                    if t:
+                        ref.title = t
+                        ref.transform()
+                        if ref.title:
+                            break
+                
+                if not ref.title:
+                    repl = ref.refLink()
+                    new_text = new_text.replace(match.group(), repl)
+                    wikipedia.output(u'%s : No title found...' % ref.link)
+                    continue
+                if enc and u.originalEncoding not in enc:
+                    wikipedia.output(u'\03{lightpurple}ENCODING\03{default} : %s (%s)' % (ref.link, ref.title))
+                
+                #the sequence "Ã©" is an "é" decoded with the wrong charset:
+                #the page mixes encodings, so keep the bare link
+                if u'Ã©' in ref.title:
+                    repl = ref.refLink()
+                    new_text = new_text.replace(match.group(), repl)
+                    wikipedia.output(u'%s : Hybrid encoding...' % ref.link)
+                    continue
+               
+                if self.titleBlackList.search(ref.title):
+                    repl = ref.refLink()
+                    new_text = new_text.replace(match.group(), repl)
+                    wikipedia.output(u'\03{lightred}WARNING\03{default} %s : Blacklisted title (%s)' % (ref.link, ref.title))
+                    continue
+                if len(ref.title) > 250:
+                    ref.title = ref.title[:250] + "..."
+
+                repl = ref.refTitle()
+                new_text = new_text.replace(match.group(), repl)
+
+            if new_text == page.get():
+                wikipedia.output('No changes were necessary in %s' 
+                                 % page.aslink())
+                continue
+
+            if self.norefbot.lacksReferences(new_text, verbose=False):
+                new_text = self.norefbot.addReferences(new_text)
+            editedpages += 1
+            self.put_page(page, new_text)
+            if self.limit and editedpages >= self.limit:
+                wikipedia.output('Edited %s pages, stopping.' % self.limit)
+                return
+            if editedpages % 20 == 0:
+                wikipedia.output('\03{lightgreen}Checking stop page...\03{default}')
+                actualRev = wikipedia.Page(self.site, 
+                                           self.stopPage).latestRevision()
+                if actualRev != self.stopPageRevId:
+                    wikipedia.output(u'[[%s]] has been edited: someone wants us to stop.' % self.stopPage)
+                    return
+            if self.site.messages():
+                wikipedia.output(u'Bot has new messages. Better stop to check.')
+                return
+
+def main():
+    genFactory = pagegenerators.GeneratorFactory()
+    
+    PageTitles = []
+    xmlFilename = None
+    always = False
+    limit = None
+    namespaces = []
+    generator = None
+    for arg in wikipedia.handleArgs():
+        if arg.startswith('-page:'):
+            PageTitles.append(arg[6:])
+        elif arg.startswith('-namespace:'):
+            try:
+                namespaces.append(int(arg[11:]))
+            except ValueError:
+                namespaces.append(arg[11:])
+        elif arg.startswith('-summary:'):
+            wikipedia.setAction(arg[9:])
+        elif arg == '-always':
+            always = True
+        elif arg.startswith('-limit:'):
+            limit = int(arg[7:])
+        elif arg.startswith('-xmlstart'):
+            if len(arg) == 9:
+                xmlStart = wikipedia.input(
+                    u'Please enter the dumped article to start with:')
+            else:
+                xmlStart = arg[10:]
+        elif arg.startswith('-xml'):
+            if len(arg) == 4:
+                xmlFilename = wikipedia.input(
+                    u'Please enter the XML dump\'s filename:')
+            else:
+                xmlFilename = arg[5:]
+        else:
+            generator = genFactory.handleArg(arg)
+    
+    if xmlFilename:
+        try:
+            xmlStart
+        except NameError:
+            xmlStart = None
+        generator = XmlDumpPageGenerator(xmlFilename, xmlStart, namespaces)
+    elif PageTitles:
+        pages= [wikipedia.Page(wikipedia.getSite(), PageTitle) for PageTitle in PageTitles]
+        generator = iter(pages)
+    if not generator:
+        # syntax error, show help text from the top of this file
+        wikipedia.showHelp('reflinks')
+        wikipedia.stopme()
+        sys.exit()
+    generator = pagegenerators.PreloadingGenerator(generator, pageNumber = 50)
+    generator = pagegenerators.RedirectFilterPageGenerator(generator)
+    bot = ReferencesRobot(generator, always, limit)
+    bot.run()
+
+if __name__ == "__main__":
+    try:
+        main()
+    finally:
+        wikipedia.stopme()


Property changes on: trunk/pywikipedia/reflinks.py
___________________________________________________________________
Name: svn:eol-style
   + native




