Revision: 8630
Author:   xqt
Date:     2010-10-09 19:32:57 +0000 (Sat, 09 Oct 2010)

Log Message:
-----------
import wikipedia as pywikibot, in preparation for merging into the rewrite branch

Modified Paths:
--------------
    trunk/pywikipedia/reflinks.py
    trunk/pywikipedia/revertbot.py
    trunk/pywikipedia/selflink.py
    trunk/pywikipedia/spamremove.py
    trunk/pywikipedia/speedy_delete.py
    trunk/pywikipedia/spellcheck.py
    trunk/pywikipedia/standardize_interwiki.py
    trunk/pywikipedia/standardize_notes.py
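The change is the same in all eight scripts: the old compat-branch module is imported under the name it carries in the rewrite branch, and every call site is renamed to match. A minimal sketch of the pattern, using only calls that appear in the diffs below (the page title is a placeholder, and this is not a complete script):

    # Before this commit the trunk scripts used the compat module directly:
    #     import wikipedia
    #     site = wikipedia.getSite()
    # Now the compat module is aliased to the rewrite's name, so the call
    # sites already read like rewrite-branch code:
    import wikipedia as pywikibot

    site = pywikibot.getSite()                   # framework entry point
    page = pywikibot.Page(site, u'Example')      # page objects
    pywikibot.output(u'Working on %s' % page.title())

The intent is that, once the scripts are merged into the rewrite, ideally only the import line needs to change again.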
Modified: trunk/pywikipedia/reflinks.py =================================================================== --- trunk/pywikipedia/reflinks.py 2010-10-09 16:11:46 UTC (rev 8629) +++ trunk/pywikipedia/reflinks.py 2010-10-09 19:32:57 UTC (rev 8630) @@ -33,15 +33,19 @@ Basic pagegenerators commands, -page, etc... """ # (C) 2008 - Nicolas Dumazet ( en:User:NicDumZ ) +# (C) Pywikipedia bot team, 2008-2010 # -# Distributed under the terms of the GPL - +# Distributed under the terms of the MIT license. +# __version__ = '$Id$' +#
-from BeautifulSoup import UnicodeDammit import sys, re, urllib2, httplib, socket, codecs, ftplib -import wikipedia, pagegenerators, noreferences import subprocess, tempfile, os, gzip, StringIO +import wikipedia as pywikibot +from BeautifulSoup import UnicodeDammit +import pagegenerators +import noreferences
stopPage = {'fr':u'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper', 'de':u'Benutzer:DumZiBoT/EditThisPageToStopMe', @@ -90,9 +94,13 @@ 'it':u'Titolo generato automaticamente', }
-soft404 = re.compile(ur'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog', re.IGNORECASE) +soft404 = re.compile( + ur'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog', + re.IGNORECASE) # matches an URL at the index of a website -dirIndex = re.compile(ur'^\w+://[^/]+/((default|index).(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?$', re.IGNORECASE) +dirIndex = re.compile( + ur'^\w+://[^/]+/((default|index).(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?$', + re.IGNORECASE) # Extracts the domain name domain = re.compile(ur'^(\w+)://(?:www.|)([^/]+)')
@@ -156,7 +164,7 @@ self.xmlStart = xmlStart self.namespaces = namespaces self.skipping = bool(xmlStart) - self.site = wikipedia.getSite() + self.site = pywikibot.getSite()
import xmlreader dump = xmlreader.XmlDump(xmlFilename) @@ -175,7 +183,7 @@ if entry.title != self.xmlStart: continue self.skipping = False - page=wikipedia.Page(self.site, entry.title) + page=pywikibot.Page(self.site, entry.title) if not self.namespaces == []: if page.namespace() not in self.namespaces: continue @@ -188,14 +196,16 @@ def __init__(self, link, name): self.refname = name self.link = link - self.site = wikipedia.getSite() - self.linkComment = wikipedia.translate(self.site, comment) + self.site = pywikibot.getSite() + self.linkComment = pywikibot.translate(self.site, comment) self.url = re.sub(u'#.*', '', self.link) self.title = None
def refTitle(self): """Returns the <ref> with its new title""" - return '<ref%s>[%s %s<!-- %s -->]</ref>' % (self.refname, self.link, self.title, self.linkComment) + return '<ref%s>[%s %s<!-- %s -->]</ref>' % (self.refname, self.link, + self.title, + self.linkComment)
def refLink(self): """No title has been found, return the unbracketed link""" @@ -203,14 +213,14 @@
def refDead(self): """Dead link, tag it with a {{dead link}}""" - tag = wikipedia.translate(self.site, deadLinkTag) % self.link + tag = pywikibot.translate(self.site, deadLinkTag) % self.link return '<ref%s>%s</ref>' % (self.refname, tag)
def transform(self, ispdf = False): """Normalize the title""" #convert html entities if not ispdf: - self.title = wikipedia.html2unicode(self.title) + self.title = pywikibot.html2unicode(self.title) self.title = re.sub(r'-+', '-', self.title) #remove formatting, i.e long useless strings self.title = re.sub(r'[.+-=]{4,}', ' ', self.title) @@ -228,7 +238,7 @@ self.title = self.title.replace('}}', '}}') #prevent multiple quotes being interpreted as '' or ''' self.title = self.title.replace('''', '''') - self.title = wikipedia.unicode2html(self.title, self.site.encoding()) + self.title = pywikibot.unicode2html(self.title, self.site.encoding()) # TODO : remove HTML when both opening and closing tags are included
def avoid_uppercase(self): @@ -257,10 +267,13 @@ """ def __init__(self): # Match references - self.REFS = re.compile(u'(?i)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>') - self.NAMES = re.compile(u'(?i).*name\s*=\s*(?P<quote>"?)\s*(?P<name>.+)\s*(?P=quote).*') - self.GROUPS = re.compile(u'(?i).*group\s*=\s*(?P<quote>"?)\s*(?P<group>.+)\s*(?P=quote).*') - self.autogen = wikipedia.translate(wikipedia.getSite(), autogen) + self.REFS = re.compile( + u'(?i)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>') + self.NAMES = re.compile( + u'(?i).*name\s*=\s*(?P<quote>"?)\s*(?P<name>.+)\s*(?P=quote).*') + self.GROUPS = re.compile( + u'(?i).*group\s*=\s*(?P<quote>"?)\s*(?P<group>.+)\s*(?P=quote).*') + self.autogen = pywikibot.translate(pywikibot.getSite(), autogen)
def process(self, text): # keys are ref groups @@ -299,7 +312,7 @@ #First name associated with this content
if name == 'population': - wikipedia.output(content) + pywikibot.output(content) if not name in foundRefNames: # first time ever we meet this name if name == 'population': @@ -350,11 +363,13 @@ name = v[0] if v[1]: name = u'"%s"' % name - text = re.sub(u'<ref name\s*=\s*(?P<quote>"?)\s*%s\s*(?P=quote)\s*/>' % k, u'<ref name=%s />' % name, text) + text = re.sub( + u'<ref name\s*=\s*(?P<quote>"?)\s*%s\s*(?P=quote)\s*/>' % k, + u'<ref name=%s />' % name, text) return text
class ReferencesRobot: - def __init__(self, generator, acceptall = False, limit = None, ignorepdf = False ): + def __init__(self, generator, acceptall=False, limit=None, ignorepdf=False): """ - generator : Page generator - acceptall : boolean, is -always on ? @@ -365,10 +380,11 @@ self.acceptall = acceptall self.limit = limit self.ignorepdf = ignorepdf - self.site = wikipedia.getSite() - self.stopPage = wikipedia.Page(self.site, wikipedia.translate(self.site, stopPage)) + self.site = pywikibot.getSite() + self.stopPage = pywikibot.Page(self.site, + pywikibot.translate(self.site, stopPage))
- local = wikipedia.translate(self.site, badtitles) + local = pywikibot.translate(self.site, badtitles) if local: bad = '(' + globalbadtitles + '|' + local + ')' else: @@ -380,9 +396,9 @@
try : self.stopPageRevId = self.stopPage.latestRevision() - except wikipedia.NoPage : - wikipedia.output(u'The stop page %s does not exist' - % self.stopPage.aslink()) + except pywikibot.NoPage : + pywikibot.output(u'The stop page %s does not exist' + % self.stopPage.title(asLink=True)) raise
# Regex to grasp content-type meta HTML tag in HTML source @@ -392,20 +408,22 @@ # Extract html title from page self.TITLE = re.compile(ur'(?is)(?<=<title>).*?(?=</title>)') # Matches content inside <script>/<style>/HTML comments - self.NON_HTML = re.compile(ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<![CDATA[.*?]]>') + self.NON_HTML = re.compile( + ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<![CDATA[.*?]]>')
# Authorized mime types for HTML pages - self.MIME = re.compile(ur'application/(?:xhtml+xml|xml)|text/(?:ht|x)ml') + self.MIME = re.compile( + ur'application/(?:xhtml+xml|xml)|text/(?:ht|x)ml')
def put_page(self, page, new): """ Prints diffs between orginal and new (text), puts new text for page """ - wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" + pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title()) - wikipedia.showDiff(page.get(), new) + pywikibot.showDiff(page.get(), new) if not self.acceptall: - choice = wikipedia.inputChoice(u'Do you want to accept ' + + choice = pywikibot.inputChoice(u'Do you want to accept ' + u'these changes?', ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N') @@ -416,22 +434,24 @@ if self.acceptall: try: page.put(new) - except wikipedia.EditConflict: - wikipedia.output(u'Skipping %s because of edit conflict' + except pywikibot.EditConflict: + pywikibot.output(u'Skipping %s because of edit conflict' % (page.title(),)) - except wikipedia.SpamfilterError, e: - wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url)) - except wikipedia.PageNotSaved, error: - wikipedia.output(u'Error putting page: %s' % (error.args,)) - except wikipedia.LockedPage: - wikipedia.output(u'Skipping %s (locked page)' + except pywikibot.SpamfilterError, e: + pywikibot.output( + u'Cannot change %s because of blacklist entry %s' + % (page.title(), e.url)) + except pywikibot.PageNotSaved, error: + pywikibot.output(u'Error putting page: %s' % (error.args,)) + except pywikibot.LockedPage: + pywikibot.output(u'Skipping %s (locked page)' % (page.title(),)) - except wikipedia.ServerError, e: - wikipedia.output(u'Server Error : %s' % e) + except pywikibot.ServerError, e: + pywikibot.output(u'Server Error : %s' % e)
def httpError(self, err_num, link, pagetitleaslink): """Log HTTP Error""" - wikipedia.output(u'HTTP error (%s) for %s on %s' + pywikibot.output(u'HTTP error (%s) for %s on %s' % (err_num, link, pagetitleaslink), toStdout = True)
@@ -440,24 +460,27 @@ Use pdfinfo to retrieve title from a PDF. Unix-only, I'm afraid. """ - wikipedia.output( u'PDF file.' ) + pywikibot.output( u'PDF file.' ) fd, infile = tempfile.mkstemp() urlobj = os.fdopen(fd, 'r+w') urlobj.write(f.read()) try: - pdfinfo_out = subprocess.Popen([r"pdfinfo","/dev/stdin"], stdin=urlobj, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False).communicate()[0] + pdfinfo_out = subprocess.Popen([r"pdfinfo","/dev/stdin"], + stdin=urlobj, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, shell=False).communicate()[0] for aline in pdfinfo_out.splitlines(): if aline.lower().startswith('title'): ref.title = aline.split(None)[1:] ref.title = ' '.join(ref.title) - if ref.title != '': wikipedia.output(u'title: ' +ref.title ) - wikipedia.output( u'PDF done.' ) + if ref.title != '': + pywikibot.output(u'title: %s' % ref.title) + pywikibot.output(u'PDF done.') except ValueError: - wikipedia.output( u'pdfinfo value error.' ) + pywikibot.output(u'pdfinfo value error.') except OSError: - wikipedia.output( u'pdfinfo OS error.' ) + pywikibot.output(u'pdfinfo OS error.') except: # Ignore errors - wikipedia.output( u'PDF processing error.' ) + pywikibot.output(u'PDF processing error.') pass finally: urlobj.close() @@ -467,11 +490,12 @@ """ Runs the Bot """ - wikipedia.setAction(wikipedia.translate(self.site, msg)) + pywikibot.setAction(pywikibot.translate(self.site, msg)) try: deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read() except IOError: - wikipedia.output('You need to download http://www.twoevils.org/files/wikipedia/404-links.txt.gz and to ungzip it in the same directory') + pywikibot.output( + 'You need to download http://www.twoevils.org/files/wikipedia/404-links.txt.gz and to ungzip it in the same directory') raise socket.setdefaulttimeout(30) editedpages = 0 @@ -480,17 +504,18 @@ # Load the page's text from the wiki new_text = page.get() if not page.canBeEdited(): - wikipedia.output(u"You can't edit page %s" - % page.aslink()) + pywikibot.output(u"You can't edit page %s" + % page.title(asLink=True)) continue - except wikipedia.NoPage: - wikipedia.output(u'Page %s not found' % page.aslink()) + except pywikibot.NoPage: + pywikibot.output(u'Page %s not found' % page.title(asLink=True)) continue - except wikipedia.IsRedirectPage: - wikipedia.output(u'Page %s is a redirect' % page.aslink()) + except pywikibot.IsRedirectPage: + pywikibot.output(u'Page %s is a redirect' + % page.title(asLink=True)) continue
- for match in linksInRef.finditer(wikipedia.removeDisabledParts(page.get())): + for match in linksInRef.finditer(pywikibot.removeDisabledParts(page.get())): #for each link to change link = match.group(u'url') #debugging purpose @@ -508,17 +533,24 @@ headers = f.info() contentType = headers.getheader('Content-Type') if contentType and not self.MIME.search(contentType): - if ref.link.lower().endswith('.pdf') and not self.ignorepdf: + if ref.link.lower().endswith('.pdf') and \ + not self.ignorepdf: # If file has a PDF suffix self.getPDFTitle(ref, f) else: - wikipedia.output(u'\03{lightyellow}WARNING\03{default} : media : %s ' % ref.link) + pywikibot.output( + u'\03{lightyellow}WARNING\03{default} : media : %s ' + % ref.link) if ref.title: - if not re.match('(?i) *microsoft (word|excel|visio)', ref.title): + if not re.match( + '(?i) *microsoft (word|excel|visio)', + ref.title): ref.transform(ispdf=True) repl = ref.refTitle() else: - wikipedia.output('\03{lightyellow}WARNING\03{default} : PDF title blacklisted : %s ' % ref.title) + pywikibot.output( + '\03{lightyellow}WARNING\03{default} : PDF title blacklisted : %s ' + % ref.title) repl = ref.refLink() else: repl = ref.refLink() @@ -526,12 +558,19 @@ continue # Get the real url where we end (http redirects !) redir = f.geturl() - if redir != ref.link and domain.findall(redir) == domain.findall(link): - if soft404.search(redir) and not soft404.search(ref.link): - wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect 404 : %s ' % ref.link) + if redir != ref.link and \ + domain.findall(redir) == domain.findall(link): + if soft404.search(redir) and \ + not soft404.search(ref.link): + pywikibot.output( + u'\03{lightyellow}WARNING\03{default} : Redirect 404 : %s ' + % ref.link) continue - if dirIndex.match(redir) and not dirIndex.match(ref.link): - wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect to root : %s ' % ref.link) + if dirIndex.match(redir) and \ + not dirIndex.match(ref.link): + pywikibot.output( + u'\03{lightyellow}WARNING\03{default} : Redirect to root : %s ' + % ref.link) continue
# uncompress if necessary @@ -548,15 +587,21 @@ socket.setdefaulttimeout(None)
except UnicodeError: - #example : http://www.adminet.com/jo/20010615%C2%A6/ECOC0100037D.html in [[fr:Cyanure]] - wikipedia.output(u'\03{lightred}Bad link\03{default} : %s in %s' % (ref.url, page.aslink())) + #example : http://www.adminet.com/jo/20010615%C2%A6/ECOC0100037D.html + # in [[fr:Cyanure]] + pywikibot.output( + u'\03{lightred}Bad link\03{default} : %s in %s' + % (ref.url, page.title(asLink=True))) continue except urllib2.HTTPError, e: - wikipedia.output(u'HTTP error (%s) for %s on %s' - % (e.code, ref.url, page.aslink()), + pywikibot.output(u'HTTP error (%s) for %s on %s' + % (e.code, ref.url, + page.title(asLink=True)), toStdout = True) - # 410 Gone, indicates that the resource has been purposely removed - if e.code == 410 or (e.code == 404 and (u'\t%s\t' % ref.url in deadLinks)): + # 410 Gone, indicates that the resource has been purposely + # removed + if e.code == 410 or \ + (e.code == 404 and (u'\t%s\t' % ref.url in deadLinks)): repl = ref.refDead() new_text = new_text.replace(match.group(), repl) continue @@ -565,7 +610,8 @@ IOError, httplib.error), e: #except (urllib2.URLError, socket.timeout, ftplib.error, httplib.error, socket.error), e: - wikipedia.output(u'Can't retrieve page %s : %s' % (ref.url, e)) + pywikibot.output(u'Can't retrieve page %s : %s' + % (ref.url, e)) continue except ValueError: #Known bug of httplib, google for : @@ -606,21 +652,25 @@ else: enc.append(tmp) else: - wikipedia.output(u'No charset found for %s' % ref.link) + pywikibot.output(u'No charset found for %s' % ref.link) #continue # do not process pages without charset if not contentType: - wikipedia.output(u'No content-type found for %s' % ref.link) + pywikibot.output(u'No content-type found for %s' % ref.link) continue elif not self.MIME.search(contentType): - wikipedia.output(u'\03{lightyellow}WARNING\03{default} : media : %s ' % ref.link) + pywikibot.output( + u'\03{lightyellow}WARNING\03{default} : media : %s ' + % ref.link) repl = ref.refLink() new_text = new_text.replace(match.group(), repl) continue
- # Ugly hacks to try to survive when both server and page return no encoding. + # Ugly hacks to try to survive when both server and page + # return no encoding. # Uses most used encodings for each national suffix if u'.ru' in ref.link or u'.su' in ref.link: - # see http://www.sci.aha.ru/ATL/ra13a.htm : no server encoding, no page encoding + # see http://www.sci.aha.ru/ATL/ra13a.htm : no server + # encoding, no page encoding enc = enc + ['koi8-r', 'windows-1251'] elif u'.jp' in ref.link: enc.append("shift jis 2004") @@ -641,7 +691,7 @@ #Can't easily parse them. (~1 on 1000) repl = ref.refLink() new_text = new_text.replace(match.group(), repl) - wikipedia.output('%s : Hybrid encoding...' % ref.link) + pywikibot.output('%s : Hybrid encoding...' % ref.link) continue
@@ -657,24 +707,29 @@ if not ref.title: repl = ref.refLink() new_text = new_text.replace(match.group(), repl) - wikipedia.output(u'%s : No title found...' % ref.link) + pywikibot.output(u'%s : No title found...' % ref.link) continue if enc and u.originalEncoding not in enc: - # BeautifulSoup thinks that the original encoding of our page was not one - # of the encodings we specified. Output a warning. - wikipedia.output(u'\03{lightpurple}ENCODING\03{default} : %s (%s)' % (ref.link, ref.title)) + # BeautifulSoup thinks that the original encoding of our + # page was not one of the encodings we specified. Output a + # warning. + pywikibot.output( + u'\03{lightpurple}ENCODING\03{default} : %s (%s)' + % (ref.link, ref.title))
# XXX Ugly hack if u'é' in ref.title: repl = ref.refLink() new_text = new_text.replace(match.group(), repl) - wikipedia.output(u'%s : Hybrid encoding...' % ref.link) + pywikibot.output(u'%s : Hybrid encoding...' % ref.link) continue
if self.titleBlackList.match(ref.title): repl = ref.refLink() new_text = new_text.replace(match.group(), repl) - wikipedia.output(u'\03{lightred}WARNING\03{default} %s : Blacklisted title (%s)' % (ref.link, ref.title)) + pywikibot.output( + u'\03{lightred}WARNING\03{default} %s : Blacklisted title (%s)' + % (ref.link, ref.title)) continue
# Truncate long titles. 175 is arbitrary @@ -692,22 +747,25 @@ new_text = self.deduplicator.process(new_text)
if new_text == page.get(): - wikipedia.output('No changes were necessary in %s' - % page.aslink()) + pywikibot.output('No changes were necessary in %s' + % page.title(asLink=True)) continue
editedpages += 1 self.put_page(page, new_text)
if self.limit and editedpages >= self.limit: - wikipedia.output('Edited %s pages, stopping.' % self.limit) + pywikibot.output('Edited %s pages, stopping.' % self.limit) return
if editedpages % 20 == 0: - wikipedia.output('\03{lightgreen}Checking stop page...\03{default}') + pywikibot.output( + '\03{lightgreen}Checking stop page...\03{default}') actualRev = self.stopPage.latestRevision() if actualRev != self.stopPageRevId: - wikipedia.output(u'[[%s]] has been edited : Someone wants us to stop.' % self.stopPage) + pywikibot.output( + u'[[%s]] has been edited : Someone wants us to stop.' + % self.stopPage) return
def main(): @@ -720,14 +778,14 @@ limit = None namespaces = [] generator = None - for arg in wikipedia.handleArgs(): + for arg in pywikibot.handleArgs(): if arg.startswith('-namespace:'): try: namespaces.append(int(arg[11:])) except ValueError: namespaces.append(arg[11:]) elif arg.startswith('-summary:'): - wikipedia.setAction(arg[9:]) + pywikibot.setAction(arg[9:]) elif arg == '-always': always = True elif arg == '-ignorepdf': @@ -736,13 +794,13 @@ limit = int(arg[7:]) elif arg.startswith('-xmlstart'): if len(arg) == 9: - xmlStart = wikipedia.input( + xmlStart = pywikibot.input( u'Please enter the dumped article to start with:') else: xmlStart = arg[10:] elif arg.startswith('-xml'): if len(arg) == 4: - xmlFilename = wikipedia.input( + xmlFilename = pywikibot.input( u'Please enter the XML dump's filename:') else: xmlFilename = arg[5:] @@ -759,7 +817,7 @@ generator = genFactory.getCombinedGenerator() if not generator: # syntax error, show help text from the top of this file - wikipedia.showHelp('reflinks') + pywikibot.showHelp('reflinks') return generator = pagegenerators.PreloadingGenerator(generator, pageNumber = 50) generator = pagegenerators.RedirectFilterPageGenerator(generator) @@ -770,4 +828,4 @@ try: main() finally: - wikipedia.stopme() + pywikibot.stopme()
Modified: trunk/pywikipedia/revertbot.py =================================================================== --- trunk/pywikipedia/revertbot.py 2010-10-09 16:11:46 UTC (rev 8629) +++ trunk/pywikipedia/revertbot.py 2010-10-09 19:32:57 UTC (rev 8630) @@ -1,13 +1,21 @@ -import wikipedia, query, userlib - -__version__ = '$Id$' - +#!/usr/bin/python +# -*- coding: utf-8 -*- """ - (c) Bryan Tong Minh, 2008 - (c) Pywikipedia team, 2008-2010 - Licensed under the terms of the MIT license. """ +# +# (C) Bryan Tong Minh, 2008 +# (C) Pywikipedia bot team, 2008-2010 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id$' +#
+import re +import wikipedia as pywikibot +import query, userlib + + class BaseRevertBot(object): """ Base revert bot
@@ -94,38 +102,39 @@ rev['user'], rev['timestamp']) if self.comment: comment += ': ' + self.comment
- page = wikipedia.Page(self.site, item['title']) - wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.aslink(True, True)) + page = pywikibot.Page(self.site, item['title']) + pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" + % page.aslink(True, True)) old = page.get() new = rev['*'] - wikipedia.showDiff(old, new) + pywikibot.showDiff(old, new) page.put(new, comment) return comment
def log(self, msg): - wikipedia.output(msg) + pywikibot.output(msg)
-import re
class myRevertBot(BaseRevertBot):
def callback(self, item): if 'top' in item: - page = wikipedia.Page(self.site, item['title']) + page = pywikibot.Page(self.site, item['title']) text=page.get() pattern = re.compile(u'[[.+?:.+?..+?]]', re.UNICODE) return pattern.search(text) >= 0 return False
+ def main(): item = None - for arg in wikipedia.handleArgs(): + for arg in pywikibot.handleArgs(): continue - bot = myRevertBot(site = wikipedia.getSite()) + bot = myRevertBot(site = pywikibot.getSite()) bot.revert_contribs()
if __name__ == "__main__": try: main() finally: - wikipedia.stopme() + pywikibot.stopme()
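After the renaming, driving the revert bot is unchanged; a sketch that mirrors the new main() above (myRevertBot is the subclass defined in this file):

    bot = myRevertBot(site=pywikibot.getSite())   # default site from the user's config
    bot.revert_contribs()                         # walk the contributions, reverting where callback() approves

Note that the stray module-level "import re" that used to sit between the two classes is now grouped with the other imports at the top of the file.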
Modified: trunk/pywikipedia/selflink.py =================================================================== --- trunk/pywikipedia/selflink.py 2010-10-09 16:11:46 UTC (rev 8629) +++ trunk/pywikipedia/selflink.py 2010-10-09 19:32:57 UTC (rev 8630) @@ -25,12 +25,18 @@ All other parameters will be regarded as part of the title of a single page, and the bot will only work on that single page. """ - +# +# (C) Pywikipedia bot team, 2006-2010 +# +# Distributed under the terms of the MIT license. +# __version__='$Id$' +#
-import wikipedia, pagegenerators, catlib +import re, sys +import wikipedia as pywikibot +import pagegenerators, catlib import editarticle -import re, sys
# This is required for the text that is shown when you run this script # with the parameter -help. @@ -78,7 +84,7 @@
def __iter__(self): import xmlreader - mysite = wikipedia.getSite() + mysite = pywikibot.getSite() dump = xmlreader.XmlDump(self.xmlFilename) for entry in dump.parse(): if mysite.nocapitalize: @@ -89,14 +95,14 @@ re.escape(entry.title[1:])) selflinkR = re.compile(r'[[' + title + '(|[^]]*)?]]') if selflinkR.search(entry.text): - yield wikipedia.Page(mysite, entry.title) + yield pywikibot.Page(mysite, entry.title) continue
class SelflinkBot:
def __init__(self, generator, always=False): self.generator = generator - linktrail = wikipedia.getSite().linktrail() + linktrail = pywikibot.getSite().linktrail() # The regular expression which finds links. Results consist of four groups: # group title is the target page title, that is, everything before | or ]. # group section is the page section. It'll include the # to make life easier for us. @@ -122,9 +128,9 @@ or match.group('section'): return text, False try: - linkedPage = wikipedia.Page(page.site(), match.group('title')) - except wikipedia.InvalidTitle, err: - wikipedia.output(u'Warning: %s' % err) + linkedPage = pywikibot.Page(page.site(), match.group('title')) + except pywikibot.InvalidTitle, err: + pywikibot.output(u'Warning: %s' % err) return text, False
# Check whether the link found is to the current page itself. @@ -137,16 +143,16 @@ if self.always: choice = 'a' else: - wikipedia.output( + pywikibot.output( text[max(0, match.start() - context) : match.start()] \ + '\03{lightred}' + text[match.start() : match.end()] \ + '\03{default}' + text[match.end() : match.end() + context]) - choice = wikipedia.inputChoice( + choice = pywikibot.inputChoice( u'\nWhat shall be done with this selflink?\n', ['unlink', 'make bold', 'skip', 'edit', 'more context', 'unlink all', 'quit'], ['U', 'b', 's', 'e', 'm', 'a', 'q'], 'u') - wikipedia.output(u'') + pywikibot.output(u'')
if choice == 's': # skip this link @@ -161,7 +167,8 @@ return text, True elif choice == 'm': # show more context by recursive self-call - return self.handleNextLink(page, text, match, context = context + 100) + return self.handleNextLink(page, text, match, + context=context + 100) elif choice == 'a': self.always = True elif choice == 'q': @@ -178,14 +185,17 @@ def treat(self, page): # Show the title of the page we're working on. # Highlight the title in purple. - wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title()) + pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" + % page.title()) try: oldText = page.get() # Inside image maps, don't touch selflinks, as they're used # to create tooltip labels. See for example: # http://de.wikipedia.org/w/index.php?title=Innenstadt_%28Bautzen%29&diff=... if '<imagemap>' in oldText: - wikipedia.output(u'Skipping page %s because it contains an image map.' % page.aslink()) + pywikibot.output( + u'Skipping page %s because it contains an image map.' + % page.title(asLink=True)) return text = oldText curpos = 0 @@ -193,27 +203,30 @@ match = self.linkR.search(text, pos = curpos) if not match: break - # Make sure that next time around we will not find this same hit. + # Make sure that next time around we will not find this same + # hit. curpos = match.start() + 1 text, jumpToBeginning = self.handleNextLink(page, text, match) if jumpToBeginning: curpos = 0
if oldText == text: - wikipedia.output(u'No changes necessary.') + pywikibot.output(u'No changes necessary.') else: - wikipedia.showDiff(oldText, text) + pywikibot.showDiff(oldText, text) page.put_async(text) - except wikipedia.NoPage: - wikipedia.output(u"Page %s does not exist?!" % page.aslink()) - except wikipedia.IsRedirectPage: - wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink()) - except wikipedia.LockedPage: - wikipedia.output(u"Page %s is locked?!" % page.aslink()) + except pywikibot.NoPage: + pywikibot.output(u"Page %s does not exist?!" + % page.title(asLink=True)) + except pywikibot.IsRedirectPage: + pywikibot.output(u"Page %s is a redirect; skipping." + % page.title(asLink=True)) + except pywikibot.LockedPage: + pywikibot.output(u"Page %s is locked?!" % page.title(asLink=True))
def run(self): - comment = wikipedia.translate(wikipedia.getSite(), msg) - wikipedia.setAction(comment) + comment = pywikibot.translate(pywikibot.getSite(), msg) + pywikibot.setAction(comment)
for page in self.generator: if self.done: break @@ -234,10 +247,11 @@ genFactory = pagegenerators.GeneratorFactory() always = False
- for arg in wikipedia.handleArgs(): + for arg in pywikibot.handleArgs(): if arg.startswith('-xml'): if len(arg) == 4: - xmlFilename = wikipedia.input(u'Please enter the XML dump's filename:') + xmlFilename = pywikibot.input( + u'Please enter the XML dump's filename:') else: xmlFilename = arg[5:] gen = XmlDumpSelflinkPageGenerator(xmlFilename) @@ -265,12 +279,12 @@ pageTitle.append(arg)
if pageTitle: - page = wikipedia.Page(wikipedia.getSite(), ' '.join(pageTitle)) + page = pywikibot.Page(pywikibot.getSite(), ' '.join(pageTitle)) gen = iter([page]) if not gen: gen = genFactory.getCombinedGenerator() if not gen: - wikipedia.showHelp('selflink') + pywikibot.showHelp('selflink') else: if namespaces != []: gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces) @@ -282,4 +296,4 @@ try: main() finally: - wikipedia.stopme() + pywikibot.stopme()
Modified: trunk/pywikipedia/spamremove.py =================================================================== --- trunk/pywikipedia/spamremove.py 2010-10-09 16:11:46 UTC (rev 8629) +++ trunk/pywikipedia/spamremove.py 2010-10-09 19:32:57 UTC (rev 8630) @@ -23,11 +23,20 @@
"""
+# +# (C) Pywikipedia bot team, 2007-2010 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id$' + +# + import sys -import wikipedia, editarticle, pagegenerators +import wikipedia as pywikibot +import pagegenerators +import editarticle
-__version__ = '$Id$' - def main(): automatic = False namespaces = [] @@ -48,7 +57,7 @@ 'zh': u'機器人: 移除廣告黑名單連結 %s', } spamSite = '' - for arg in wikipedia.handleArgs(): + for arg in pywikibot.handleArgs(): if arg.startswith("-automatic"): automatic = True elif arg.startswith('-namespace:'): @@ -59,56 +68,61 @@ else: spamSite = arg if not automatic: - wikipedia.put_throttle.setDelay(1) + pywikibot.put_throttle.setDelay(1) if not spamSite: - wikipedia.showHelp('spamremove') - wikipedia.output(u"No spam site specified.") + pywikibot.showHelp('spamremove') + pywikibot.output(u"No spam site specified.") sys.exit() - mysite = wikipedia.getSite() + mysite = pywikibot.getSite() pages = list(set(mysite.linksearch(spamSite))) if namespaces: - pages = list(set(pagegenerators.NamespaceFilterPageGenerator(pages, namespaces))) + pages = list(set(pagegenerators.NamespaceFilterPageGenerator(pages, + namespaces))) if len(pages) == 0: - wikipedia.output('No page found.') + pywikibot.output('No page found.') else: - wikipedia.getall(mysite, pages) + pywikibot.getall(mysite, pages) for p in pages: text = p.get() if not spamSite in text: continue # Show the title of the page we're working on. # Highlight the title in purple. - wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % p.title()) + pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" + % p.title()) lines = text.split('\n') newpage = [] lastok = "" for line in lines: if spamSite in line: if lastok: - wikipedia.output(lastok) - wikipedia.output('\03{lightred}%s\03{default}' % line) + pywikibot.output(lastok) + pywikibot.output('\03{lightred}%s\03{default}' % line) lastok = None else: newpage.append(line) if line.strip(): if lastok is None: - wikipedia.output(line) + pywikibot.output(line) lastok = line if automatic: answer = "y" else: - answer = wikipedia.inputChoice(u'\nDelete the red lines?', ['yes', 'no', 'edit'], ['y', 'N', 'e'], 'n') + answer = pywikibot.inputChoice(u'\nDelete the red lines?', + ['yes', 'no', 'edit'], + ['y', 'N', 'e'], 'n') if answer == "n": continue elif answer == "e": editor = editarticle.TextEditor() - newtext = editor.edit(text, highlight = spamSite, jumpIndex = text.find(spamSite)) + newtext = editor.edit(text, highlight=spamSite, + jumpIndex=text.find(spamSite)) else: newtext = "\n".join(newpage) if newtext != text: - p.put(newtext, wikipedia.translate(mysite, msg) % spamSite) + p.put(newtext, pywikibot.translate(mysite, msg) % spamSite)
try: main() finally: - wikipedia.stopme() + pywikibot.stopme()
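For context, the updated spamremove.py is still invoked the same way as before; a hypothetical run (the domain is a placeholder, not taken from the source):

    python spamremove.py -namespace:0 spam.example.com

As the argument loop in main() above shows, anything that is not a recognized option (-automatic, -namespace:) is taken to be the spam site whose external links should be removed.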
Modified: trunk/pywikipedia/speedy_delete.py =================================================================== --- trunk/pywikipedia/speedy_delete.py 2010-10-09 16:11:46 UTC (rev 8629) +++ trunk/pywikipedia/speedy_delete.py 2010-10-09 19:32:57 UTC (rev 8630) @@ -19,20 +19,25 @@ NOTE: This script currently only works for the Wikipedia project.
""" -__version__ = '$Id$' + # +# (C) Pywikipedia bot team, 2007-2010 +# # Distributed under the terms of the MIT license. # -import wikipedia +__version__ = '$Id$' +# + +import wikipedia as pywikibot import pagegenerators, catlib import time
class SpeedyRobot: + """ This robot will load a list of pages from the category of candidates for + speedy deletion on the language's wiki and give the user an interactive + prompt to decide whether each should be deleted or not. + """ - This robot will load a list of pages from the category of candidates for speedy - deletion on the language's wiki and give the user an interactive prompt to decide - whether each should be deleted or not. - """
csd_cat={ 'wikipedia':{ @@ -452,28 +457,32 @@ Arguments: none yet """ - self.mySite = wikipedia.getSite() - self.csdCat = catlib.Category(self.mySite, wikipedia.translate(self.mySite, self.csd_cat)) + self.mySite = pywikibot.getSite() + self.csdCat = catlib.Category(self.mySite, + pywikibot.translate(self.mySite, + self.csd_cat)) self.savedProgress = None self.preloadingGen = None
def guessReasonForDeletion(self, page): reason = None - # TODO: The following check loads the page 2 times. Find a better way to do it. - if page.isTalkPage() and (page.toggleTalkPage().isRedirectPage() or not page.toggleTalkPage().exists()): + # TODO: The following check loads the page 2 times. Find a better way to + # do it. + if page.isTalkPage() and (page.toggleTalkPage().isRedirectPage() or + not page.toggleTalkPage().exists()): # This is probably a talk page that is orphaned because we # just deleted the associated article. - reason = wikipedia.translate(self.mySite, self.talk_deletion_msg) + reason = pywikibot.translate(self.mySite, self.talk_deletion_msg) else: # Try to guess reason by the template used templateNames = page.templates() - reasons = wikipedia.translate(self.mySite, self.deletion_messages) + reasons = pywikibot.translate(self.mySite, self.deletion_messages)
for templateName in templateNames: if templateName in reasons: if type(reasons[templateName]) is not unicode: #Make alias to delete_reasons - reason = wikipedia.translate(self.mySite, self.delete_reasons)[reasons[templateName]] + reason = pywikibot.translate(self.mySite, self.delete_reasons)[reasons[templateName]] else: reason = reasons[templateName] break @@ -484,26 +493,32 @@
def getReasonForDeletion(self, page): suggestedReason = self.guessReasonForDeletion(page) - wikipedia.output(u'The suggested reason is: \03{lightred}%s\03{default}' % suggestedReason) + pywikibot.output( + u'The suggested reason is: \03{lightred}%s\03{default}' + % suggestedReason)
- # We don't use wikipedia.translate() here because for some languages the + # We don't use pywikibot.translate() here because for some languages the # entry is intentionally left out. if self.mySite.family.name in self.delete_reasons: if page.site().lang in self.delete_reasons[self.mySite.family.name]: - localReasons = wikipedia.translate(page.site().lang, self.delete_reasons) - wikipedia.output(u'') + localReasons = pywikibot.translate(page.site().lang, + self.delete_reasons) + pywikibot.output(u'') localReasoneKey = localReasons.keys() localReasoneKey.sort() for key in localReasoneKey: - wikipedia.output((key + ':').ljust(8) + localReasons[key]) - wikipedia.output(u'') - reason = wikipedia.input(u'Please enter the reason for deletion, choose a default reason, or press enter for the suggested message:') + pywikibot.output((key + ':').ljust(8) + localReasons[key]) + pywikibot.output(u'') + reason = pywikibot.input( + u'Please enter the reason for deletion, choose a default reason, or press enter for the suggested message:') if reason.strip() in localReasons: reason = localReasons[reason] else: - reason = wikipedia.input(u'Please enter the reason for deletion, or press enter for the suggested message:') + reason = pywikibot.input( + u'Please enter the reason for deletion, or press enter for the suggested message:') else: - reason = wikipedia.input(u'Please enter the reason for deletion, or press enter for the suggested message:') + reason = pywikibot.input( + u'Please enter the reason for deletion, or press enter for the suggested message:')
if not reason: reason = suggestedReason @@ -525,56 +540,69 @@ try: pageText = page.get(get_redirect = True).split("\n") count += 1 - except wikipedia.NoPage: - wikipedia.output(u'Page %s does not exist or has already been deleted, skipping.' % page.aslink()) + except pywikibot.NoPage: + pywikibot.output( + u'Page %s does not exist or has already been deleted, skipping.' + % page.title(asLink=True)) continue # Show the title of the page we're working on. # Highlight the title in purple. - wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title()) - wikipedia.output(u'- - - - - - - - - ') + pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" + % page.title()) + pywikibot.output(u'- - - - - - - - - ') if len(pageText) > 75: - wikipedia.output('The page detail is too many lines, only output first 50 lines:') - wikipedia.output(u'- - - - - - - - - ') - wikipedia.output(u'\n'.join(pageText[:50])) + pywikibot.output( + 'The page detail is too many lines, only output first 50 lines:') + pywikibot.output(u'- - - - - - - - - ') + pywikibot.output(u'\n'.join(pageText[:50])) else: - wikipedia.output(u'\n'.join(pageText)) - wikipedia.output(u'- - - - - - - - - ') - choice = wikipedia.inputChoice(u'Input action?', ['delete', 'skip', 'update', 'quit'], ['d', 'S', 'u', 'q'], 'S') + pywikibot.output(u'\n'.join(pageText)) + pywikibot.output(u'- - - - - - - - - ') + choice = pywikibot.inputChoice(u'Input action?', + ['delete', 'skip', 'update', + 'quit'], + ['d', 'S', 'u', 'q'], 'S') if choice == 'q': keepGoing = False break elif choice == 'u': - wikipedia.output(u'Updating from CSD category.') + pywikibot.output(u'Updating from CSD category.') self.savedProgress = page.title() startFromBeginning = False break elif choice == 'd': reason = self.getReasonForDeletion(page) - wikipedia.output(u'The chosen reason is: \03{lightred}%s\03{default}' % reason) + pywikibot.output( + u'The chosen reason is: \03{lightred}%s\03{default}' + % reason) page.delete(reason, prompt = False) else: - wikipedia.output(u'Skipping page %s' % page.title()) + pywikibot.output(u'Skipping page %s' % page.title()) startFromBeginning = True if count == 0: if startFromBeginning: - wikipedia.output(u'There are no pages to delete.\nWaiting for 30 seconds or press Ctrl+C to quit...') + pywikibot.output( + u'There are no pages to delete.\nWaiting for 30 seconds or press Ctrl+C to quit...') try: time.sleep(30) except KeyboardInterrupt: keepGoing = False else: startFromBeginning = True - wikipedia.output(u'Quitting program.') + pywikibot.output(u'Quitting program.')
def refreshGenerator(self): - generator = pagegenerators.CategorizedPageGenerator(self.csdCat, start = self.savedProgress) - # wrap another generator around it so that we won't produce orphaned talk pages. + generator = pagegenerators.CategorizedPageGenerator( + self.csdCat, start=self.savedProgress) + # wrap another generator around it so that we won't produce orphaned + # talk pages. generator2 = pagegenerators.PageWithTalkPageGenerator(generator) - self.preloadingGen = pagegenerators.PreloadingGenerator(generator2, pageNumber = 20) + self.preloadingGen = pagegenerators.PreloadingGenerator(generator2, + pageNumber=20)
def main(): # read command line parameters - for arg in wikipedia.handleArgs(): + for arg in pywikibot.handleArgs(): pass #No args yet
bot = SpeedyRobot() @@ -584,4 +612,4 @@ try: main() finally: - wikipedia.stopme() + pywikibot.stopme()
Modified: trunk/pywikipedia/spellcheck.py =================================================================== --- trunk/pywikipedia/spellcheck.py 2010-10-09 16:11:46 UTC (rev 8629) +++ trunk/pywikipedia/spellcheck.py 2010-10-09 19:32:57 UTC (rev 8630) @@ -51,15 +51,17 @@ """ # # (C) Andre Engels, 2005 +# (C) Pywikipedia bot team, 2006-2010 # # Distributed under the terms of the MIT license. # - __version__ = '$Id$' +#
import re, sys -import wikipedia, pagegenerators import string, codecs +import wikipedia as pywikibot +import pagegenerators
msg={ 'ar':u'تدقيق إملائي بمساعدة البوت', @@ -73,8 +75,9 @@ 'pt':u'Bot de correção ortográfica', }
+ class SpecialTerm(object): - def __init__(self,text): + def __init__(self, text): self.style = text
@@ -102,7 +105,8 @@
def getalternatives(string): # Find possible correct words for the incorrect word string - basetext = wikipedia.input(u"Give a text that should occur in the words to be checked.\nYou can choose to give no text, but this will make searching slow:") + basetext = pywikibot.input( + u"Give a text that should occur in the words to be checked.\nYou can choose to give no text, but this will make searching slow:") basetext = basetext.lower() simwords = {} for i in xrange(11): @@ -140,26 +144,28 @@
def askAlternative(word,context=None): correct = None - wikipedia.output(u"="*60) - wikipedia.output(u"Found unknown word '%s'"%word) + pywikibot.output(u"="*60) + pywikibot.output(u"Found unknown word '%s'"%word) if context: - wikipedia.output(u"Context:") - wikipedia.output(u""+context) - wikipedia.output(u"-"*60) + pywikibot.output(u"Context:") + pywikibot.output(u""+context) + pywikibot.output(u"-"*60) while not correct: for i in xrange(len(Word(word).getAlternatives())): - wikipedia.output(u"%s: Replace by '%s'"%(i+1,Word(word).getAlternatives()[i].replace('_',' '))) - wikipedia.output(u"a: Add '%s' as correct"%word) + pywikibot.output(u"%s: Replace by '%s'" + % (i+1, + Word(word).getAlternatives()[i].replace('_',' '))) + pywikibot.output(u"a: Add '%s' as correct"%word) if word[0].isupper(): - wikipedia.output(u"c: Add '%s' as correct"%(uncap(word))) - wikipedia.output(u"i: Ignore once (default)") - wikipedia.output(u"p: Ignore on this page") - wikipedia.output(u"r: Replace text") - wikipedia.output(u"s: Replace text, but do not save as alternative") - wikipedia.output(u"g: Guess (give me a list of similar words)") - wikipedia.output(u"*: Edit by hand") - wikipedia.output(u"x: Do not check the rest of this page") - answer = wikipedia.input(u":") + pywikibot.output(u"c: Add '%s' as correct" % (uncap(word))) + pywikibot.output(u"i: Ignore once (default)") + pywikibot.output(u"p: Ignore on this page") + pywikibot.output(u"r: Replace text") + pywikibot.output(u"s: Replace text, but do not save as alternative") + pywikibot.output(u"g: Guess (give me a list of similar words)") + pywikibot.output(u"*: Edit by hand") + pywikibot.output(u"x: Do not check the rest of this page") + answer = pywikibot.input(u":") if answer == "": answer = "i" if answer in "aAiIpP": correct = word @@ -169,11 +175,13 @@ elif answer in "pP": pageskip.append(word) elif answer in "rRsS": - correct = wikipedia.input(u"What should I replace it by?") + correct = pywikibot.input(u"What should I replace it by?") if answer in "rR": if correct_html_codes: correct = removeHTML(correct) - if correct != cap(word) and correct != uncap(word) and correct != word: + if correct != cap(word) and \ + correct != uncap(word) and \ + correct != word: try: knownwords[word] += [correct.replace(' ','_')] except KeyError: @@ -190,7 +198,7 @@ if possible: print "Found alternatives:" for pos in possible: - wikipedia.output(" %s"%pos) + pywikibot.output(" %s"%pos) else: print "No similar words found." elif answer=="*": @@ -204,7 +212,8 @@ return correct
def removeHTML(page): - # TODO: Consider removing this; this stuff can be done by cosmetic_changes.py + # TODO: Consider removing this; this stuff can be done by + # cosmetic_changes.py result = page result = result.replace('Ä',u'Ä') result = result.replace('ä',u'ä') @@ -266,13 +275,15 @@ loc += len(match.group(1)) bigword = Word(match.group(2)) smallword = bigword.derive() - if not Word(smallword).isCorrect(checkalternative = knownonly) and (checknames or not smallword[0].isupper()): - replacement = askAlternative(smallword,context=text[max(0,loc-40):loc+len(match.group(2))+40]) + if not Word(smallword).isCorrect(checkalternative = knownonly) and \ + (checknames or not smallword[0].isupper()): + replacement = askAlternative(smallword, + context=text[max(0,loc-40):loc + len(match.group(2))+40]) if replacement == edit: import editarticle editor = editarticle.TextEditor() # TODO: Don't know to which index to jump - newtxt = editor.edit(text, jumpIndex = 0, highlight = smallword) + newtxt = editor.edit(text, jumpIndex = 0, highlight=smallword) if newtxt: text = newtxt elif replacement == endpage: @@ -291,6 +302,7 @@ pageskip = [] return text
+ class Word(object): def __init__(self,text): self.word = text @@ -348,13 +360,16 @@ if rep == self.derive(): return self.word if self.derive() not in self.word: - return wikipedia.input(u"Please give the result of replacing %s by %s in %s:"%(self.derive(),rep,self.word)) + return pywikibot.input( + u"Please give the result of replacing %s by %s in %s:" + % (self.derive(), rep, self.word)) return self.word.replace(self.derive(),rep)
def isCorrect(self,checkalternative = False): # If checkalternative is True, the word will only be found incorrect if # it is on the spelling list as a spelling error. Otherwise it will - # be found incorrect if it is not on the list as a correctly spelled word. + # be found incorrect if it is not on the list as a correctly spelled + # word. if self.word == "": return True if self.word in pageskip: @@ -367,7 +382,7 @@ except KeyError: pass if self.word != uncap(self.word): - return Word(uncap(self.word)).isCorrect(checkalternative = checkalternative) + return Word(uncap(self.word)).isCorrect(checkalternative=checkalternative) else: if checkalternative: if checklang == 'nl' and self.word.endswith("'s"): @@ -424,7 +439,7 @@ checklang = None knownonly = False
- for arg in wikipedia.handleArgs(): + for arg in pywikibot.handleArgs(): if arg.startswith("-start:"): start = arg[7:] elif arg.startswith("-newpages"): @@ -446,11 +461,11 @@ else: title.append(arg)
- mysite = wikipedia.getSite() + mysite = pywikibot.getSite() if not checklang: checklang = mysite.language() - wikipedia.setAction(wikipedia.translate(mysite,msg)) - filename = wikipedia.config.datafilepath('spelling', + pywikibot.setAction(pywikibot.translate(mysite,msg)) + filename = pywikibot.config.datafilepath('spelling', 'spelling-' + checklang + '.txt') print "Getting wordlist" try: @@ -480,40 +495,43 @@ else: print "Wordlist successfully loaded." # This is a purely interactive bot, we therefore do not want to put-throttle - wikipedia.put_throttle.setDelay(1) + pywikibot.put_throttle.setDelay(1) except: - wikipedia.stopme() + pywikibot.stopme() raise try: if newpages: - for (page, date, length, loggedIn, user, comment) in wikipedia.getSite().newpages(1000): + for (page, date, length, loggedIn, user, comment) in pywikibot.getSite().newpages(1000): try: text = page.get() - except wikipedia.Error: + except pywikibot.Error: pass else: - text = spellcheck(text,checknames=checknames,knownonly=knownonly) + text = spellcheck(text, checknames=checknames, + knownonly=knownonly) if text != page.get(): page.put(text) elif start: for page in pagegenerators.PreloadingGenerator(pagegenerators.AllpagesPageGenerator(start=start,includeredirects=False)): try: text = page.get() - except wikipedia.Error: + except pywikibot.Error: pass else: - text = spellcheck(text,checknames=checknames,knownonly=knownonly) + text = spellcheck(text, checknames=checknames, + knownonly=knownonly) if text != page.get(): page.put(text)
if longpages: - for (page, length) in wikipedia.getSite().longpages(500): + for (page, length) in pywikibot.getSite().longpages(500): try: text = page.get() - except wikipedia.Error: + except pywikibot.Error: pass else: - text = spellcheck(text, checknames = checknames,knownonly=knownonly) + text = spellcheck(text, checknames=checknames, + knownonly=knownonly) if text != page.get(): page.put(text)
@@ -521,20 +539,20 @@ title = ' '.join(title) while title != '': try: - page = wikipedia.Page(mysite,title) + page = pywikibot.Page(mysite,title) text = page.get() - except wikipedia.NoPage: + except pywikibot.NoPage: print "Page does not exist." - except wikipedia.IsRedirectPage: + except pywikibot.IsRedirectPage: print "Page is a redirect page" else: text = spellcheck(text,knownonly=knownonly) if text != page.get(): page.put(text) - title = wikipedia.input(u"Which page to check now? (enter to stop)") + title = pywikibot.input(u"Which page to check now? (enter to stop)") finally: - wikipedia.stopme() - filename = wikipedia.config.datafilepath('spelling', + pywikibot.stopme() + filename = pywikibot.config.datafilepath('spelling', 'spelling-' + checklang + '.txt') if rebuild: list = knownwords.keys() @@ -547,7 +565,8 @@ if Word(word).isCorrect(): if word != uncap(word): if Word(uncap(word)).isCorrect(): - # Capitalized form of a word that is in the list uncapitalized + # Capitalized form of a word that is in the list + # uncapitalized continue f.write("1 %s\n"%word) else:
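spellcheck.py keeps its per-language word list on disk; the lookup, as it reads after the rename in main() above:

    filename = pywikibot.config.datafilepath('spelling',
                                             'spelling-' + checklang + '.txt')

That is, a file such as spelling/spelling-en.txt (for checklang == 'en') under the bot's data directory, read at startup and rewritten when the bot exits.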
Modified: trunk/pywikipedia/standardize_interwiki.py =================================================================== --- trunk/pywikipedia/standardize_interwiki.py 2010-10-09 16:11:46 UTC (rev 8629) +++ trunk/pywikipedia/standardize_interwiki.py 2010-10-09 19:32:57 UTC (rev 8630) @@ -9,7 +9,7 @@ """ # # (C) Rob W.W. Hooft, 2003 -# (C) Filnik, 2007 +# (C) Pywikipedia bot team, 2003-2010 # # Distributed under the terms of the MIT license. # @@ -17,8 +17,9 @@ #
import os, sys -import wikipedia, config import difflib +import wikipedia as pywikibot +import config
# The summary that the Bot will use. comment = { @@ -47,47 +48,48 @@ nothing = False
# Load the default parameters and start -for arg in wikipedia.handleArgs(): +for arg in pywikibot.handleArgs(): if arg.startswith('-start'): if len(arg) == 6: - start = unicode(wikipedia.input(u'From what page do you want to start?')) + start = unicode(pywikibot.input( + u'From what page do you want to start?')) else: start = unicode(arg[7:])
-site = wikipedia.getSite() -comm = wikipedia.translate(site, comment) +site = pywikibot.getSite() +comm = pywikibot.translate(site, comment)
# What follows is the main part of the code. try: for pl in site.allpages(start): plname = pl.title() - wikipedia.output(u'\nLoading %s...' % plname) + pywikibot.output(u'\nLoading %s...' % plname) try: oldtext = pl.get() - except wikipedia.IsRedirectPage: - wikipedia.output(u"%s is a redirect!" % plname) + except pywikibot.IsRedirectPage: + pywikibot.output(u"%s is a redirect!" % plname) continue old = pl.interwiki() new = {} for pl2 in old: new[pl2.site()] = pl2 - newtext = wikipedia.replaceLanguageLinks(oldtext, new) + newtext = pywikibot.replaceLanguageLinks(oldtext, new) if new: if oldtext != newtext: - wikipedia.showDiff(oldtext, newtext) + pywikibot.showDiff(oldtext, newtext) # Submit changes try: status, reason, data = pl.put(newtext, comment=comm) if str(status) != '302': - wikipedia.output(status, reason) - except wikipedia.LockedPage: - wikipedia.output(u"%s is locked" % plname) + pywikibot.output(status, reason) + except pywikibot.LockedPage: + pywikibot.output(u"%s is locked" % plname) continue else: - wikipedia.output(u'No changes needed.') + pywikibot.output(u'No changes needed.') continue else: - wikipedia.output(u'No interwiki found.') + pywikibot.output(u'No interwiki found.') continue finally: - wikipedia.stopme() + pywikibot.stopme()
Modified: trunk/pywikipedia/standardize_notes.py =================================================================== --- trunk/pywikipedia/standardize_notes.py 2010-10-09 16:11:46 UTC (rev 8629) +++ trunk/pywikipedia/standardize_notes.py 2010-10-09 19:32:57 UTC (rev 8630) @@ -8,7 +8,7 @@
NOTE: This script is not capable of handling the <ref></ref> syntax. It just handles the {{ref}} syntax, which is still used, but DEPRECATED on the English -Wikipedia. +wikipedia.
You can run the bot with the following commandline parameters:
@@ -20,7 +20,8 @@ -page - Only edit a single page. Argument can also be given as "-page:pagename". You can give this parameter multiple times to edit multiple pages. --regex - Make replacements using regular expressions. (Obsolete; always True) +-regex - Make replacements using regular expressions. + (Obsolete; always True) -except:XYZ - Ignore pages which contain XYZ. If the -regex argument is given, XYZ will be regarded as a regular expression. -namespace:n - Namespace to process. Works only with a sql dump @@ -41,17 +42,18 @@ # __version__ = '$Id$' # -# 2005-07-15: Find name of section containing citations: doFindRefSection(). (SEWilco) +# 2005-07-15: Find name of section containing citations: doFindRefSection(). +# (SEWilco) # 2005-07-15: Obey robots.txt restrictions. (SEWilco) -# 2005-07-15: Build list of all sections which may contain citations: doFindAllCitationSections(). (SEWilco) +# 2005-07-15: Build list of all sections which may contain citations: +# doFindAllCitationSections(). (SEWilco) #
-#from __future__ import generators import subprocess, sys, re, random import socket, urllib, robotparser -import wikipedia, pagegenerators, config - from datetime import date +import wikipedia as pywikibot +import pagegenerators, config
# httpcache is optional have_httpcache = True @@ -77,7 +79,8 @@ }
fixes = { - # These replacements will convert alternate reference formats to format used by this tool. + # These replacements will convert alternate reference formats to format used + # by this tool. 'ALTREFS': { 'regex': True, # We don't want to mess up pages which discuss HTML tags, so we skip @@ -95,7 +98,8 @@ }, 'replacements': [ # Everything case-insensitive (?i) - # These translate variations of footnote templates to ref|note format. + # These translate variations of footnote templates to ref|note + # format. (r'(?i){{an|(.*?)}}', r"{{ref|\1}}"), (r'(?i){{anb|(.*?)}}', r"{{note|\1}}"), (r'(?i){{endnote|(.*?)}}', r"{{note|\1}}"), @@ -141,50 +145,56 @@
# news sites for which to generate 'news reference' citations, the org name, and prefix to strip newssites = [ - ( 'abcnews.go.com', 'ABC News', 'ABC News: ' ), - ( 'books.guardian.co.uk', 'The Guardian', 'Guardian Unlimited : The Guardian : ' ), - ( 'edition.cnn.com', 'CNN', 'CNN.com - ' ), - ( 'news.bbc.co.uk', 'BBC', 'BBC NEWS : ' ), - ( 'news.scotsman.com', 'The Scotsman', 'Scotsman.com News - ' ), - ( 'nyobserver.com', 'New York Observer', '' ), - ( 'observer.guardian.co.uk', 'The Guardian', 'The Observer : ' ), - ( 'politics.guardian.co.uk', 'The Guardian', 'Guardian Unlimited Politics : ' ), - ( 'seattletimes.nwsource.com', 'The Seattle Times', 'The Seattle Times: ' ), - ( 'service.spiegel.de', 'Der Spiegel', '' ), - ( 'thescotsman.scotsman.com', 'The Scotsman', 'The Scotsman - ' ), - ( 'today.reuters.com', 'Reuters', 'Latest News and Financial Information : ' ), - ( 'today.reuters.co.uk', 'Reuters', 'Latest News and Financial Information : ' ), - ( 'www.boston.com', 'The Boston Globe', 'Boston.com / ' ), - ( 'www.cbsnews.com', 'CBS News', 'CBS News : ' ), - ( 'www.cnn.com', 'CNN', 'CNN.com - ' ), - ( 'www.cnsnews.com', 'Cybercast News Service', '' ), - ( 'www.csmonitor.com', 'Christian Science Monitor', '' ), - ( 'www.dallasnews.com', 'The Dallas Morning News', '' ), - ( 'www.forbes.com', 'Forbes', '' ), - ( 'www.foxnews.com', 'Fox News Channel', 'FOXNews.com - ' ), - ( 'www.gnn.com', 'Government News Network', 'GNN - ' ), - ( 'www.guardian.co.uk', 'The Guardian', 'Guardian Unlimited : The Guardian : ' ), - ( 'www.latimes.com', 'Los Angeles Times', '' ), - ( 'www.msnbc.msn.com', 'MSNBC', '' ), - ( 'www.nationalreview.com', 'National Review', '' ), - ( 'www.nytimes.com', 'The New York Times', '' ), - ( 'www.sfgate.com', 'San Francisco Chronicle', '' ), - ( 'www.socialistworker.co.uk', 'Socialist Worker', '' ), - ( 'www.spectator.org', 'The American Spectator', '' ), - ( 'www.telegraph.co.uk', 'The Daily Telegraph', 'Telegraph newspaper online - ' ), - ( 'www.time.com', 'TIME', '' ), - ( 'www.timesonline.co.uk', 'The Times', 'World news from The Times and the Sunday Times - ' ), - ( 'www.usatoday.com', 'USA Today', 'USATODAY.com - ' ), - ( 'www.washingtonpost.com', 'The Washington Post', '' ), - ( 'www.washtimes.com', 'The Washington Times', '' ), - ( 'www.weeklystandard.com', 'The Weekly Standard', '' ), - ( 'www.wired.com', 'Wired magazine', 'Wired News: ' ), - ( 'wwwimage.cbsnews.com', 'CBS News', 'CBS News : ' ), + ('abcnews.go.com', 'ABC News', 'ABC News: '), + ('books.guardian.co.uk', 'The Guardian', + 'Guardian Unlimited : The Guardian : '), + ('edition.cnn.com', 'CNN', 'CNN.com - '), + ('news.bbc.co.uk', 'BBC', 'BBC NEWS : '), + ('news.scotsman.com', 'The Scotsman', 'Scotsman.com News - '), + ('nyobserver.com', 'New York Observer', ''), + ('observer.guardian.co.uk', 'The Guardian', 'The Observer : '), + ('politics.guardian.co.uk', 'The Guardian', + 'Guardian Unlimited Politics : '), + ('seattletimes.nwsource.com', 'The Seattle Times', 'The Seattle Times: '), + ('service.spiegel.de', 'Der Spiegel', ''), + ('thescotsman.scotsman.com', 'The Scotsman', 'The Scotsman - '), + ('today.reuters.com', 'Reuters', 'Latest News and Financial Information : '), + ('today.reuters.co.uk', 'Reuters', + 'Latest News and Financial Information : '), + ('www.boston.com', 'The Boston Globe', 'Boston.com / '), + ('www.cbsnews.com', 'CBS News', 'CBS News : '), + ('www.cnn.com', 'CNN', 'CNN.com - '), + ('www.cnsnews.com', 'Cybercast News Service', ''), + ('www.csmonitor.com', 'Christian Science 
Monitor', ''), + ('www.dallasnews.com', 'The Dallas Morning News', ''), + ('www.forbes.com', 'Forbes', ''), + ('www.foxnews.com', 'Fox News Channel', 'FOXNews.com - '), + ('www.gnn.com', 'Government News Network', 'GNN - '), + ('www.guardian.co.uk', 'The Guardian', + 'Guardian Unlimited : The Guardian : '), + ('www.latimes.com', 'Los Angeles Times', ''), + ('www.msnbc.msn.com', 'MSNBC', ''), + ('www.nationalreview.com', 'National Review', ''), + ('www.nytimes.com', 'The New York Times', ''), + ('www.sfgate.com', 'San Francisco Chronicle', ''), + ('www.socialistworker.co.uk', 'Socialist Worker', ''), + ('www.spectator.org', 'The American Spectator', ''), + ('www.telegraph.co.uk', 'The Daily Telegraph', + 'Telegraph newspaper online - '), + ('www.time.com', 'TIME', ''), + ('www.timesonline.co.uk', 'The Times', + 'World news from The Times and the Sunday Times - '), + ('www.usatoday.com', 'USA Today', 'USATODAY.com - '), + ('www.washingtonpost.com', 'The Washington Post', ''), + ('www.washtimes.com', 'The Washington Times', ''), + ('www.weeklystandard.com', 'The Weekly Standard', ''), + ('www.wired.com', 'Wired magazine', 'Wired News: '), + ('wwwimage.cbsnews.com', 'CBS News', 'CBS News : '), ]
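For illustration, a rough sketch of how a newssites entry appears to be consumed further down in this diff (doCreateReferenceFromExternalLink): match the link against a known news site, strip the site-specific title prefix, and emit a {{news reference}} template. The standalone helper and the substring match on the URL are simplifications; the template layout and the (sitename, org, prefix) tuple shape are taken from the code in this commit.

    from datetime import date

    NEWSSITES = [
        ('news.bbc.co.uk', 'BBC', 'BBC NEWS : '),
    ]

    def news_reference(url, title):
        # Strip the known title prefix and build the citation template.
        for sitename, newscompany, stripprefix in NEWSSITES:
            if sitename in url:
                if stripprefix and title.startswith(stripprefix):
                    title = title[len(stripprefix):]
                return (u'{{news reference | title=%s | url=%s | urldate=%s | org=%s }}'
                        % (title, url, date.today().isoformat(), newscompany))
        return None

    print(news_reference('http://news.bbc.co.uk/2/hi/example.stm',
                         u'BBC NEWS : Example headline'))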
+ class ReplacePageGenerator: - """ - Generator which will yield Pages for pages that might contain text to + """ Generator which will yield Pages for pages that might contain text to replace. These pages might be retrieved from a local SQL dump file or a text file, or as a list of pages entered by the user.
@@ -205,7 +215,9 @@ will be used when source is 'sqldump'. * pagenames - a list of pages which will be used when source is 'userinput'. + """ + def __init__(self, source, replacements, exceptions, regex = False, namespace = -1, textfilename = None, sqlfilename = None, categoryname = None, pagenames = None): self.source = source self.replacements = replacements @@ -218,8 +230,7 @@ self.pagenames = pagenames
def read_pages_from_sql_dump(self): - """ - Generator which will yield Pages for pages that might contain text to + """ Generator which will yield Pages for pages that might contain text to replace. These pages will be retrieved from a local sql dump file (cur table).
@@ -229,12 +240,13 @@ are values * exceptions - a list of strings; pages which contain one of these won't be changed. - * regex - if the entries of replacements and exceptions should - be interpreted as regular expressions + * regex - if the entries of replacements and exceptions + should be interpreted as regular expressions + """ - mysite = wikipedia.getSite() + mysite = pywikibot.getSite() import sqldump - dump = sqldump.SQLdump(self.sqlfilename, wikipedia.getSite().encoding()) + dump = sqldump.SQLdump(self.sqlfilename, pywikibot.getSite().encoding()) for entry in dump.entries(): skip_page = False if self.namespace != -1 and self.namespace != entry.namespace: @@ -255,11 +267,11 @@ if self.regex: old = re.compile(old) if old.search(entry.text): - yield wikipedia.Page(mysite, entry.full_title()) + yield pywikibot.Page(mysite, entry.full_title()) break else: if old in entry.text: - yield wikipedia.Page(mysite, entry.full_title()) + yield pywikibot.Page(mysite, entry.full_title()) break
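For illustration, the essence of read_pages_from_sql_dump is "yield every title whose wikitext matches one of the old strings or regexes". A simplified, self-contained sketch over plain (title, text) pairs, which stand in here for the sqldump entries used in the real code:

    import re

    def titles_needing_replacement(entries, replacements, use_regex=False):
        # entries: iterable of (title, text); replacements: {old: new}.
        for title, text in entries:
            for old in replacements:
                if use_regex and re.search(old, text):
                    yield title
                    break
                elif not use_regex and old in text:
                    yield title
                    break

    pages = [(u'Foo', u'See {{an|x}} here.'), (u'Bar', u'Nothing to do.')]
    print(list(titles_needing_replacement(pages, {r'\{\{an\|': u'{{ref|'},
                                          use_regex=True)))
    # Only Foo contains a match, so only its title is yielded.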
def read_pages_from_category(self): @@ -270,9 +282,10 @@
Arguments: * textfilename - the textfile's path, either absolute or relative + """ import catlib - category = catlib.Category(wikipedia.getSite(), self.categoryname) + category = catlib.Category(pywikibot.getSite(), self.categoryname) for page in category.articles(recurse = False): yield page
@@ -284,6 +297,7 @@
Arguments: * textfilename - the textfile's path, either absolute or relative + """ f = open(self.textfilename, 'r') # regular expression which will find [[wiki links]] @@ -294,7 +308,7 @@ # TODO: use findall() instead. m=R.match(line) if m: - yield wikipedia.Page(wikipedia.getSite(), m.group(1)) + yield pywikibot.Page(pywikibot.getSite(), m.group(1)) f.close()
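For illustration, a self-contained version of the [[wiki link]] scan that read_pages_from_text_file performs, yielding bare titles instead of Page objects and reading from a list of lines rather than an open file. The exact link regex is not visible in this hunk, so the pattern below is an assumption (and, as the TODO in the code notes, findall would pick up more than one link per line).

    import re

    def titles_from_lines(lines):
        # Assumed pattern: take the first [[...]] link at the start of a line.
        link_re = re.compile(r'\[\[(.+?)\]\]')
        for line in lines:
            m = link_re.match(line)
            if m:
                yield m.group(1)

    sample = [u'[[Main Page]]\n', u'not a link\n', u'[[Help:Contents]]\n']
    print(list(titles_from_lines(sample)))
    # Main Page and Help:Contents are yielded; the middle line is skipped.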
def read_pages_from_wiki_page(self): @@ -305,9 +319,10 @@
Arguments: * pagetitle - the title of a page on the home wiki + ''' - listpage = wikipedia.Page(wikipedia.getSite(), self.pagetitle) - list = wikipedia.get(listpage) + listpage = pywikibot.Page(pywikibot.getSite(), self.pagetitle) + list = pywikibot.get(listpage) # TODO - UNFINISHED
# TODO: Make MediaWiki's search feature available. @@ -326,7 +341,7 @@ yield pl elif self.source == 'userinput': for pagename in self.pagenames: - yield wikipedia.Page(wikipedia.getSite(), pagename) + yield pywikibot.Page(pywikibot.getSite(), pagename)
class ReplaceRobot: def __init__(self, generator, replacements, refsequence, references, @@ -375,36 +390,40 @@ new_text = new_text.replace(old, new)
# Find name of Notes section. - refsectionname = self.doFindRefSection( new_text ) + refsectionname = self.doFindRefSection(new_text) # Get list of all sections which may contain citations. - refsectionlist = self.doFindAllCitationSections( new_text, refsectionname ) + refsectionlist = self.doFindAllCitationSections(new_text, + refsectionname) # Read existing Notes section contents into references list - wikipedia.output( u"Reading existing Notes section" ) + pywikibot.output(u"Reading existing Notes section") self.doReadReferencesSection( new_text, refsectionname ) while self.references and self.references[len(self.references)-1] == u'\n': del self.references[len(self.references)-1] # delete trailing empty lines # Convert any external links to footnote references - wikipedia.output( u"Converting external links" ) - new_text = self.doConvertExternalLinks( new_text ) + pywikibot.output(u"Converting external links" ) + new_text = self.doConvertExternalLinks(new_text) # Accumulate ordered list of all references - wikipedia.output( u"Collecting references" ) + pywikibot.output(u"Collecting references") (duplicatefound, self.refusage) = self.doBuildSequenceListOfReferences( new_text ) # Rewrite references, including dealing with duplicates. - wikipedia.output( u"Rewriting references" ) - new_text = self.doRewriteReferences( new_text, self.refusage, refsectionname ) + pywikibot.output(u"Rewriting references") + new_text = self.doRewriteReferences(new_text, self.refusage, + refsectionname) # Reorder Notes to match sequence of ordered list - wikipedia.output( u"Collating references" ) - self.references = self.doReorderReferences( self.references, self.refusage) + pywikibot.output(u"Collating references") + self.references = self.doReorderReferences(self.references, + self.refusage) # Rebuild Notes section - wikipedia.output( u"Rebuilding References section" ) - new_text = self.doUpdateReferencesSection( new_text, self.refusage, refsectionname ) + pywikibot.output(u"Rebuilding References section" ) + new_text = self.doUpdateReferencesSection(new_text, self.refusage, + refsectionname) return new_text
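For illustration, the order of the doReplacements stages shown above, collapsed into a runnable stub whose log lines mirror the pywikibot.output calls in the diff. The real stages are the doXxx methods of ReplaceRobot; the stubs below do no text processing.

    def do_replacements_outline(text):
        # doFindRefSection / doFindAllCitationSections run first to locate
        # the Notes section(s); then the stages below, in this order.
        print(u'Reading existing Notes section')    # doReadReferencesSection
        print(u'Converting external links')         # doConvertExternalLinks
        print(u'Collecting references')             # doBuildSequenceListOfReferences
        print(u'Rewriting references')              # doRewriteReferences
        print(u'Collating references')              # doReorderReferences
        print(u'Rebuilding References section')     # doUpdateReferencesSection
        return text

    do_replacements_outline(u'page text')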
def doConvertExternalLinks(self, original_text): + """ Returns the text which is generated by converting external links to + References. Adds References to reference list. + """ - Returns the text which is generated by converting external links to References. - Adds References to reference list. - """ new_text = '' # Default is no text skipsection = False for text_line in original_text.splitlines(True): # Scan all text line by line @@ -422,7 +441,7 @@ # TODO: recognize {{inline}} invisible footnotes when something can be done with them # # Ignore lines within comments - if not text_line.startswith( u'<!--' ): + if not text_line.startswith( u'<!--'): # Fix erroneous external links in double brackets Rextlink = re.compile(r'(?i)[[(?P<linkname>http://%5B%5E%5C%5D%5D+?)%5C%5D%5C]') # TODO: compiling the regex each time might be inefficient @@ -485,20 +504,17 @@ m = re.search( r'==+(?P<sectionname>[^=]+)==', text_line ) if m: # if in a section, remember section name sectionname = m.group('sectionname').strip() - wikipedia.output( u'Section: %s' % sectionname ) + pywikibot.output( u'Section: %s' % sectionname ) else: # else not a section name so look for reference n = re.search( r'(i?){{(note|ibid)[|]', text_line ) if n: # if reference found refsectionname = sectionname # found reference section - wikipedia.output( u'Ref section: %s' % refsectionname ) + pywikibot.output( u'Ref section: %s' % refsectionname ) break # stop looking return refsectionname
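For illustration, a self-contained version of the doFindRefSection scan: remember the most recent ==Section== heading and, when a {{note|...}} or {{ibid|...}} reference is seen, report that heading as the references section. Both regexes are copied from the diff (including the (i?) group, which looks like it was meant to be the (?i) flag).

    import re

    def find_ref_section_name(text):
        refsectionname = u''
        sectionname = u''
        for line in text.splitlines(True):
            m = re.search(r'==+(?P<sectionname>[^=]+)==', line)
            if m:
                sectionname = m.group('sectionname').strip()
            elif re.search(r'(i?){{(note|ibid)[|]', line):
                refsectionname = sectionname
                break
        return refsectionname

    sample = u'Intro.\n== Sources ==\n# {{note|smith2004}} Smith (2004).\n'
    print(find_ref_section_name(sample))
    # -> Sources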
def doFindAllCitationSections(self, original_text, refsectionname): - - """ - Returns list of sections which may contain citations. - """ + """ Returns list of sections which may contain citations. """ refsectionlist = [ ( refsectionname) ] sectionname = '' for text_line in original_text.splitlines(True): # Scan all text line by line @@ -523,13 +539,13 @@ if m: # if in a section, check if should skip this section if refsectionname != '': # if a certain section name has been identified m_section = m.group('sectionname') - wikipedia.output( u'Looking for "%s": "%s"' % (refsectionname,unicode(m_section)) ) + pywikibot.output( u'Looking for "%s": "%s"' % (refsectionname,unicode(m_section)) ) if unicode(m_section.strip()) == unicode(refsectionname): - wikipedia.output( u'Found Ref section.' ) + pywikibot.output( u'Found Ref section.') skipsection = True # skipsection left True so no further links converted else: # else grab all possible sections if m.group('sectionname').lower().strip() in referencesectionnames: - wikipedia.output( 'RefSection found by default names: %s' % m.group('sectionname') ) + pywikibot.output('RefSection found by default names: %s' % m.group('sectionname') ) skipsection = True # skipsection left True so no further links converted if skipsection: new_text = new_text + text_line # skip section, so retain text. @@ -543,11 +559,11 @@ m = Rtext_line.search( text_line ) alphabet26 = u'abcdefghijklmnopqrstuvwxyz' while m: # if found a reference - if m.group('reftype').lower() in ( 'ref', 'ref_num', 'ref_label' ): # confirm ref + if m.group('reftype').lower() in ('ref', 'ref_num', 'ref_label'): # confirm ref refkey = m.group('refname').strip() if refkey != '': if refkey in refusage: - # wikipedia.output( u'refusage[%s] = %s' % (refkey,refusage[refkey]) ) + # pywikibot.output( u'refusage[%s] = %s' % (refkey,refusage[refkey]) ) if refusage[refkey][2] == 0: # if first use of reference text_line=text_line[:m.start(0)] + '{{ref|%s}}' % (refkey) + text_line[m.end(0):] refusage[refkey][2] += 1 # count use of reference @@ -574,60 +590,71 @@ urlfile = None urlheaders = None if len(extlink_linkname) > 5: - socket.setdefaulttimeout( 20 ) # timeout in seconds - wikipedia.get_throttle() # throttle down to Wikipedia rate + socket.setdefaulttimeout(20) # timeout in seconds + pywikibot.get_throttle() # throttle down to Wikipedia rate # Obey robots.txt restrictions rp = robotparser.RobotFileParser() rp.set_url( extlink_linkname ) try: rp.read() # read robots.txt except (IOError, socket.timeout): - wikipedia.output( u'Error accessing URL: %s' % unicode(extlink_linkname) ) + pywikibot.output(u'Error accessing URL: %s' + % unicode(extlink_linkname)) else: urlobj = None if not rp.can_fetch( "*", extlink_linkname ): - wikipedia.output( u'Robot prohibited: %s' % unicode(extlink_linkname) ) + pywikibot.output(u'Robot prohibited: %s' + % unicode(extlink_linkname)) else: # else access allowed try: if have_httpcache: - cache = HTTPCache( extlink_linkname ) + cache = HTTPCache(extlink_linkname) urlfile = cache.filename() # filename of cached date urlheaders = cache.info() else: - (urlfile, urlheaders) = urllib.urlretrieve( extlink_linkname ) + (urlfile, urlheaders) = urllib.urlretrieve(extlink_linkname) except IOError: - wikipedia.output( u'Error accessing URL. %s' % unicode(extlink_linkname) ) + pywikibot.output(u'Error accessing URL. %s' + % unicode(extlink_linkname)) except (socket.herror, socket.gaierror), (err, msg): - wikipedia.output( u'Error %i accessing URL, %s. 
%s' % (err, unicode(msg), unicode(extlink_linkname)) ) + pywikibot.output(u'Error %i accessing URL, %s. %s' + % (err, unicode(msg), + unicode(extlink_linkname))) except socket.timeout, msg: - wikipedia.output( u'Error accessing URL, %s. %s' % (unicode(msg), unicode(extlink_linkname)) ) + pywikibot.output(u'Error accessing URL, %s. %s' + % (unicode(msg), + unicode(extlink_linkname))) except: # Ignore other errors pass if urlfile != None: urlobj = open( urlfile ) if extlink_linkname.lower().endswith('.pdf'): # If file has a PDF suffix - wikipedia.output( u'PDF file.' ) + pywikibot.output( u'PDF file.') try: pdfinfo_out = subprocess.Popen([r"pdfinfo","/dev/stdin"], stdin=urlobj, stdout=subprocess.PIPE, shell=False).communicate()[0] for aline in pdfinfo_out.splitlines(): if aline.lower().startswith('title'): urltitle = aline.split(None)[1:] urltitle = ' '.join(urltitle) - if urltitle != '': wikipedia.output(u'title: ' +urltitle ) + if urltitle: + pywikibot.output(u'title: %s' + % urltitle) else: if aline.lower().startswith('author'): urlauthor = aline.split(None)[1:] urlauthor = ' '.join(urlauthor) - if urlauthor != '': wikipedia.output(u'author: ' +urlauthor ) + if urlauthor: + pywikibot.output(u'author: %s' + % urlauthor ) except ValueError: - wikipedia.output( u'pdfinfo value error.' ) + pywikibot.output( u'pdfinfo value error.') except OSError: - wikipedia.output( u'pdfinfo OS error.' ) + pywikibot.output( u'pdfinfo OS error.') except: # Ignore errors - wikipedia.output( u'PDF processing error.' ) + pywikibot.output( u'PDF processing error.') pass - wikipedia.output( u'PDF done.' ) + pywikibot.output( u'PDF done.') if urlobj: urlobj.close() else: @@ -643,14 +670,16 @@ except: urltitle = u' ' # error, no title urltitle = u' '.join(urltitle.split()) # merge whitespace - wikipedia.output( u'::::Title: %s' % urltitle ) + pywikibot.output( u'::::Title: %s' % urltitle ) break # found a title so stop looking else: if maxalines < 1: - wikipedia.output( u'No title in URL. %s' % unicode(extlink_linkname) ) + pywikibot.output( + u'No title in URL. 
%s' + % unicode(extlink_linkname) ) else: if urlobj != None: - wikipedia.output( u'::+URL: ' + extlink_linkname ) + pywikibot.output( u'::+URL: ' + extlink_linkname ) # urlinfo = urlobj.info() aline = urlobj.read() full_page = '' @@ -664,7 +693,7 @@ try: urltitle = unicode(titleRE.group('HTMLtitle'), 'utf-8') urltitle = u' '.join(urltitle.split()) # merge whitespace - wikipedia.output( u'::::Title: %s' % urltitle ) + pywikibot.output( u'::::Title: %s' % urltitle ) except: aline = urlobj.read() continue @@ -676,7 +705,7 @@ aline = urlobj.read() else: aline = urlobj.read() - if urltitle != '': wikipedia.output( u'title: ' + urltitle ) + if urltitle != '': pywikibot.output( u'title: ' + urltitle ) # Try a more advanced search ##from nltk.parser.probabilistic import * ##from nltk.tokenizer import * @@ -698,17 +727,17 @@ #for tok in train_tokens: britaggerrules.train(tok, max_rules=200, min_score=2) # brittaggerrul = britaggerrules.train(train_tokens, max_rules=200, min_score=2) #britaggerrul = () - #britagger = BrillTagger(initial_tagger=unitagger, rules=britaggerrul, SUBTOKENS='WORDS' ) + #britagger = BrillTagger(initial_tagger=unitagger, rules=britaggerrul, SUBTOKENS='WORDS') # Training completed # Examine text ##text_token = Token(TEXT=full_page) ##WhitespaceTokenizer(SUBTOKENS='WORDS').tokenize(text_token) #unitagger.tag(text_token) #britagger.tag(text_token) - ### wikipedia.output( unicode(text_token) ) + ### pywikibot.output( unicode(text_token) ) else: - wikipedia.output( u'No data retrieved.' ) - socket.setdefaulttimeout( 200 ) # timeout in seconds + pywikibot.output( u'No data retrieved.') + socket.setdefaulttimeout(200) urltitle = urltitle.replace(u'|',u':') return urltitle.strip()
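For illustration, the last part of doGetTitleFromURL once a page body is in hand: pull the contents of the <title> element, merge whitespace, and replace '|' so the result can sit inside a wiki template. The fetching, robots.txt and PDF branches above are omitted, and the pattern is an assumption (the diff shows a titleRE group named HTMLtitle but not the pattern itself).

    import re

    def title_from_html(html):
        # Grab <title>...</title>, collapse whitespace, make it template-safe.
        m = re.search(r'(?is)<title[^>]*>(?P<HTMLtitle>.*?)</title>', html)
        if not m:
            return u''
        title = u' '.join(m.group('HTMLtitle').split())
        return title.replace(u'|', u':').strip()

    page = u'<html><head><title>\n Example \n Domain </title></head></html>'
    print(title_from_html(page))
    # -> Example Domain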
@@ -731,11 +760,11 @@ new_text = u'' now = date.today() if extlink_linktext == None or len(extlink_linktext.strip()) < 20: - wikipedia.output( u'Fetching URL: %s' % unicode(extlink_linkname) ) + pywikibot.output( u'Fetching URL: %s' % unicode(extlink_linkname) ) urltitle = self.doGetTitleFromURL( extlink_linkname ) # try to get title from URL if urltitle == None or urltitle == '': - urltitle = extlink_linkname # Assume linkname for title - wikipedia.output( u'Title is: %s' % urltitle ) + urltitle = extlink_linkname + pywikibot.output( u'Title is: %s' % urltitle ) extlink_linktext = urltitle for newref in self.references: # scan through all references if extlink_linkname in newref: # if undescribed linkname same as a previous entry @@ -750,7 +779,7 @@ for (sitename, newscompany, stripprefix) in newssites: if refname.startswith( sitename ): # If there is a prefix to strip from the title - if stripprefix and extlink_linktext.startswith( stripprefix ): + if stripprefix and extlink_linktext.startswith(stripprefix): extlink_linktext = extlink_linktext[len(stripprefix):] new_text = u'{{news reference | title=%s | url=%s | urldate=%s | org=%s }}' % ( extlink_linktext, extlink_linkname, now.isoformat(), newscompany ) + '\n' break @@ -764,12 +793,14 @@ a format suitable for the Notes section. """ # TODO: look up DOI info and create full reference - urltitle = self.doGetTitleFromURL( 'http://dx.doi.org/' + doi_linktext ) # try to get title from URL + urltitle = self.doGetTitleFromURL('http://dx.doi.org/' + doi_linktext ) # try to get title from URL refname = 'refbot%d' % refsequence if urltitle: - new_text = '# {{note|%s}} %s {{doi|%s}}' % (refname, urltitle, doi_linktext) + '\n' + new_text = '# {{note|%s}} %s {{doi|%s}}\n' \ + % (refname, urltitle, doi_linktext) else: - new_text = '# {{note|%s}} {{doi|%s}}' % (refname, doi_linktext) + '\n' + new_text = '# {{note|%s}} {{doi|%s}}\n' \ + % (refname, doi_linktext) return (refname, new_text)
def doBuildSequenceListOfReferences(self, original_text): @@ -777,14 +808,14 @@ Returns a list with all found references and sequence numbers. """ duplicatefound = False - refusage = {} # Nothing found yet + refusage = {} # Data structure: refusage[reference_key] = [ sequence_in_document, count, count_during_dup_handling ] for text_line in original_text.splitlines(True): # Scan all text line by line # Check for various references Rtext_line = re.compile(r'(?i){{(?P<reftype>ref|ref_num|ref_label)|(?P<refname>[^}|]+?)}}') m = Rtext_line.search( text_line ) while m: # if found a reference - if m.group('reftype').lower() in ( 'ref', 'ref_num', 'ref_label' ): # confirm ref + if m.group('reftype').lower() in ('ref', 'ref_num', 'ref_label'): # confirm ref refkey = m.group('refname').strip() if refkey != '': if refkey in refusage: @@ -793,7 +824,7 @@ else: refusage[refkey] = [len(refusage),0,0] # remember this reference m = Rtext_line.search( text_line, m.end() ) - wikipedia.output( u'Number of refs: %d' % (len(refusage)) ) + pywikibot.output( u'Number of refs: %d' % (len(refusage)) ) return (duplicatefound, refusage)
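For illustration, a self-contained version of doBuildSequenceListOfReferences using the documented refusage layout [sequence_in_document, count, count_during_dup_handling]. The duplicate branch is cut off by the hunk boundary above, so incrementing the count and flagging the duplicate here is an assumption; the {{ref|...}} pattern follows the diff, with the pipe escaped.

    import re

    REF_RE = re.compile(
        r'(?i){{(?P<reftype>ref|ref_num|ref_label)\|(?P<refname>[^}|]+?)}}')

    def build_reference_sequence(text):
        duplicatefound = False
        refusage = {}
        for line in text.splitlines(True):
            for m in REF_RE.finditer(line):
                refkey = m.group('refname').strip()
                if not refkey:
                    continue
                if refkey in refusage:
                    refusage[refkey][1] += 1    # assumed: count repeat use
                    duplicatefound = True
                else:
                    refusage[refkey] = [len(refusage), 0, 0]
        return duplicatefound, refusage

    text = u'A.{{ref|a}} B.{{ref|b}} A again.{{ref|a}}'
    print(build_reference_sequence(text))
    # a -> [0, 1, 0], b -> [1, 0, 0], and a duplicate was found.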
def doReadReferencesSection(self, original_text, refsectionname): @@ -803,7 +834,7 @@ Contents of all Notes sections will be read. """ # TODO: support subsections within Notes - new_text = '' # Default is no text + new_text = '' intargetsection = False for text_line in original_text.splitlines(True): # Scan all text line by line # Check for target section @@ -811,19 +842,20 @@ if m: # if in a section, check if Notes section if refsectionname != '': # if a certain section name has been identified m_section = m.group('sectionname') - wikipedia.output( u'Looking for "%s": "%s"' % (refsectionname,m_section) ) + pywikibot.output(u'Looking for "%s": "%s"' + % (refsectionname,m_section) ) if unicode(m_section.strip()) == unicode(refsectionname): - wikipedia.output( u'Read Ref section.' ) - intargetsection = True # flag as being in section + pywikibot.output(u'Read Ref section.') + intargetsection = True new_text = new_text + text_line else: - intargetsection = False # flag as not being in section + intargetsection = False else: # else grab all possible sections if m.group('sectionname').lower().strip() in referencesectionnames: - intargetsection = True # flag as being in section + intargetsection = True new_text = new_text + text_line else: - intargetsection = False # flag as not being in section + intargetsection = False else: if intargetsection: # if inside target section, remember this reference line if text_line.strip() != '': @@ -837,8 +869,8 @@ if intargetsection: # if still inside target section # Convert any # wiki list to *; will be converted later if a reference if text_line[0] == '#': - text_line = '*' + text_line[1:] # replace # with * wiki - self.references.append( text_line.rstrip() + u'\n' ) # Append line to references + text_line = '*' + text_line[1:] + self.references.append(text_line.rstrip() + u'\n') new_text = new_text + text_line.rstrip() + u'\n' return new_text
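For illustration, the per-line handling inside the target Notes section of doReadReferencesSection, reduced to a standalone helper: keep non-blank lines, rewriting a leading '#' list marker to '*' as the code above does, and collect them for the later rebuild. Locating the section (refsectionname and the referencesectionnames fallback) is assumed to have happened already.

    def collect_reference_lines(section_lines):
        references = []
        for line in section_lines:
            line = line.rstrip()
            if not line:
                continue
            if line.startswith(u'#'):
                line = u'*' + line[1:]     # '#' wiki list becomes '*'
            references.append(line + u'\n')
        return references

    notes = [u'# {{note|a}} Smith (2004).\n', u'\n',
             u'* {{note|b}} Jones (2001).\n']
    print(collect_reference_lines(notes))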
@@ -891,7 +923,7 @@ Returns the text which is generated by rebuilding the Notes section. Rewrite Notes section from references list. """ - new_text = '' # Default is no text + new_text = '' intargetsection = False for text_line in original_text.splitlines(True): # Scan all text line by line # Check for target section @@ -899,9 +931,9 @@ if m: # if in a section, check if Notes section if refsectionname != '': # if a certain section name has been identified m_section = m.group('sectionname') - wikipedia.output( u'Looking for "%s": "%s"' % (refsectionname,m_section) ) + pywikibot.output( u'Looking for "%s": "%s"' % (refsectionname,m_section) ) if unicode(m_section.strip()) == unicode(refsectionname): - wikipedia.output( u'Updating Ref section.' ) + pywikibot.output( u'Updating Ref section.') intargetsection = True # flag as being in section else: intargetsection = False # flag as not being in section @@ -933,7 +965,7 @@ if not intargetsection: # if not in Notes section, remember line new_text = new_text + text_line # append new line to new text # If references list not emptied, there was no Notes section found - if self.references != []: # empty references + if self.references != []: # New Notes section needs to be created at bottom. text_line_counter = 0 # current line last_text_line_counter_value = 0 # number of last line of possible text @@ -978,26 +1010,29 @@ # Load the page's text from the wiki original_text = pl.get() if pl.editRestriction: - wikipedia.output(u'Skipping locked page %s' % pl.title()) + pywikibot.output(u'Skipping locked page %s' % pl.title()) continue - except wikipedia.NoPage: - wikipedia.output(u'Page %s not found' % pl.title()) + except pywikibot.NoPage: + pywikibot.output(u'Page %s not found' % pl.title()) continue - except wikipedia.IsRedirectPage: + except pywikibot.IsRedirectPage: continue match = self.checkExceptions(original_text) # skip all pages that contain certain texts if match: - wikipedia.output(u'Skipping %s because it contains %s' % (pl.title(), match)) + pywikibot.output(u'Skipping %s because it contains %s' + % (pl.title(), match)) else: new_text = self.doReplacements(original_text) if new_text == original_text: - wikipedia.output('No changes were necessary in %s' % pl.title()) + pywikibot.output('No changes were necessary in %s' + % pl.title()) else: - wikipedia.output(u'>>> %s <<<' % pl.title()) - wikipedia.showDiff(original_text, new_text) + pywikibot.output(u'>>> %s <<<' % pl.title()) + pywikibot.showDiff(original_text, new_text) if not self.acceptall: - choice = wikipedia.input(u'Do you want to accept these changes? [y|n|a(ll)]') + choice = pywikibot.input( + u'Do you want to accept these changes? [y|n|a(ll)]') if choice in ['a', 'A']: self.acceptall = True if self.acceptall or choice in ['y', 'Y']: @@ -1034,7 +1069,7 @@ # default to -1 which means all namespaces will be processed namespace = -1 # Load default summary message. - editSummary = wikipedia.translate(wikipedia.getSite(), msg) + editSummary = pywikibot.translate(pywikibot.getSite(), msg) # List of references in Notes section references = [] # Notes sequence number @@ -1043,31 +1078,33 @@ refusage = {}
# Read commandline parameters. - for arg in wikipedia.handleArgs(): + for arg in pywikibot.handleArgs(): if arg == '-regex': regex = True elif arg.startswith('-file'): if len(arg) == 5: - textfilename = wikipedia.input(u'Please enter the filename:') + textfilename = pywikibot.input(u'Please enter the filename:') else: textfilename = arg[6:] source = 'textfile' elif arg.startswith('-cat'): if len(arg) == 4: - categoryname = wikipedia.input(u'Please enter the category name:') + categoryname = pywikibot.input( + u'Please enter the category name:') else: categoryname = arg[5:] source = 'category' elif arg.startswith('-sql'): if len(arg) == 4: - sqlfilename = wikipedia.input(u'Please enter the SQL dump's filename:') + sqlfilename = pywikibot.input( + u'Please enter the SQL dump's filename:') else: sqlfilename = arg[5:] source = 'sqldump' elif arg.startswith('-page'): if len(arg) == 5: pagenames.append( - wikipedia.input(u'Which page do you want to change?')) + pywikibot.input(u'Which page do you want to change?')) else: pagenames.append(arg[6:]) source = 'userinput' @@ -1085,16 +1122,19 @@
if source == None or len(commandline_replacements) not in [0, 2]: # syntax error, show help text from the top of this file - wikipedia.output(__doc__, 'utf-8') + pywikibot.output(__doc__, 'utf-8') return if (len(commandline_replacements) == 2): replacements[commandline_replacements[0]] = commandline_replacements[1] - editSummary = wikipedia.translate(wikipedia.getSite(), msg ) % ' (-' + commandline_replacements[0] + ' +' + commandline_replacements[1] + ')' + editSummary = pywikibot.translate(pywikibot.getSite(), msg) + % ' (-' + commandline_replacements[0] + ' +' + commandline_replacements[1] + ')' else: change = '' - default_summary_message = wikipedia.translate(wikipedia.getSite(), msg) % change - wikipedia.output(u'The summary message will default to: %s' % default_summary_message) - summary_message = wikipedia.input(u'Press Enter to use this default message, or enter a description of the changes your bot will make:') + default_summary_message = pywikibot.translate(pywikibot.getSite(), msg) % change + pywikibot.output(u'The summary message will default to: %s' + % default_summary_message) + summary_message = pywikibot.input( + u'Press Enter to use this default message, or enter a description of the changes your bot will make:') if summary_message == '': summary_message = default_summary_message editSummary = summary_message @@ -1103,18 +1143,20 @@ try: fix = fixes['ALTREFS'] except KeyError: - wikipedia.output(u'Available predefined fixes are: %s' % fixes.keys()) + pywikibot.output(u'Available predefined fixes are: %s' + % fixes.keys()) return if 'regex' in fix: regex = fix['regex'] if 'msg' in fix: - editSummary = wikipedia.translate(wikipedia.getSite(), fix['msg']) + editSummary = pywikibot.translate(pywikibot.getSite(), fix['msg']) if 'exceptions' in fix: exceptions = fix['exceptions'] replacements = fix['replacements']
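For illustration, the '-file', '-cat', '-sql' and '-page' options above all follow one pattern: the bare flag prompts the user, while '-flag:value' carries the value inline (hence the arg[6:]-style slicing). A small standalone sketch of that pattern; the prompt callable stands in for pywikibot.input.

    def parse_option(arg, flag, prompt):
        # '-file' alone asks the user; '-file:pages.txt' supplies the value.
        if arg == flag:
            return prompt()
        if arg.startswith(flag + ':'):
            return arg[len(flag) + 1:]
        return None

    print(parse_option('-file:pages.txt', '-file', lambda: 'asked'))  # pages.txt
    print(parse_option('-file', '-file', lambda: 'asked'))            # asked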
- gen = ReplacePageGenerator(source, replacements, exceptions, regex, namespace, - textfilename, sqlfilename, categoryname, pagenames) + gen = ReplacePageGenerator(source, replacements, exceptions, regex, + namespace, textfilename, sqlfilename, + categoryname, pagenames) preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = 20) bot = ReplaceRobot(preloadingGen, replacements, refsequence, references, refusage, exceptions, regex, acceptall, editSummary) @@ -1125,4 +1167,4 @@ try: main() finally: - wikipedia.stopme() + pywikibot.stopme()