jenkins-bot has submitted this change and it was merged.
Change subject: [PEP8] changes, code improvements
......................................................................
[PEP8] changes, code improvements
Change-Id: Ibfa8741849c0c59d38963afac94ba92a2765bdf7
---
M standardize_notes.py
1 file changed, 447 insertions(+), 308 deletions(-)
Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/standardize_notes.py b/standardize_notes.py
index fec9af1..2682e21 100644
--- a/standardize_notes.py
+++ b/standardize_notes.py
@@ -35,8 +35,9 @@
 """
 # Derived from replace.py
 #
-# (C) Daniel Herding, 2004
-# Copyright Scot E. Wilcoxon 2005
+# (c) Daniel Herding, 2004
+# (c) Scot E. Wilcoxon, 2005
+# (c) pywikibot team, 2006-2013
 #
 # Distributed under the terms of the MIT license.
 #
@@ -49,11 +50,19 @@
 # doFindAllCitationSections(). (SEWilco)
 #
-import subprocess, sys, re, random
-import socket, urllib, robotparser
+import subprocess
+import sys
+import re
+import random
+import socket
+import urllib
+import robotparser
 from datetime import date
+import string
+
 import wikipedia as pywikibot
-import pagegenerators, config
+import pagegenerators
+import config
 # httpcache is optional
 have_httpcache = True
@@ -64,38 +73,38 @@
 # Summary messages in different languages
 msg = {
-    'ar':u'روبوت: معالجة مراجع تلقائية %s',
-    'de':u'Bot: Automatisierte Textersetzung %s',
-    'en':u'Robot: Automated reference processing %s',
-    'es':u'Robot: Reemplazo automático de texto %s',
-    'fr':u'Robot : Remplacement de texte automatisé %s',
-    'he':u'בוט: הופך את הערת השוליים %s לאוטומטית',
-    'hu':u'Robot: Automatikus szövegcsere %s',
-    'ia':u'Robot: Reimplaciamento automatic de texto %s',
-    'is':u'Vélmenni: breyti texta %s',
-    'nl':u'Bot: geautomatiseerde verwerking van referenties %s',
-    'pl':u'Robot automatycznie przetwarza źródła %s',
-    'pt':u'Bot: Mudança automática %s',
-    }
+    'ar': u'روبوت: معالجة مراجع تلقائية %s',
+    'de': u'Bot: Automatisierte Textersetzung %s',
+    'en': u'Robot: Automated reference processing %s',
+    'es': u'Robot: Reemplazo automático de texto %s',
+    'fr': u'Robot : Remplacement de texte automatisé %s',
+    'he': u'בוט: הופך את הערת השוליים %s לאוטומטית',
+    'hu': u'Robot: Automatikus szövegcsere %s',
+    'ia': u'Robot: Reimplaciamento automatic de texto %s',
+    'is': u'Vélmenni: breyti texta %s',
+    'nl': u'Bot: geautomatiseerde verwerking van referenties %s',
+    'pl': u'Robot automatycznie przetwarza źródła %s',
+    'pt': u'Bot: Mudança automática %s',
+}
 fixes = {
-    # These replacements will convert alternate reference formats to format used
-    # by this tool.
+    # These replacements will convert alternate reference formats to format
+    # used by this tool.
     'ALTREFS': {
         'regex': True,
         # We don't want to mess up pages which discuss HTML tags, so we skip
         # all pages which contain nowiki tags.
         'exceptions': ['<nowiki>[^<]{3,}</nowiki>'],
         'msg': {
-            'ar':u'روبوت: إضافة/ترتيب المراجع.',
-            'en':u'Robot: Adding/sorting references.',
-            'ar':u'روبوت: إضافة/ترتيب المراجع.',
-            'fr':u'Robot : Ajoute/trie les références.',
-            'he':u'בוט: מוסיף/מסדר הערות שוליים',
-            'ia':u'Robot: Addition/assortimento de referentias',
-            'nl':u'Bot: referenties toegevoegd/gesorteerd',
-            'pl':u'Robot dodaje/sortuje źródła',
-        },
+            'ar': u'روبوت: إضافة/ترتيب المراجع.',
+            'en': u'Robot: Adding/sorting references.',
+            'ar': u'روبوت: إضافة/ترتيب المراجع.',
+            'fr': u'Robot : Ajoute/trie les références.',
+            'he': u'בוט: מוסיף/מסדר הערות שוליים',
+            'ia': u'Robot: Addition/assortimento de referentias',
+            'nl': u'Bot: referenties toegevoegd/gesorteerd',
+            'pl': u'Robot dodaje/sortuje źródła',
+        },
         'replacements': [
             # Everything case-insensitive (?i)
             # These translate variations of footnote templates to ref|note
@@ -141,7 +150,7 @@
     'references',
     'source',
     'sources',
-    ]
+]
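Note that the 'msg' dict above carries the 'ar' key twice, before and after this change. Python dict literals accept duplicate keys without complaint and simply keep the last value, so one of the two 'ar' entries is dead. A minimal sketch of the behavior:

    # Duplicate keys in a dict literal do not raise; the last one wins.
    d = {'ar': u'first', 'en': u'second', 'ar': u'third'}
    assert d['ar'] == u'third'
    assert len(d) == 2  # only 'ar' and 'en' survive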
 # news sites for which to generate 'news reference' citations, the org name, and prefix to strip
 newssites = [
@@ -218,7 +227,9 @@
"""
-    def __init__(self, source, replacements, exceptions, regex = False, namespace = -1, textfilename = None, sqlfilename = None, categoryname = None, pagenames = None):
+    def __init__(self, source, replacements, exceptions, regex=False,
+                 namespace=-1, textfilename=None, sqlfilename=None,
+                 categoryname=None, pagenames=None):
         self.source = source
         self.replacements = replacements
         self.exceptions = exceptions
@@ -286,14 +297,14 @@
         """
         import catlib
         category = catlib.Category(pywikibot.getSite(), self.categoryname)
-        for page in category.articles(recurse = False):
+        for page in category.articles(recurse=False):
             yield page
     def read_pages_from_text_file(self):
         """
-        Generator which will yield pages that are listed in a text file created by
-        the bot operator. Will regard everything inside [[double brackets]] as a
-        page name, and yield Pages for these pages.
+        Generator which will yield pages that are listed in a text file created
+        by the bot operator. Will regard everything inside [[double brackets]]
+        as a page name, and yield Pages for these pages.
         Arguments:
             * textfilename - the textfile's path, either absolute or relative
@@ -306,13 +317,13 @@
         for line in f.readlines():
             # BUG: this will only find one link per line.
             # TODO: use findall() instead.
-            m=R.match(line)
+            m = R.match(line)
            if m:
                 yield pywikibot.Page(pywikibot.getSite(), m.group(1))
         f.close()
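The BUG/TODO comments above survive the cleanup. A minimal sketch of the findall() fix they suggest, with a simplified bracket pattern standing in for the script's actual R (the real pattern is not shown in this hunk):

    R = re.compile(r'\[\[(.+?)\]\]')  # assumed stand-in for the title pattern
    for line in f.readlines():
        # findall() returns every bracketed title on the line, not just the first
        for title in R.findall(line):
            yield pywikibot.Page(pywikibot.getSite(), title)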
     def read_pages_from_wiki_page(self):
-        '''
+        """
         Generator which will yield pages that are listed in a wiki page. Will
         regard everything inside [[double brackets]] as a page name, except for
         interwiki and category links, and yield Pages for these pages.
@@ -320,16 +331,14 @@
         Arguments:
             * pagetitle - the title of a page on the home wiki
-        '''
+        """
         listpage = pywikibot.Page(pywikibot.getSite(), self.pagetitle)
         list = pywikibot.get(listpage)
         # TODO - UNFINISHED
     # TODO: Make MediaWiki's search feature available.
     def __iter__(self):
-        '''
-        Starts the generator.
-        '''
+        """ Starts the generator. """
         if self.source == 'sqldump':
             for pl in self.read_pages_from_sql_dump():
                 yield pl
@@ -343,10 +352,11 @@
             for pagename in self.pagenames:
                 yield pywikibot.Page(pywikibot.getSite(), pagename)
+
 class ReplaceRobot:
     def __init__(self, generator, replacements, refsequence, references,
-                 refusage, exceptions = [], regex = False, acceptall = False,
-                 summary = ''):
+                 refusage, exceptions=[], regex=False, acceptall=False,
+                 summary=''):
         self.generator = generator
         self.replacements = replacements
         self.exceptions = exceptions
@@ -361,6 +371,7 @@
         """
         If one of the exceptions applies for the given text, returns the
         substring. which matches the exception. Otherwise it returns None.
+
         """
         for exception in self.exceptions:
             if self.regex:
@@ -372,12 +383,12 @@
                 hit = original_text.find(exception)
                 if hit != -1:
                     return original_text[hit:hit + len(exception)]
-        return None
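The reformatting keeps the pre-existing exceptions=[] signature. Default values are evaluated once, at function definition time, so a mutable list default is shared by every call that omits the argument. The usual defensive idiom, should it ever be adopted here, is a sketch like:

    class Example:
        # None sentinel avoids sharing one list object across instances
        def __init__(self, exceptions=None):
            self.exceptions = exceptions if exceptions is not None else []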
     def doReplacements(self, new_text):
         """
         Returns the text which is generated by applying all replacements to
         the given text.
+
         """
         # For any additional replacements, loop through them
@@ -396,15 +407,18 @@
                                              refsectionname)
             # Read existing Notes section contents into references list
             pywikibot.output(u"Reading existing Notes section")
-            self.doReadReferencesSection( new_text, refsectionname )
-            while self.references and self.references[len(self.references)-1] == u'\n':
-                del self.references[len(self.references)-1]  # delete trailing empty lines
+            self.doReadReferencesSection(new_text, refsectionname)
+            while self.references and \
+                  self.references[len(self.references) - 1] == u'\n':
+                # delete trailing empty lines
+                del self.references[len(self.references) - 1]
             # Convert any external links to footnote references
-            pywikibot.output(u"Converting external links" )
+            pywikibot.output(u"Converting external links")
             new_text = self.doConvertExternalLinks(new_text)
             # Accumulate ordered list of all references
             pywikibot.output(u"Collecting references")
-            (duplicatefound, self.refusage) = self.doBuildSequenceListOfReferences( new_text )
+            (duplicatefound,
+             self.refusage) = self.doBuildSequenceListOfReferences(new_text)
             # Rewrite references, including dealing with duplicates.
             pywikibot.output(u"Rewriting references")
             new_text = self.doRewriteReferences(new_text, self.refusage,
@@ -414,7 +428,7 @@
             self.references = self.doReorderReferences(self.references,
                                                        self.refusage)
             # Rebuild Notes section
-            pywikibot.output(u"Rebuilding References section" )
+            pywikibot.output(u"Rebuilding References section")
             new_text = self.doUpdateReferencesSection(new_text, self.refusage,
                                                       refsectionname)
         return new_text
@@ -424,71 +438,93 @@
         References. Adds References to reference list.
""" - new_text = '' # Default is no text + new_text = '' skipsection = False - for text_line in original_text.splitlines(True): # Scan all text line by line + # Scan all text line by line + for text_line in original_text.splitlines(True): # Check for protected sections m = re.search("== *(?P<sectionname>[^]|=]*) *==", text_line) # TODO: support subheadings within Notes section # TODO: support Notes in alphabetic order # TODO: support Notes in other orders - if m: # if in a section, check if should skip this section - if m.group('sectionname').lower().strip() in referencesectionnames: - skipsection = True # skipsection left True so no further links converted + if m: # if in a section, check if should skip this section + if m.group('sectionname').lower().strip() in \ + referencesectionnames: + # skipsection left True so no further links converted + skipsection = True if skipsection: - new_text = new_text + text_line # skip section, so retain text. + new_text += text_line # skip section, so retain text. else: - # TODO: recognize {{inline}} invisible footnotes when something can be done with them - # + # TODO: recognize {{inline}} invisible footnotes when something + # can be done with them + # Ignore lines within comments - if not text_line.startswith( u'<!--'): + if not text_line.startswith(u'<!--'): # Fix erroneous external links in double brackets - Rextlink = re.compile(r'(?i)[[(?P<linkname>http://%5B%5E%5C%5D%5D+?)%5C%5D%5C]') + Rextlink = re.compile( + r'(?i)[[(?P<linkname>http://%5B%5E%5C%5D%5D+?)%5C%5D%5C]') # TODO: compiling the regex each time might be inefficient text_lineR = re.compile(Rextlink) MOextlink = text_lineR.search(text_line) - while MOextlink: # find all links on line + while MOextlink: # find all links on line extlink_linkname = MOextlink.group('linkname') # Rewrite double brackets to single ones - text_line=text_line[:MOextlink.start()] + '[%s]' % extlink_linkname + text_line[MOextlink.end(0):] - MOextlink = text_lineR.search(text_line,MOextlink.start(0)+1) - # Regular expression to look for external link [linkname linktext] - linktext is optional. + text_line = text_line[:MOextlink.start()] + \ + '[%s]' % extlink_linkname + \ + text_line[MOextlink.end(0):] + MOextlink = text_lineR.search(text_line, + MOextlink.start(0) + 1) + # Regular expression to look for external link + # [linkname linktext] - linktext is optional. # Also accepts erroneous pipe symbol as separator. 
                     # Accepts wikilinks within <linktext>
-                    #Rextlink = re.compile(r'[^\[]\[(?P<linkname>[h]*[ft]+tp:[^ \[\]|]+?)(?P<linktext>[ |]+(( *[^\]\|]*)|( *\[\[.+?\]\])*)+)*\][^\]]')
-                    #Rextlink = re.compile(r'\[(?P<linkname>[h]*[ft]+tp:[^ \[\]|]+?)(?P<linktext>[ |]+(( *[^\]\|]*)|( *\[\[.+?\]\])*)+)*\]')
-                    Rextlink = re.compile(r'(?i)\[(?P<linkname>[h]*[ft]+tp:[^ \[\]|]+?)(?P<linktext>[ |]+(( *[^\]\|]*)|( *\[\[.+?\]\])*)+)*\]')
+                    Rextlink = re.compile(
+                        r'(?i)\[(?P<linkname>[h]*[ft]+tp:[^ \[\]|]+?)(?P<linktext>[ |]+(( *[^\]\|]*)|( *\[\[.+?\]\])*)+)*\]')
                     # TODO: compiling the regex each time might be inefficient
                     text_lineR = re.compile(Rextlink)
                     MOextlink = text_lineR.search(text_line)
-                    while MOextlink:  # find all links on line
+                    while MOextlink:  # find all links on line
                         extlink_linkname = MOextlink.group('linkname')
                         extlink_linktext = MOextlink.group('linktext')
                         self.refsequence += 1
-                        ( refname, reftext ) = self.doConvertLinkTextToReference(self.refsequence, extlink_linkname, extlink_linktext)
-                        self.references.append( reftext )  # append new entry to References
+                        (refname, reftext) = self.doConvertLinkTextToReference(
+                            self.refsequence, extlink_linkname,
+                            extlink_linktext)
+                        # append new entry to References
+                        self.references.append(reftext)
                         if extlink_linktext:
-                            # If there was text as part of link, reinsert text before footnote.
-                            text_line=text_line[:MOextlink.start(0)] + '%s{{ref|%s}}' % (extlink_linktext, refname) + text_line[MOextlink.end(0):]
+                            # If there was text as part of link, reinsert text
+                            # before footnote.
+                            text_line = (text_line[:MOextlink.start(0)] +
                                          '%s{{ref|%s}}' % (extlink_linktext,
+                                                           refname) +
+                                         text_line[MOextlink.end(0):])
                         else:
-                            text_line=text_line[:MOextlink.start(0)] + '{{ref|%s}}' % refname + text_line[MOextlink.end(0):]
-                        MOextlink = text_lineR.search(text_line,MOextlink.start(0)+1)
+                            text_line = (text_line[:MOextlink.start(0)] +
                                          '{{ref|%s}}' % refname +
+                                         text_line[MOextlink.end(0):])
+                        MOextlink = text_lineR.search(text_line,
+                                                      MOextlink.start(0) + 1)
                     # Search for {{doi}}
                     Rdoi = re.compile(r'(?i){{doi\|(?P<doilink>[^}|]*)}}')
                     # TODO: compiling the regex each time might be inefficient
                     doiR = re.compile(Rdoi)
                     MOdoi = doiR.search(text_line)
-                    while MOdoi:  # find all doi on line
+                    while MOdoi:  # find all doi on line
                         doi_link = MOdoi.group('doilink')
                         if doi_link:
                             self.refsequence += 1
-                            ( refname, reftext ) = self.doConvertDOIToReference( self.refsequence, doi_link )
-                            self.references.append( reftext )  # append new entry to References
-                            text_line=text_line[:MOdoi.start(0)] + '{{ref|%s}}' % refname + text_line[MOdoi.end(0):]
-                        MOdoi = doiR.search(text_line, MOdoi.start(0)+1)
-            new_text = new_text + text_line  # append new line to new text
+                            (refname, reftext) = self.doConvertDOIToReference(
+                                self.refsequence, doi_link)
+                            # append new entry to References
+                            self.references.append(reftext)
+                            text_line = text_line[:MOdoi.start(0)] + \
+                                        '{{ref|%s}}' % refname + \
+                                        text_line[MOdoi.end(0):]
+                        MOdoi = doiR.search(text_line, MOdoi.start(0) + 1)
+            new_text += text_line  # append new line to new text
         if new_text == '':
-            new_text = original_text  # If somehow no new text, return original text
+            new_text = original_text  # If no new text, return original text
         return new_text
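Both scanning loops above resume with search(text_line, last_start + 1) and recompile their pattern on every line, which the TODOs already flag. For comparison, a sketch of the same scan with a module-level compiled pattern and finditer(), replacing right-to-left so earlier match offsets stay valid (the pattern and make_refname() helper are simplified stand-ins, not the script's own):

    import re

    # compiled once at import time instead of per line
    REF_RE = re.compile(r'\[(?P<linkname>https?://\S+?)(?: (?P<linktext>[^\]]*))?\]')

    def tag_links(text_line, make_refname):
        # iterate matches in reverse so replacements do not shift
        # the offsets of matches still to be processed
        for mo in reversed(list(REF_RE.finditer(text_line))):
            text_line = (text_line[:mo.start()] +
                         '{{ref|%s}}' % make_refname(mo.group('linkname')) +
                         text_line[mo.end():])
        return text_line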
     def doFindRefSection(self, original_text):
@@ -498,121 +534,157 @@
         """
         refsectionname = ''
         sectionname = ''
-        for text_line in original_text.splitlines(True):  # Scan all text line by line
-            if refsectionname == '':  # if ref section not found
+        # Scan all text line by line
+        for text_line in original_text.splitlines(True):
+            if not refsectionname:
                 # Check if line has a section name
-                m = re.search( r'==+(?P<sectionname>[^=]+)==', text_line )
-                if m:  # if in a section, remember section name
+                m = re.search(r'==+(?P<sectionname>[^=]+)==', text_line)
+                if m:  # if in a section, remember section name
                     sectionname = m.group('sectionname').strip()
-                    pywikibot.output( u'Section: %s' % sectionname )
-                else:  # else not a section name so look for reference
-                    n = re.search( r'(i?){{(note|ibid)[|]', text_line )
-                    if n:  # if reference found
-                        refsectionname = sectionname  # found reference section
-                        pywikibot.output( u'Ref section: %s' % refsectionname )
-                        break  # stop looking
+                    pywikibot.output(u'Section: %s' % sectionname)
+                else:  # else not a section name so look for reference
+                    n = re.search(r'(i?){{(note|ibid)[|]', text_line)
+                    if n:
+                        refsectionname = sectionname  # found reference section
+                        pywikibot.output(u'Ref section: %s' % refsectionname)
+                        break
         return refsectionname
     def doFindAllCitationSections(self, original_text, refsectionname):
         """
         Returns list of sections which may contain citations.
+
         """
-        refsectionlist = [ ( refsectionname) ]
+        refsectionlist = [refsectionname]
         sectionname = ''
-        for text_line in original_text.splitlines(True):  # Scan all text line by line
+        # Scan all text line by line
+        for text_line in original_text.splitlines(True):
             # Check if line has a section name
-            m = re.search( "==[ ]*(?P<sectionname>[^=]+)[ ]*==", text_line )
-            if m:  # if in a section, remember section name
+            m = re.search("==[ ]*(?P<sectionname>[^=]+)[ ]*==", text_line)
+            if m:
                 sectionname = m.group('sectionname').strip()
                 if sectionname.lower().strip() in referencesectionnames:
-                    if sectionname not in refsectionlist:  # if not already in list, add to list.
-                        refsectionlist.extend( sectionname )
+                    # if not already in list, add to list.
+                    if sectionname not in refsectionlist:
+                        refsectionlist.extend(sectionname)
         return refsectionlist
     def doRewriteReferences(self, original_text, refusage, refsectionname):
         """
-        Returns the text which is generated by rewriting references, including duplicate refs.
+        Returns the text which is generated by rewriting references, including
+        duplicate refs.
+
         """
-        new_text = ''  # Default is no text
+        new_text = ''  # Default is no text
         skipsection = False
-        for text_line in original_text.splitlines(True):  # Scan all text line by line
+        # Scan all text line by line
+        for text_line in original_text.splitlines(True):
             # Check for protected sections
-            m = re.search( r'==+(?P<sectionname>[^=]+)==', text_line )
-            if m:  # if in a section, check if should skip this section
-                if refsectionname != '':  # if a certain section name has been identified
+            m = re.search(r'==+(?P<sectionname>[^=]+)==', text_line)
+            if m:  # if in a section, check if should skip this section
+                # if a certain section name has been identified
+                if refsectionname != '':
                     m_section = m.group('sectionname')
-                    pywikibot.output( u'Looking for "%s": "%s"' % (refsectionname,unicode(m_section)) )
+                    pywikibot.output(u'Looking for "%s": "%s"'
+                                     % (refsectionname, unicode(m_section)))
                     if unicode(m_section.strip()) == unicode(refsectionname):
-                        pywikibot.output( u'Found Ref section.')
-                        skipsection = True  # skipsection left True so no further links converted
-                else:  # else grab all possible sections
-                    if m.group('sectionname').lower().strip() in referencesectionnames:
-                        pywikibot.output('RefSection found by default names: %s' % m.group('sectionname') )
-                        skipsection = True  # skipsection left True so no further links converted
+                        pywikibot.output(u'Found Ref section.')
+                        skipsection = True
+                else:  # else grab all possible sections
+                    if m.group('sectionname').lower().strip() in \
+                       referencesectionnames:
+                        pywikibot.output(
+                            'RefSection found by default names: %s'
+                            % m.group('sectionname'))
+                        skipsection = True
             if skipsection:
-                new_text = new_text + text_line  # skip section, so retain text.
+                new_text += text_line
             else:
-                # TODO: recognize {{inline}} invisible footnotes when something can be done with them
+                # TODO: recognize {{inline}} invisible footnotes when something
+                # can be done with them
                 #
-                # Data structure: refusage[reference_key] = [ sequence_in_document, count, count_during_dup_handling ]
+                # Data structure:
+                #     refusage[reference_key] = [sequence_in_document,
+                #                                count, count_during_dup_handling]
                 # Check for various references
                 # TODO: compiling the regex each time might be inefficient
-                Rtext_line = re.compile(r'(?i){{(?P<reftype>ref|ref_num|ref_label)\|(?P<refname>[^}|]+?)}}')
-                m = Rtext_line.search( text_line )
-                alphabet26 = u'abcdefghijklmnopqrstuvwxyz'
+                Rtext_line = re.compile(
+                    r'(?i){{(?P<reftype>ref|ref_num|ref_label)\|(?P<refname>[^}|]+?)}}')
+                m = Rtext_line.search(text_line)
                 while m:  # if found a reference
-                    if m.group('reftype').lower() in ('ref', 'ref_num', 'ref_label'):  # confirm ref
+                    if m.group('reftype').lower() in ('ref', 'ref_num',
+                                                      'ref_label'):
                         refkey = m.group('refname').strip()
                         if refkey != '':
                             if refkey in refusage:
-                                # pywikibot.output( u'refusage[%s] = %s' % (refkey,refusage[refkey]) )
-                                if refusage[refkey][2] == 0:  # if first use of reference
-                                    text_line=text_line[:m.start(0)] + '{{ref|%s}}' % (refkey) + text_line[m.end(0):]
-                                    refusage[refkey][2] += 1  # count use of reference
-                                else:  # else not first use of reference
-                                    text_line=text_line[:m.start(0)] + '{{ref_label|%s|%d|%s}}' % (refkey,(refusage[refkey][0])+1,alphabet26[((refusage[refkey][2])-1)%26]) + text_line[m.end(0):]
-                                    refusage[refkey][2] += 1  # count use of reference
+                                # pywikibot.output(u'refusage[%s] = %s' % (refkey, refusage[refkey]))
+                                # if first use of reference
+                                if refusage[refkey][2] == 0:
+                                    text_line = (
+                                        text_line[:m.start(0)] +
+                                        '{{ref|%s}}' % (refkey) +
+                                        text_line[m.end(0):])
+                                    # count use of reference
+                                    refusage[refkey][2] += 1
+                                else:  # else not first use of reference
+                                    text_line = (
+                                        text_line[:m.start(0)] +
+                                        '{{ref_label|%s|%d|%s}}'
+                                        % (refkey, (refusage[refkey][0]) + 1,
+                                           string.ascii_lowercase[
+                                               ((refusage[refkey][2]) - 1) % 26
+                                           ]) +
+                                        text_line[m.end(0):])
+                                    # count use of reference
+                                    refusage[refkey][2] += 1
                             else:
-                                # Odd, because refusage list is populated the key should exist already.
-                                refusage[refkey] = [len(refusage),1,1]  # remember this reference
-                                text_line=text_line[:m.start(0)] + '{{ref|%s}}' % refkey + text_line[m.end(0):]
+                                # Odd, because refusage list is populated the
+                                # key should exist already.
+
+                                # remember this reference
+                                refusage[refkey] = [len(refusage), 1, 1]
+                                text_line = (text_line[:m.start(0)] +
+                                             '{{ref|%s}}' % refkey +
+                                             text_line[m.end(0):])
-                    m = Rtext_line.search( text_line, m.start(0)+1 )
-                new_text = new_text + text_line  # append new line to new text
+                    m = Rtext_line.search(text_line, m.start(0) + 1)
+                new_text += text_line
         if new_text == '':
-            new_text = original_text  # If somehow no new text, return original text
+            # If somehow no new text, return original text
+            new_text = original_text
         return new_text
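The change drops the hand-rolled alphabet26 string in favor of string.ascii_lowercase, which is the same 26 characters. The duplicate-use suffix simply cycles through the alphabet and wraps after z:

    import string
    # the nth duplicate use of one citation gets suffix a..z, repeating after 26
    labels = [string.ascii_lowercase[n % 26] for n in range(30)]
    # labels[:3] == ['a', 'b', 'c'] and labels[26] == 'a'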
-    def doGetTitleFromURL(self, extlink_linkname ):
+    def doGetTitleFromURL(self, extlink_linkname):
         """
         Returns text derived from between <title>...</title> tags through a URL.
         Obeys robots.txt restrictions.
+
         """
         # if no descriptive text get from web site, if not PDF
         urltitle = u''
         urlfile = None
         urlheaders = None
         if len(extlink_linkname) > 5:
-            socket.setdefaulttimeout(20)  # timeout in seconds
-            pywikibot.get_throttle()  # throttle down to Wikipedia rate
+            socket.setdefaulttimeout(20)  # timeout in seconds
+            pywikibot.get_throttle()  # throttle down to Wikipedia rate
             # Obey robots.txt restrictions
             rp = robotparser.RobotFileParser()
-            rp.set_url( extlink_linkname )
+            rp.set_url(extlink_linkname)
             try:
-                rp.read()  # read robots.txt
+                rp.read()  # read robots.txt
             except (IOError, socket.timeout):
                 pywikibot.output(u'Error accessing URL: %s'
                                  % unicode(extlink_linkname))
             else:
                 urlobj = None
-                if not rp.can_fetch( "*", extlink_linkname ):
+                if not rp.can_fetch("*", extlink_linkname):
                     pywikibot.output(u'Robot prohibited: %s'
                                      % unicode(extlink_linkname))
                 else:  # else access allowed
                     try:
                         if have_httpcache:
                             cache = HTTPCache(extlink_linkname)
-                            urlfile = cache.filename()  # filename of cached date
+                            # filename of cached date
+                            urlfile = cache.filename()
                             urlheaders = cache.info()
                         else:
-                            (urlfile, urlheaders) = urllib.urlretrieve(extlink_linkname)
+                            (urlfile,
+                             urlheaders) = urllib.urlretrieve(extlink_linkname)
                     except IOError:
                         pywikibot.output(u'Error accessing URL. %s'
                                          % unicode(extlink_linkname))
@@ -626,13 +698,17 @@
                                    unicode(extlink_linkname)))
                     except:  # Ignore other errors
                         pass
-                if urlfile != None:
-                    urlobj = open( urlfile )
+                if urlfile:
+                    urlobj = open(urlfile)
                     if extlink_linkname.lower().endswith('.pdf'):
                         # If file has a PDF suffix
-                        pywikibot.output( u'PDF file.')
+                        pywikibot.output(u'PDF file.')
                         try:
-                            pdfinfo_out = subprocess.Popen([r"pdfinfo","/dev/stdin"], stdin=urlobj, stdout=subprocess.PIPE, shell=False).communicate()[0]
+                            pdfinfo_out = subprocess.Popen(
+                                [r"pdfinfo", "/dev/stdin"],
+                                stdin=urlobj,
+                                stdout=subprocess.PIPE,
+                                shell=False).communicate()[0]
                             for aline in pdfinfo_out.splitlines():
                                 if aline.lower().startswith('title'):
                                     urltitle = aline.split(None)[1:]
@@ -646,66 +722,78 @@
                                         urlauthor = ' '.join(urlauthor)
                                         if urlauthor:
                                             pywikibot.output(u'author: %s'
-                                                             % urlauthor )
+                                                             % urlauthor)
                         except ValueError:
-                            pywikibot.output( u'pdfinfo value error.')
+                            pywikibot.output(u'pdfinfo value error.')
                         except OSError:
-                            pywikibot.output( u'pdfinfo OS error.')
-                        except:  # Ignore errors
-                            pywikibot.output( u'PDF processing error.')
+                            pywikibot.output(u'pdfinfo OS error.')
+                        except:  # Ignore errors
+                            pywikibot.output(u'PDF processing error.')
                             pass
-                        pywikibot.output( u'PDF done.')
+                        pywikibot.output(u'PDF done.')
                         if urlobj:
                             urlobj.close()
                     else:
                         # urlinfo = urlobj.info()
                         aline = urlobj.read()
                         maxalines = 100
-                        while maxalines > 0 and aline and urltitle == '':
-                            maxalines -= 1  # reduce number of lines left to consider
-                            titleRE = re.search("(?i)<title>(?P<HTMLtitle>[^<>]+)", aline)
+                        while maxalines > 0 and aline and not urltitle:
+                            # reduce number of lines left to consider
+                            maxalines -= 1
+                            titleRE = re.search(
+                                "(?i)<title>(?P<HTMLtitle>[^<>]+)", aline)
                             if titleRE:
                                 try:
-                                    urltitle = unicode(titleRE.group('HTMLtitle'), 'utf-8')
+                                    urltitle = unicode(titleRE.group('HTMLtitle'),
+                                                       'utf-8')
                                 except:
-                                    urltitle = u' '  # error, no title
-                                urltitle = u' '.join(urltitle.split())  # merge whitespace
-                                pywikibot.output( u'::::Title: %s' % urltitle )
-                                break  # found a title so stop looking
+                                    urltitle = u' '  # error, no title
+                                urltitle = u' '.join(urltitle.split())
+                                pywikibot.output(u'::::Title: %s' % urltitle)
+                                break  # found a title so stop looking
                             else:
                                 if maxalines < 1:
                                     pywikibot.output(
                                         u'No title in URL. %s'
-                                        % unicode(extlink_linkname) )
+                                        % unicode(extlink_linkname))
                 else:
-                    if urlobj != None:
-                        pywikibot.output( u'::+URL: ' + extlink_linkname )
+                    if urlobj:
+                        pywikibot.output(u'::+URL: ' + extlink_linkname)
                         # urlinfo = urlobj.info()
                         aline = urlobj.read()
                         full_page = ''
                         # while aline and urltitle == '':
                         while aline:
                             full_page = full_page + aline
-                            titleRE = re.search("(?i)<title>(?P<HTMLtitle>[^<>]+)", aline)
+                            titleRE = re.search(
+                                "(?i)<title>(?P<HTMLtitle>[^<>]+)",
+                                aline)
                             if titleRE:
                                 if titleRE.group('HTMLtitle'):
                                     urltitle = u''
                                     try:
-                                        urltitle = unicode(titleRE.group('HTMLtitle'), 'utf-8')
-                                        urltitle = u' '.join(urltitle.split())  # merge whitespace
-                                        pywikibot.output( u'::::Title: %s' % urltitle )
+                                        urltitle = unicode(
+                                            titleRE.group('HTMLtitle'),
+                                            'utf-8')
+                                        urltitle = u' '.join(
+                                            urltitle.split())
+                                        pywikibot.output(
+                                            u'::::Title: %s'
+                                            % urltitle)
                                     except:
                                         aline = urlobj.read()
                                         continue
                                 else:
                                     aline = urlobj.read()
                                     continue
-                                break  # found a title so stop looking
+                                # found a title so stop looking
+                                break
                             else:
                                 aline = urlobj.read()
                     else:
                         aline = urlobj.read()
-                if urltitle != '': pywikibot.output( u'title: ' + urltitle )
+                if urltitle:
+                    pywikibot.output(u'title: ' + urltitle)
                 # Try a more advanced search
                 ##from nltk.parser.probabilistic import *
                 ##from nltk.tokenizer import *
@@ -734,37 +822,43 @@
                 ##WhitespaceTokenizer(SUBTOKENS='WORDS').tokenize(text_token)
                 #unitagger.tag(text_token)
                 #britagger.tag(text_token)
-                ### pywikibot.output( unicode(text_token) )
+                ### pywikibot.output(unicode(text_token))
         else:
-            pywikibot.output( u'No data retrieved.')
+            pywikibot.output(u'No data retrieved.')
         socket.setdefaulttimeout(200)
-        urltitle = urltitle.replace(u'|',u':')
+        urltitle = urltitle.replace(u'|', u':')
         return urltitle.strip()
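For reference, the fetch path above follows the stdlib robotparser protocol: set the URL, read(), then gate the request on can_fetch(). One pre-existing quirk: the script passes the target page URL straight to set_url(), while the stdlib documents set_url() as taking the URL of the robots.txt file itself. The canonical sequence looks like this (Python 2 module name, matching the imports used here):

    import robotparser  # urllib.robotparser on Python 3

    rp = robotparser.RobotFileParser()
    rp.set_url('http://example.org/robots.txt')  # hypothetical site
    rp.read()
    if rp.can_fetch('*', 'http://example.org/some/page.html'):
        pass  # fetching the page is allowed for user-agent '*'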
-    def doConvertLinkTextToReference(self, refsequence, extlink_linkname, extlink_linktext):
-        """
-        Returns the text which is generated by converting a link to
+    def doConvertLinkTextToReference(self, refsequence, extlink_linkname,
+                                     extlink_linktext):
+        """ Returns the text which is generated by converting a link to
         a format suitable for the References section.
+
         """
         refname = u'refbot.%d' % refsequence
-        m = re.search("[\w]+://([\w].)*(?P<siteend>[\w.]+)[/\Z]", extlink_linkname)
+        m = re.search("[\w]+://([\w].)*(?P<siteend>[\w.]+)[/\Z]",
+                      extlink_linkname)
         if m:
-            refname = m.group('siteend') + u'.%d' % refsequence  # use end of site URL as reference name
-        new_text = u'# {{note|%s}} %s' % (refname, self.doConvertRefToCitation( extlink_linktext, extlink_linkname, refname ) ) + '\n'
+            # use end of site URL as reference name
+            refname = m.group('siteend') + u'.%d' % refsequence
+        new_text = u'# {{note|%s}} %s' % (refname,
+                                          self.doConvertRefToCitation(
+                                              extlink_linktext,
+                                              extlink_linkname,
+                                              refname)) + '\n'
         return (refname, new_text)
-    def doConvertRefToCitation(self, extlink_linktext, extlink_linkname, refname ):
-        """
-        Returns text with a citation created from link information
-        """
+    def doConvertRefToCitation(self, extlink_linktext, extlink_linkname,
+                               refname):
+        """ Returns text with a citation created from link information """
         new_text = u''
         now = date.today()
-        if extlink_linktext == None or len(extlink_linktext.strip()) < 20:
-            pywikibot.output( u'Fetching URL: %s' % unicode(extlink_linkname) )
-            urltitle = self.doGetTitleFromURL( extlink_linkname )  # try to get title from URL
-            if urltitle == None or urltitle == '':
+        if not extlink_linktext or len(extlink_linktext.strip()) < 20:
+            pywikibot.output(u'Fetching URL: %s' % unicode(extlink_linkname))
+            urltitle = self.doGetTitleFromURL(extlink_linkname)  # try to get title from URL
+            if not urltitle:
                 urltitle = extlink_linkname
-            pywikibot.output( u'Title is: %s' % urltitle )
+            pywikibot.output(u'Title is: %s' % urltitle)
             extlink_linktext = urltitle
         for newref in self.references:  # scan through all references
             if extlink_linkname in newref:  # if undescribed linkname same as a previous entry
@@ -773,27 +867,29 @@
                 else:
                     extlink_linktext = extlink_linkname + ' (See above)'
                 break  # found a matching previous linkname so stop looking
-        if extlink_linktext == None or len(extlink_linktext) < 20:
+        if not extlink_linktext or len(extlink_linktext) < 20:
             exlink_linktext = urltitle
         # Look for a news web site
         for (sitename, newscompany, stripprefix) in newssites:
-            if refname.startswith( sitename ):
+            if refname.startswith(sitename):
                 # If there is a prefix to strip from the title
                 if stripprefix and extlink_linktext.startswith(stripprefix):
                     extlink_linktext = extlink_linktext[len(stripprefix):]
-                new_text = u'{{news reference | title=%s | url=%s | urldate=%s | org=%s }}' % ( extlink_linktext, extlink_linkname, now.isoformat(), newscompany ) + '\n'
+                new_text = u'{{news reference | title=%s | url=%s | urldate=%s | org=%s }}' % (extlink_linktext, extlink_linkname, now.isoformat(), newscompany) + '\n'
                 break
         else:  # else no special site found
-            new_text = u'{{web reference | title=%s | url=%s | date=%s }}' % ( extlink_linktext, extlink_linkname, now.isoformat() )
+            new_text = u'{{web reference | title=%s | url=%s | date=%s }}' % (extlink_linktext, extlink_linkname, now.isoformat())
         return (new_text)
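One pre-existing oddity survives the cleanup: in the len(extlink_linktext) < 20 branch, the assignment targets exlink_linktext (note the missing "t"), which binds a throwaway local instead of updating the link text, so the fallback to urltitle never takes effect. Presumably the intent was:

    if not extlink_linktext or len(extlink_linktext) < 20:
        extlink_linktext = urltitle  # the body above writes 'exlink_linktext'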
     def doConvertDOIToReference(self, refsequence, doi_linktext):
         """
         Returns the text which is generated by converting a DOI reference to
         a format suitable for the Notes section.
+
         """
         # TODO: look up DOI info and create full reference
-        urltitle = self.doGetTitleFromURL('http://dx.doi.org/' + doi_linktext )  # try to get title from URL
+        # try to get title from URL
+        urltitle = self.doGetTitleFromURL('http://dx.doi.org/' + doi_linktext)
         refname = 'refbot%d' % refsequence
         if urltitle:
             new_text = '# {{note|%s}} %s {{doi|%s}}\n' \
@@ -806,6 +902,7 @@
     def doBuildSequenceListOfReferences(self, original_text):
         """
         Returns a list with all found references and sequence numbers.
+
         """
         duplicatefound = False
         refusage = {}
@@ -813,18 +910,20 @@
         for text_line in original_text.splitlines(True):  # Scan all text line by line
             # Check for various references
             Rtext_line = re.compile(r'(?i){{(?P<reftype>ref|ref_num|ref_label)\|(?P<refname>[^}|]+?)}}')
-            m = Rtext_line.search( text_line )
+            m = Rtext_line.search(text_line)
             while m:  # if found a reference
-                if m.group('reftype').lower() in ('ref', 'ref_num', 'ref_label'):  # confirm ref
+                if m.group('reftype').lower() in ('ref', 'ref_num', 'ref_label'):
                     refkey = m.group('refname').strip()
                     if refkey != '':
                         if refkey in refusage:
-                            refusage[refkey][1] += 1  # duplicate use of reference
+                            # duplicate use of reference
+                            refusage[refkey][1] += 1
                             duplicatefound = True
                         else:
-                            refusage[refkey] = [len(refusage),0,0]  # remember this reference
-                m = Rtext_line.search( text_line, m.end() )
-        pywikibot.output( u'Number of refs: %d' % (len(refusage)) )
+                            # remember this reference
+                            refusage[refkey] = [len(refusage), 0, 0]
+                m = Rtext_line.search(text_line, m.end())
+        pywikibot.output(u'Number of refs: %d' % (len(refusage)))
         return (duplicatefound, refusage)
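A tiny worked illustration of the refusage structure this method builds, using hypothetical reference keys, for text that cites {{ref|smith}}, then {{ref|jones}}, then {{ref|smith}} again:

    refusage = {}
    duplicatefound = False
    for refkey in ['smith', 'jones', 'smith']:  # refs in document order
        if refkey in refusage:
            refusage[refkey][1] += 1            # duplicate use of reference
            duplicatefound = True
        else:
            refusage[refkey] = [len(refusage), 0, 0]
    # refusage == {'smith': [0, 1, 0], 'jones': [1, 0, 0]}
    # duplicatefound is True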
     def doReadReferencesSection(self, original_text, refsectionname):
@@ -832,180 +931,212 @@
         Returns the text which is generated by reading the Notes section.
         Also appends references to self.references.
         Contents of all Notes sections will be read.
+
         """
         # TODO: support subsections within Notes
         new_text = ''
         intargetsection = False
         for text_line in original_text.splitlines(True):
             # Check for target section
-            m = re.search( r'==+(?P<sectionname>[^=]+)==', text_line )
-            if m:  # if in a section, check if Notes section
-                if refsectionname != '':  # if a certain section name has been identified
+            m = re.search(r'==+(?P<sectionname>[^=]+)==', text_line)
+            if m:  # if in a section, check if Notes section
+                # if a certain section name has been identified
+                if refsectionname != '':
                     m_section = m.group('sectionname')
                     pywikibot.output(u'Looking for "%s": "%s"'
-                                     % (refsectionname,m_section) )
+                                     % (refsectionname, m_section))
                     if unicode(m_section.strip()) == unicode(refsectionname):
                         pywikibot.output(u'Read Ref section.')
                         intargetsection = True
-                        new_text = new_text + text_line
+                        new_text += text_line
                     else:
                         intargetsection = False
-                else:  # else grab all possible sections
-                    if m.group('sectionname').lower().strip() in referencesectionnames:
+                else:  # else grab all possible sections
+                    if m.group('sectionname').lower().strip() in \
+                       referencesectionnames:
                         intargetsection = True
-                        new_text = new_text + text_line
+                        new_text += text_line
                     else:
                         intargetsection = False
             else:
-                if intargetsection:  # if inside target section, remember this reference line
-                    if text_line.strip() != '':
-                        if text_line.lstrip()[0] in u'[{':  # if line starts with non-Ref WikiSyntax
-                            intargetsection = False  # flag as not being in section
+                # if inside target section, remember this reference line
+                if intargetsection:
+                    if text_line.strip():
+                        if text_line.lstrip()[0] in u'[{':  # if line starts with non-Ref WikiSyntax
+                            intargetsection = False
                         # TODO: need better way to handle special cases at end of refs
                         if text_line.strip() == u'<!--READ ME!! PLEASE DO NOT JUST ADD NEW NOTES AT THE BOTTOM. See the instructions above on ordering. -->':
                             # This line ends some Notes sections
-                            intargetsection = False  # flag as not being in section
-                        if text_line.strip() == u'</div>':  # This line ends some Notes sections
-                            intargetsection = False  # flag as not being in section
+                            intargetsection = False
+                        if text_line.strip() == u'</div>':  # This line ends some Notes sections
+                            intargetsection = False
                     if intargetsection:  # if still inside target section
                         # Convert any # wiki list to *; will be converted later if a reference
                         if text_line[0] == '#':
                             text_line = '*' + text_line[1:]
                         self.references.append(text_line.rstrip() + u'\n')
-                        new_text = new_text + text_line.rstrip() + u'\n'
+                        new_text += text_line.rstrip() + u'\n'
         return new_text
     def doReorderReferences(self, references, refusage):
         """
         Returns the new references list after reordering to match refusage list
         Non-references are moved to top, unused references to bottom.
+
         """
         # TODO: add tests for duplicate references/Ibid handling.
         newreferences = references
-        if references != [] and refusage != {}:
+        if references and refusage:
             newreferences = []
-            for i in range(len(references)):  # move nonrefs to top of list
+            for i in xrange(len(references)):  # move nonrefs to top of list
                 text_line = references[i]
                 # TODO: compile search?
-                m = re.search(r'(?i)[*#][\s]*{{(?P<reftype>note)\|(?P<refname>[^}|]+?)}}', text_line)
+                m = re.search(
+                    r'(?i)[*#][\s]*{{(?P<reftype>note)\|(?P<refname>[^}|]+?)}}',
+                    text_line)
                 # Special test to ignore Footnote instructions comment.
                 text_line_stripped = text_line.strip()
-                if text_line_stripped.startswith(u'4) Add ') or not m:  # if no ref found
-                    newreferences.append(text_line)  # add nonref to new list
+                # if no ref found
+                if text_line_stripped.startswith(u'4) Add ') or not m:
+                    newreferences.append(text_line)  # add nonref to new list
                     references[i] = None
             refsort = {}
-            for refkey in refusage.keys():  # build list of keys in document order
-                refsort[ refusage[refkey][0] ] = refkey  # refsort contains reference key names
-            alphabet26 = u'abcdefghijklmnopqrstuvwxyz'
-            for i in range(len(refsort)):  # collect references in document order
-                for search_num in range(len(references)):  # find desired entry
+            # build list of keys in document order
+            for refkey in refusage.keys():
+                # refsort contains reference key names
+                refsort[refusage[refkey][0]] = refkey
+            # collect references in document order
+            for i in xrange(len(refsort)):
+                for search_num in range(len(references)):  # find desired entry
                     search_line = references[search_num]
                     if search_line:
                         # TODO: compile search?
-                        # Note that the expression finds all neighboring note|note_label expressions.
+                        # Note that the expression finds all neighboring
+                        # note|note_label expressions.
+                        m2 = re.search(
+                            r'(?i)[*#]([\s]*{{(?P<reftype>note|note_label)\|(?P<refname>[^}|]+?)}})+',
+                            search_line)
                         if m2:
                             refkey = m2.group('refname').strip()
-                            if refkey == refsort[i]:  # if expected ref found
+                            if refkey == refsort[i]:  # if expected ref found
                                 # Rewrite references
-                                note_text = '# {{note|%s}}' % refkey  # rewrite note tag
-                                if refusage[refkey][1] > 1:  # if more than one reference to citation
-                                    for n in range(refusage[refkey][1]):  # loop through all repetitions
-                                        note_text = note_text + '{{note_label|%s|%d|%s}}' % (refkey,(refusage[refkey][0])+1,alphabet26[n%26])
-                                search_line=search_line[:m2.start(0)] + note_text + search_line[m2.end(0):]
-                                newreferences.append(search_line)  # found, add entry
-                                del references[search_num]  # delete used reference
-                                break  # stop the search loop after entry found
-            newreferences = newreferences + references  # append any unused references
+                                note_text = '# {{note|%s}}' % refkey
+                                # if more than one reference to citation
+                                if refusage[refkey][1] > 1:
+                                    # loop through all repetitions
+                                    for n in xrange(refusage[refkey][1]):
+                                        note_text += (
+                                            '{{note_label|%s|%d|%s}}'
+                                            % (refkey,
+                                               (refusage[refkey][0]) + 1,
+                                               string.ascii_lowercase[n % 26]))
+                                search_line = search_line[:m2.start(0)] + \
+                                              note_text + \
+                                              search_line[m2.end(0):]
+                                newreferences.append(search_line)
+                                # delete used reference
+                                del references[search_num]
+                                break  # stop the search loop after entry found
+            newreferences += references  # append any unused references
         return newreferences
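Worth noting about the loop above: del references[search_num] inside a range(len(references)) scan is safe here only because the loop breaks immediately after the deletion; the first pass instead nulls entries (references[i] = None), which keeps indices stable. A minimal sketch of the two patterns:

    items = ['keep', 'drop', 'keep']
    # pattern 1: null out, indices stay valid for later scans
    items[0] = None
    # pattern 2: delete, but stop scanning right away, as the code above does
    for i in range(len(items)):
        if items[i] == 'drop':
            del items[i]  # indices after i have now shifted...
            break         # ...so do not keep iterating over them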
-    def doUpdateReferencesSection(self, original_text, refusage, refsectionname):
+    def doUpdateReferencesSection(self, original_text, refusage,
+                                  refsectionname):
         """
         Returns the text which is generated by rebuilding the Notes section.
         Rewrite Notes section from references list.
+
         """
         new_text = ''
         intargetsection = False
-        for text_line in original_text.splitlines(True):  # Scan all text line by line
+        # Scan all text line by line
+        for text_line in original_text.splitlines(True):
             # Check for target section
-            m = re.search( r'==+(?P<sectionname>[^=]+)==', text_line )
-            if m:  # if in a section, check if Notes section
-                if refsectionname != '':  # if a certain section name has been identified
+            m = re.search(r'==+(?P<sectionname>[^=]+)==', text_line)
+            if m:
+                if refsectionname != '':
                     m_section = m.group('sectionname')
-                    pywikibot.output( u'Looking for "%s": "%s"' % (refsectionname,m_section) )
+                    pywikibot.output(u'Looking for "%s": "%s"'
+                                     % (refsectionname, m_section))
                     if unicode(m_section.strip()) == unicode(refsectionname):
-                        pywikibot.output( u'Updating Ref section.')
-                        intargetsection = True  # flag as being in section
+                        pywikibot.output(u'Updating Ref section.')
+                        intargetsection = True
                     else:
-                        intargetsection = False  # flag as not being in section
-                else:  # else grab all possible sections
-                    if m.group('sectionname').lower().strip() in referencesectionnames:
-                        intargetsection = True  # flag as being in section
-                    else:
-                        intargetsection = False  # flag as not being in section
+                        intargetsection = False
+                else:  # else grab all possible sections
+                    intargetsection = (m.group('sectionname').lower().strip()
+                                       in referencesectionnames)
                 if intargetsection:
-                    new_text = new_text + text_line  # append new line to new text
+                    new_text += text_line
                     if self.references != []:
-                        for newref in self.references:  # scan through all references
-                            if newref != None:
-                                new_text = new_text + newref.rstrip() + u'\n'  # insert references
-                        new_text = new_text + u'\n'  # one trailing blank line
-                        self.references = []  # empty references
+                        # scan through all references
+                        for newref in self.references:
+                            if newref:
+                                # insert references
+                                new_text += newref.rstrip() + u'\n'
+                        new_text += u'\n'
+                        self.references = []
                 else:
-                    new_text = new_text + text_line  # copy section headline
+                    new_text += text_line  # copy section headline
             else:
                 if intargetsection:
-                    if text_line.strip() != '':
-                        if text_line.lstrip()[0] in u'[{':  # if line starts with non-Ref WikiSyntax
-                            intargetsection = False  # flag as not being in section
+                    if text_line.strip():
+                        # if line starts with non-Ref WikiSyntax
+                        if text_line.lstrip()[0] in u'[{':
+                            # flag as not being in section
+                            intargetsection = False
                         # TODO: need better way to handle special cases at end of refs
-                        if text_line.strip() == u'<!--READ ME!! PLEASE DO NOT JUST ADD NEW NOTES AT THE BOTTOM. See the instructions above on ordering. -->':  # This line ends some Notes sections
-                            intargetsection = False  # flag as not being in section
-                        if text_line.strip() == u'</div>':  # This line ends some Notes sections
-                            intargetsection = False  # flag as not being in section
-                    if not intargetsection:  # if not in Notes section, remember line
-                        new_text = new_text + text_line  # append new line to new text
+                        if text_line.strip() == u'<!--READ ME!! PLEASE DO NOT JUST ADD NEW NOTES AT THE BOTTOM. See the instructions above on ordering. -->':
+                            intargetsection = False
+                        if text_line.strip() == u'</div>':
+                            intargetsection = False
+                    if not intargetsection:
+                        new_text += text_line
         # If references list not emptied, there was no Notes section found
-        if self.references != []:
+        if self.references:
             # New Notes section needs to be created at bottom.
-            text_line_counter = 0  # current line
-            last_text_line_counter_value = 0  # number of last line of possible text
-            for text_line in original_text.splitlines(True):  # Search for last normal text line
-                text_line_counter += 1  # count this line
-                if text_line.strip() != '':
-                    if text_line.lstrip()[0].isalnum():  # if line starts with alphanumeric
-                        last_text_line_counter = text_line_counter  # number of last line of possible text
+            text_line_counter = 0
+            # number of last line of possible text
+            last_text_line_counter_value = 0
+            # Search for last normal text line
+            for text_line in original_text.splitlines(True):
+                text_line_counter += 1
+                if text_line.strip():
+                    if text_line.lstrip()[0].isalnum():
+                        # number of last line of possible text
+                        last_text_line_counter = text_line_counter
                     else:
-                        if text_line.lstrip()[0] in u'<=!|*#':  # if line starts with recognized wiki char
-                            if not text_line.startswith(u'<!--'):  # if line not start with a comment
-                                last_text_line_counter = text_line_counter  # number of last line of possible content
-            new_text = ''  # erase previous new_text
-            text_line_counter = 0  # current line
-            for text_line in original_text.splitlines(True):  # Search for last normal text line
-                text_line_counter += 1  # count this line
-                if last_text_line_counter == text_line_counter:  # if found insertion point
-                    new_text = new_text + text_line  # append new line to new text
-                    new_text = new_text + '\n== Notes ==\n'  # set to standard name
-                    new_text = new_text + u'{{subst:Footnote3text}}\n'
-                    if self.references != []:
-                        for newref in self.references:  # scan through all references
+                        # if line starts with recognized wiki char
+                        if text_line.lstrip()[0] in u'<=!|*#':
+                            if not text_line.startswith(u'<!--'):
+                                # number of last line of possible content
+                                last_text_line_counter = text_line_counter
+            new_text = ''
+            text_line_counter = 0
+            # Search for last normal text line
+            for text_line in original_text.splitlines(True):
+                text_line_counter += 1
+                # if found insertion point
+                if last_text_line_counter == text_line_counter:
+                    new_text += text_line
+                    new_text += '\n== Notes ==\n'  # set to standard name
+                    new_text += u'{{subst:Footnote3text}}\n'
+                    if self.references:
+                        for newref in self.references:
                             if newref is not None:
-                                new_text = new_text + newref  # insert references
-                        new_text = new_text + u'\n'  # one trailing blank line
-                        self.references = []  # empty references
+                                new_text += newref
+                        new_text += u'\n'
+                        self.references = []
                 else:
-                    new_text = new_text + text_line  # append new line to new text
-        if new_text == '':
-            new_text = original_text  # If somehow no new text, return original text
+                    new_text += text_line
+        if not new_text:
+            new_text = original_text
         return new_text
     def run(self):
-        """
-        Starts the robot.
-        """
+        """ Starts the robot. """
         # Run the generator which will yield Pages to pages which might need to be
         # changed.
         for pl in self.generator:
-            print ''
             try:
                 # Load the page's text from the wiki
                 original_text = pl.get()
@@ -1038,6 +1169,7 @@
             if self.acceptall or choice in ['y', 'Y']:
                 pl.put(new_text, self.summary)
+
 def main():
     # How we want to retrieve information on which pages need to be changed.
     # Can either be 'sqldump', 'textfile' or 'userinput'.
@@ -1045,15 +1177,16 @@
     # Array which will collect commandline parameters.
     # First element is original text, second element is replacement text.
     commandline_replacements = []
-    # A dictionary where keys are original texts and values are replacement texts.
+    # A dictionary where keys are original texts and values are replacement
+    # texts.
     replacements = {}
     # Don't edit pages which contain certain texts.
     exceptions = []
     # Should the elements of 'replacements' and 'exceptions' be interpreted
     # as regular expressions?
     regex = False
-    # the dump's path, either absolute or relative, which will be used when source
-    # is 'sqldump'.
+    # the dump's path, either absolute or relative, which will be used when
+    # source is 'sqldump'.
     sqlfilename = None
     # the textfile's path, either absolute or relative, which will be used when
     # source is 'textfile'.
     textfilename = None
@@ -1062,8 +1195,8 @@
     categoryname = None
     # a list of pages which will be used when source is 'userinput'.
     pagenames = []
-    # will become True when the user presses a ('yes to all') or uses the -always
-    # commandline paramater.
+    # will become True when the user presses a ('yes to all') or uses the
+    # -always commandline paramater.
     acceptall = False
     # Which namespace should be processed when using a SQL dump
     # default to -1 which means all namespaces will be processed
@@ -1120,20 +1253,26 @@
         else:
             commandline_replacements.append(arg)
-    if source == None or len(commandline_replacements) not in [0, 2]:
-        # syntax error, show help text from the top of this file
-        pywikibot.output(__doc__, 'utf-8')
+    if not (source and len(commandline_replacements) in (0, 2)):
+        # show help text from the top of this file
+        pywikibot.showHelp()
         return
-    if (len(commandline_replacements) == 2):
+
+    if len(commandline_replacements) == 2:
         replacements[commandline_replacements[0]] = commandline_replacements[1]
-        editSummary = pywikibot.translate(pywikibot.getSite(), msg) % ' (-' + commandline_replacements[0] + ' +' + commandline_replacements[1] + ')'
+        editSummary = pywikibot.translate(pywikibot.getSite(), msg,
+                                          ' (-' + commandline_replacements[0] +
+                                          ' +' + commandline_replacements[1] +
+                                          ')')
     else:
         change = ''
-        default_summary_message = pywikibot.translate(pywikibot.getSite(), msg) % change
+        default_summary_message = pywikibot.translate(pywikibot.getSite(), msg,
+                                                      change)
         pywikibot.output(u'The summary message will default to: %s'
                          % default_summary_message)
         summary_message = pywikibot.input(
-            u'Press Enter to use this default message, or enter a description of the changes your bot will make:')
+            u'Press Enter to use this default message, or enter a\n'
+            u'description of the changes your bot will make:')
         if summary_message == '':
             summary_message = default_summary_message
         editSummary = summary_message
@@ -1156,7 +1295,7 @@
     gen = ReplacePageGenerator(source, replacements, exceptions, regex,
                                namespace, textfilename, sqlfilename,
                                categoryname, pagenames)
-    preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = 20)
+    preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=20)
     bot = ReplaceRobot(preloadingGen, replacements, refsequence, references,
                        refusage, exceptions, regex, acceptall, editSummary)
     bot.run()