Revision: 4270 Author: wikipedian Date: 2007-09-13 13:41:54 +0000 (Thu, 13 Sep 2007)
Log Message: ----------- ReplaceRobot: heavily improved the way exceptions are specified. It is now possible to state things like 'fix commas, but not inside URLs and not in pages that start with XY.'
Modified Paths: -------------- trunk/pywikipedia/fixes.py trunk/pywikipedia/replace.py trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/fixes.py =================================================================== --- trunk/pywikipedia/fixes.py 2007-09-13 13:40:12 UTC (rev 4269) +++ trunk/pywikipedia/fixes.py 2007-09-13 13:41:54 UTC (rev 4270) @@ -5,13 +5,14 @@ help = """ * HTML - Convert HTML tags to wiki syntax, and fix XHTML. + * isbn - Fix badly formatted ISBNs. * syntax - Try to fix bad wiki markup. Do not run this in automatic mode, as the bot may make mistakes. * syntax-safe - Like syntax, but less risky, so you can run this in automatic mode. - * case-de - fix upper/lower case errors in German - * grammar-de - fix grammar and typography in German + * case-de - fix upper/lower case errors in German + * grammar-de - fix grammar and typography in German """
fixes = { @@ -19,9 +20,6 @@ # make remaining tags XHTML compliant. 'HTML': { 'regex': True, - # We don't want to mess up pages which discuss HTML tags, so we skip - # all pages which contain nowiki tags. - 'exceptions': ['<nowiki>'], 'msg': { 'en':u'Robot: converting/fixing HTML', 'de':u'Bot: konvertiere/korrigiere HTML', @@ -61,7 +59,15 @@ (r'(?i)([\r\n]) *<h5> *([^<]+?) *</h5> *([\r\n])', r"\1===== \2 =====\3"), (r'(?i)([\r\n]) *<h6> *([^<]+?) *</h6> *([\r\n])', r"\1====== \2 ======\3"), # TODO: maybe we can make the bot replace <p> tags with \r\n's. - ] + ], + 'exceptions': { + 'inside-tags': [ + 'nowiki', + 'comment', + 'math', + 'pre' + ], + } }, # Grammar fixes for German language 'grammar-de': { @@ -90,11 +96,29 @@ (u'([a-z](]])?) ,(([[)?[a-zA-Z])', r'\1, \3'), #(u'([a-z].)([A-Z])', r'\1 \2'), ], - 'exceptions': [ - 'sic!', - 'Ju 52/3m', # Flugzeugbezeichnung - 'AH-1W', # Hubschrauberbezeichnung - ] + 'exceptions': { + 'inside-tags': [ + 'nowiki', + 'comment', + 'math', + 'pre', # because of code examples + 'startspace', # because of code examples + 'hyperlink', # e.g. commas in URLs + 'gallery', # because of filenames + ], + 'text-contains': [ + r'sic!', + ], + 'inside': [ + r'Ju 52/3m', # Flugzeugbezeichnung + r'AH-1W', # Hubschrauberbezeichnung + r'\d+h \d+m', # Schreibweise für Zeiten, vor allem in Film-Infoboxen. Nicht korrekt, aber dafür schön kurz. + r'(?i)[[(Bild|Image|Media):.+?|', # Dateinamen auslassen + ], + 'title': [ + r'Arsen', # chemische Formel + ], + } }, # Do NOT run this automatically! # Recommendation: First run syntax-safe automatically, afterwards @@ -143,16 +167,24 @@ # mathematical context or program code. 
(r'{{([^{}]+?)}(?!})', r'{{\1}}'), ], - 'exceptions': [ - r'http://.*?object=tx%5C%7C', # regular dash in URL - r'http://.*?allmusic%5C.com', # regular dash in URL - r'http://.*?allmovie%5C.com', # regular dash in URL - r'http://physics.nist.gov/', # regular dash in URL - r'http://www.forum-seniorenarbeit.de/', # regular dash in URL - r'http://kuenstlerdatenbank.ifa.de/', # regular dash in URL - r'&object=med', # regular dash in URL - r'[CDATA[' # lots of brackets - ] + 'exceptions': { + 'inside-tags': [ + 'nowiki', + 'comment', + 'math', + 'pre', + ], + 'text-contains': [ + r'http://.*?object=tx%5C%7C', # regular dash in URL + r'http://.*?allmusic%5C.com', # regular dash in URL + r'http://.*?allmovie%5C.com', # regular dash in URL + r'http://physics.nist.gov/', # regular dash in URL + r'http://www.forum-seniorenarbeit.de/', # regular dash in URL + r'http://kuenstlerdatenbank.ifa.de/', # regular dash in URL + r'&object=med', # regular dash in URL + r'[CDATA[' # lots of brackets + ], + } }, # The same as syntax, but restricted to replacements that should # be safe to run automatically. @@ -210,7 +242,17 @@ (r'Tag der deutschen Einheit', r'Tag der Deutschen Einheit'), (r'\bzweite(r|n|) Weltkrieg', r'Zweite\1 Weltkrieg'), ], - 'exceptions': ['sic!'], + 'exceptions': { + 'inside-tags': [ + 'nowiki', + 'comment', + 'math', + 'pre', + ], + 'text-contains': [ + r'sic!', + ], + } }, 'vonbis': { 'regex': True, @@ -235,7 +277,13 @@ (u'[[EP]]', u'[[Extended Play|EP]]'), (u'[[MC]]', u'[[Musikkassette|MC]]'), (u'[[Single]]', u'[[Single (Musik)|Single]]'), - ] + ], + 'exceptions': { + 'inside-tags': [ + 'hyperlink', + ] + } + }, # format of dates of birth and death, for de: # python replace.py -fix:datum -ref:Vorlage:Personendaten @@ -257,11 +305,13 @@ #(u'†[[(\d)', u'† [[\1'), (u'[[(\d+. (?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)) (\d{1,4})]]', u'[[\1]] [[\2]]'), ], - 'exceptions': [ - u'[[20. Juli 1944]]', - u'[[17. 
Juni 1953]]', - u'[[11. September 2001]]', - ] + 'exceptions': { + 'inside': [ + r'[[20. Juli 1944]]', + r'[[17. Juni 1953]]', + r'[[11. September 2001]]', + ], + } }, 'isbn': { 'regex': True, @@ -276,8 +326,17 @@ # or spaces between digits and separators. # Note that these regular expressions also match valid ISBNs, but # these won't be changed. - (ur'ISBN (\d+) *[- –.] *(\d+) *[- –.] *(\d+) *[- –.] *(\d+) *[- –.] *(\d)(?!\d)', r'ISBN \1-\2-\3-\4-\5'), # ISBN13 + (ur'ISBN (978|979) *[- –.] *(\d+) *[- –.] *(\d+) *[- –.] *(\d+) *[- –.] *(\d)(?!\d)', r'ISBN \1-\2-\3-\4-\5'), # ISBN13 (r'ISBN (\d+) *[- –.] *(\d+) *[- –.] *(\d+) *[- –.] *(\d|X|x)(?!\d)', r'ISBN \1-\2-\3-\4'), # ISBN10 ], + 'exceptions': { + 'inside-tags': [ + 'comment', + ], + 'inside': [ + r'ISBN (\d(-?)){12}\d', # matches valid ISBN-13s + r'ISBN (\d(-?)){9}[\dXx]', # matches valid ISBN-10s + ], + } }, }
Modified: trunk/pywikipedia/replace.py =================================================================== --- trunk/pywikipedia/replace.py 2007-09-13 13:40:12 UTC (rev 4269) +++ trunk/pywikipedia/replace.py 2007-09-13 13:41:54 UTC (rev 4270) @@ -23,9 +23,19 @@
-nocase Use case insensitive regular expressions.
--except:XYZ Ignore pages which contain XYZ. If the -regex argument is +-excepttitle:XYZ Skip pages with titles that contain XYZ. If the -regex argument + is given, XYZ will be regarded as a regular expression. + +-excepttext:XYZ Skip pages which contain the text XYZ. If the -regex argument is given, XYZ will be regarded as a regular expression.
+-exceptinside:XYZ Skip occurrences of the to-be-replaced text which lie within XYZ. + If the -regex argument is given, XYZ will be regarded as a regular + expression. + +-exceptinsidetag:XYZ Skip occurrences of the to-be-replaced text which lie within an + XYZ tag. + -summary:XYZ Set the summary message text for the edit to XYZ, bypassing the predefined message texts with original and replacements inserted.
python replace.py -xml:foobar.xml "Errror" "Error" -namespace:0
-If you have a page called 'John Doe' and want to convert HTML tags to wiki -syntax, use: +If you have a page called 'John Doe' and want to fix the format of ISBNs, use:
- python replace.py -page:John_Doe -fix:HTML + python replace.py -page:John_Doe -fix:isbn + +This command will change 'referer' to 'referrer', but not in pages which +talk about HTTP, where the typo has become part of the standard: + + python replace.py referer referrer -file:typos.txt -excepttext:HTTP """ # # (C) Daniel Herding, 2004 @@ -83,7 +97,7 @@
from __future__ import generators import sys, re -import wikipedia, pagegenerators,catlib, config +import wikipedia, pagegenerators, catlib, config
# Imports predefined replacements tasks from fixes.py import fixes @@ -135,35 +149,51 @@ """ Arguments: * xmlFilename - The dump's path, either absolute or relative - * replacements - A list of 2-tuples of original text (as a compiled - regular expression) and replacement text (as a - string). - * exceptions - A list of compiled regular expression; pages which - contain text that matches one of these won't be - changed. + * replacements - A list of 2-tuples of original text (as a + compiled regular expression) and replacement + text (as a string). + * exceptions - A dictionary which defines when to ignore an + occurrence. See docu of the ReplaceRobot + constructor below. """
self.xmlFilename = xmlFilename self.replacements = replacements self.exceptions = exceptions
+ self.excsInside = [] + if self.exceptions.has_key('inside-tags'): + self.excsInside += self.exceptions['inside-tags'] + if self.exceptions.has_key('inside'): + self.excsInside += self.exceptions['inside'] + def __iter__(self): import xmlreader mysite = wikipedia.getSite() dump = xmlreader.XmlDump(self.xmlFilename) for entry in dump.parse(): - skip_page = False - for exception in self.exceptions: - if exception.search(entry.text): - skip_page = True - break - if not skip_page: - # TODO: leave out pages that only have old inside nowiki, comments, math + if not self.isTitleExcepted(entry.title) and not self.isTextExcepted(entry.text): + new_text = entry.text for old, new in self.replacements: - if old.search(entry.text): + new_text = wikipedia.replaceExcept(new_text, old, new, self.excsInside) + if new_text != entry.text: yield wikipedia.Page(mysite, entry.title) break
+ def isTitleExcepted(self, title): + if self.exceptions.has_key('title'): + for exc in self.exceptions['title']: + if exc.search(title): + return True + False + + def isTextExcepted(self, text): + if self.exceptions.has_key('text-contains'): + for exc in self.exceptions['text-contains']: + if exc.search(text): + return True + return False + class ReplaceRobot: """ A bot that can do text replacements. @@ -176,13 +206,30 @@ * replacements - A list of 2-tuples of original text (as a compiled regular expression) and replacement text (as a string). - * exceptions - A list of compiled regular expression; pages which - contain text that matches one of these won't be - changed. + * exceptions - A dictionary which defines when not to change an + occurence. See below. * acceptall - If True, the user won't be prompted before changes are made. * allowoverlap - If True, when matches overlap, all of them are replaced. * addedCat - If set to a value, add this category to every page touched. + + Structure of the exceptions dictionary: + This dictionary can have these keys: + + title + A list of regular expressions. All pages with titles that + are matched by one of these regular expressions are skipped. + text-contains + A list of regular expressions. All pages with text that + contains a part which is matched by one of these regular + expressions are skipped. + inside + A list of regular expressions. All occurences are skipped which + lie within a text region which is matched by one of these + regular expressions. + inside-tags + A list of strings. These strings must be keys from the + exceptionRegexes dictionary in wikipedia.replaceExcept(). """ self.generator = generator self.replacements = replacements @@ -192,25 +239,39 @@ self.recursive = recursive self.addedCat = addedCat
- def checkExceptions(self, original_text): + def isTitleExcepted(self, title): """ - If one of the exceptions applies for the given text, returns the - substring which matches the exception. Otherwise it returns None. + Iff one of the exceptions applies for the given title, returns True. """ - for exception in self.exceptions: - hit = exception.search(original_text) - if hit: - return hit.group(0) - return None + if self.exceptions.has_key('title'): + for exc in self.exceptions['title']: + if exc.search(title): + return True + return False
+ def isTextExcepted(self, original_text): + """ + Iff one of the exceptions applies for the given page contents, returns True. + """ + if self.exceptions.has_key('text-contains'): + for exc in self.exceptions['text-contains']: + if exc.search(original_text): + return True + return False + def doReplacements(self, original_text): """ Returns the text which is generated by applying all replacements to the given text. """ new_text = original_text + exceptions = [] + if self.exceptions.has_key('inside-tags'): + exceptions += self.exceptions['inside-tags'] + if self.exceptions.has_key('inside'): + exceptions += self.exceptions['inside'] for old, new in self.replacements: - new_text = wikipedia.replaceExcept(new_text, old, new, ['nowiki', 'comment', 'math', 'pre'], allowoverlap = self.allowoverlap) + new_text = wikipedia.replaceExcept(new_text, old, new, exceptions, allowoverlap = self.allowoverlap) return new_text
def run(self): @@ -231,11 +292,11 @@ continue except wikipedia.IsRedirectPage: original_text = page.get(get_redirect=True) - match = self.checkExceptions(original_text) - # skip all pages that contain certain texts - if match: - wikipedia.output(u'Skipping %s because it contains %s' % (page.aslink(), match)) + if self.isTitleExcepted(page.title()): + wikipedia.output(u'Skipping %s because the title is on the exceptions list.' % page.aslink()) else: + if self.isTextExcepted(original_text): + wikipedia.output(u'Skipping %s because it contains text that is on the exceptions list.' % page.aslink()) new_text = self.doReplacements(original_text) if new_text == original_text: wikipedia.output('No changes were necessary in %s' % page.aslink()) @@ -293,7 +354,12 @@ # A list of 2-tuples of original text and replacement text. replacements = [] # Don't edit pages which contain certain texts. - exceptions = [] + exceptions = { + 'title': [], + 'text-contains': [], + 'inside': [], + 'inside-tags': [], + } # Should the elements of 'replacements' and 'exceptions' be interpreted # as regular expressions? regex = False @@ -340,8 +406,14 @@ PageTitles.append(wikipedia.input(u'Which page do you want to chage?')) else: PageTitles.append(arg[6:]) - elif arg.startswith('-except:'): - exceptions.append(arg[8:]) + elif arg.startswith('-excepttitle:'): + exceptions['title'].append(arg[13:]) + elif arg.startswith('-excepttext:'): + exceptions['text-contains'].append(arg[12:]) + elif arg.startswith('-exceptinside:'): + exceptions['inside'].append(arg[14:]) + elif arg.startswith('-exceptinsidetag:'): + exceptions['inside-tags'].append(arg[17:]) elif arg.startswith('-fix:'): fix = arg[5:] elif arg == '-always': @@ -367,7 +439,7 @@ else: commandline_replacements.append(arg)
- if (len(commandline_replacements)%2): + if (len(commandline_replacements) % 2): raise wikipedia.Error, 'require even number of replacements.' elif (len(commandline_replacements) == 2 and fix == None): replacements.append((commandline_replacements[0], commandline_replacements[1])) @@ -375,10 +447,9 @@ wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg ) % ' (-' + commandline_replacements[0] + ' +' + commandline_replacements[1] + ')') elif (len(commandline_replacements) > 1): if (fix == None): - #replacements.extend(commandline_replacements) - for i in xrange (0,len(commandline_replacements),2): + for i in xrange (0, len(commandline_replacements), 2): replacements.append((commandline_replacements[i], - commandline_replacements[i+1])) + commandline_replacements[i + 1])) if summary_commandline == None: pairs = [(commandline_replacements[i], commandline_replacements[i + 1]) for i in range(0, len(commandline_replacements), 2)] replacementsDescription = '(' + ', '.join([('-' + pair[0] + ' +' + pair[1]) for pair in pairs]) + ')' @@ -433,15 +504,16 @@ else: oldR = re.compile(old, re.UNICODE) replacements[i] = oldR, new - for i in range(len(exceptions)): - exception = exceptions[i] - if not regex: - exception = re.escape(exception) - if caseInsensitive: - exceptionR = re.compile(exception, re.UNICODE | re.IGNORECASE) - else: - exceptionR = re.compile(exception, re.UNICODE) - exceptions[i] = exceptionR + for exceptionCategory in ['title', 'text-contains', 'inside']: + if exceptions.has_key(exceptionCategory): + patterns = exceptions[exceptionCategory] + if not regex: + patterns = [re.escape(pattern) for pattern in patterns] + if caseInsensitive: + patterns = [re.compile(pattern, re.UNICODE | re.IGNORECASE) for pattern in patterns] + else: + patterns = [re.compile(pattern, re.UNICODE) for pattern in patterns] + exceptions[exceptionCategory] = patterns
if xmlFilename: gen = XmlDumpReplacePageGenerator(xmlFilename, replacements, exceptions)
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2007-09-13 13:40:12 UTC (rev 4269) +++ trunk/pywikipedia/wikipedia.py 2007-09-13 13:41:54 UTC (rev 4270) @@ -2572,6 +2572,11 @@ marker - a string, it will be added to the last replacement, if nothing is changed, it is added at the end """ + # Hyperlink regex is defined in weblinkchecker.py. Only import + # when required. + if 'hyperlink' in exceptions: + import weblinkchecker + exceptionRegexes = { 'comment': re.compile(r'(?s)<!--.*?-->'), 'includeonly': re.compile(r'(?is)<includeonly>.*?</includeonly>'), @@ -2597,6 +2602,8 @@ # depth, we'd need recursion which can't be done in Python's re. # After all, the language of correct parenthesis words is not regular. 'template': re.compile(r'(?s){{(({{(({{.*?}})|.)*}})|.)*}}'), + 'hyperlink': weblinkchecker.compileLinkR(), + 'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'), }
# if we got a string, compile it as a regular expression @@ -2606,10 +2613,15 @@ else: old = re.compile(old)
- #noTouch = '|'.join([exceptions[name] for name in exceptList]) - #noTouchR = re.compile(noTouch) - # How much of the text we have looked at so far - dontTouchRegexes = [exceptionRegexes[name] for name in exceptions] + dontTouchRegexes = [] + for exc in exceptions: + if isinstance(exc, str) or isinstance(exc, unicode): + # assume it's a reference to the exceptionRegexes dictionary + # defined above. + dontTouchRegexes.append(exceptionRegexes[exc]) + else: + # assume it's a regular expression + dontTouchRegexes.append(exc) index = 0 markerpos = len(text) while True: