Revision: 4270 Author: wikipedian Date: 2007-09-13 13:41:54 +0000 (Thu, 13 Sep 2007)
Log Message: ----------- ReplaceRobot: heavily improved the way exceptions are specified. It is now possible to state things like 'fix commas, but not inside URLs and not in pages that start with XY.'
Modified Paths: -------------- trunk/pywikipedia/fixes.py trunk/pywikipedia/replace.py trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/fixes.py =================================================================== --- trunk/pywikipedia/fixes.py 2007-09-13 13:40:12 UTC (rev 4269) +++ trunk/pywikipedia/fixes.py 2007-09-13 13:41:54 UTC (rev 4270) @@ -5,13 +5,14 @@ help = """ * HTML - Convert HTML tags to wiki syntax, and fix XHTML. + * isbn - Fix badly formatted ISBNs. * syntax - Try to fix bad wiki markup. Do not run this in automatic mode, as the bot may make mistakes. * syntax-safe - Like syntax, but less risky, so you can run this in automatic mode. - * case-de - fix upper/lower case errors in German - * grammar-de - fix grammar and typography in German + * case-de - fix upper/lower case errors in German + * grammar-de - fix grammar and typography in German """
fixes = { @@ -19,9 +20,6 @@ # make remaining tags XHTML compliant. 'HTML': { 'regex': True, - # We don't want to mess up pages which discuss HTML tags, so we skip - # all pages which contain nowiki tags. - 'exceptions': ['<nowiki>'], 'msg': { 'en':u'Robot: converting/fixing HTML', 'de':u'Bot: konvertiere/korrigiere HTML', @@ -61,7 +59,15 @@ (r'(?i)([\r\n]) *<h5> *([^<]+?) *</h5> *([\r\n])', r"\1===== \2 =====\3"), (r'(?i)([\r\n]) *<h6> *([^<]+?) *</h6> *([\r\n])', r"\1====== \2 ======\3"), # TODO: maybe we can make the bot replace <p> tags with \r\n's. - ] + ], + 'exceptions': { + 'inside-tags': [ + 'nowiki', + 'comment', + 'math', + 'pre' + ], + } }, # Grammar fixes for German language 'grammar-de': { @@ -90,11 +96,29 @@ (u'([a-z](]])?) ,(([[)?[a-zA-Z])', r'\1, \3'), #(u'([a-z].)([A-Z])', r'\1 \2'), ], - 'exceptions': [ - 'sic!', - 'Ju 52/3m', # Flugzeugbezeichnung - 'AH-1W', # Hubschrauberbezeichnung - ] + 'exceptions': { + 'inside-tags': [ + 'nowiki', + 'comment', + 'math', + 'pre', # because of code examples + 'startspace', # because of code examples + 'hyperlink', # e.g. commas in URLs + 'gallery', # because of filenames + ], + 'text-contains': [ + r'sic!', + ], + 'inside': [ + r'Ju 52/3m', # Flugzeugbezeichnung + r'AH-1W', # Hubschrauberbezeichnung + r'\d+h \d+m', # Schreibweise für Zeiten, vor allem in Film-Infoboxen. Nicht korrekt, aber dafür schön kurz. + r'(?i)[[(Bild|Image|Media):.+?|', # Dateinamen auslassen + ], + 'title': [ + r'Arsen', # chemische Formel + ], + } }, # Do NOT run this automatically! # Recommendation: First run syntax-safe automatically, afterwards @@ -143,16 +167,24 @@ # mathematical context or program code. 
(r'{{([^{}]+?)}(?!})', r'{{\1}}'), ], - 'exceptions': [ - r'http://.*?object=tx%5C%7C', # regular dash in URL - r'http://.*?allmusic%5C.com', # regular dash in URL - r'http://.*?allmovie%5C.com', # regular dash in URL - r'http://physics.nist.gov/', # regular dash in URL - r'http://www.forum-seniorenarbeit.de/', # regular dash in URL - r'http://kuenstlerdatenbank.ifa.de/', # regular dash in URL - r'&object=med', # regular dash in URL - r'[CDATA[' # lots of brackets - ] + 'exceptions': { + 'inside-tags': [ + 'nowiki', + 'comment', + 'math', + 'pre', + ], + 'text-contains': [ + r'http://.*?object=tx%5C%7C', # regular dash in URL + r'http://.*?allmusic%5C.com', # regular dash in URL + r'http://.*?allmovie%5C.com', # regular dash in URL + r'http://physics.nist.gov/', # regular dash in URL + r'http://www.forum-seniorenarbeit.de/', # regular dash in URL + r'http://kuenstlerdatenbank.ifa.de/', # regular dash in URL + r'&object=med', # regular dash in URL + r'[CDATA[' # lots of brackets + ], + } }, # The same as syntax, but restricted to replacements that should # be safe to run automatically. @@ -210,7 +242,17 @@ (r'Tag der deutschen Einheit', r'Tag der Deutschen Einheit'), (r'\bzweite(r|n|) Weltkrieg', r'Zweite\1 Weltkrieg'), ], - 'exceptions': ['sic!'], + 'exceptions': { + 'inside-tags': [ + 'nowiki', + 'comment', + 'math', + 'pre', + ], + 'text-contains': [ + r'sic!', + ], + } }, 'vonbis': { 'regex': True, @@ -235,7 +277,13 @@ (u'[[EP]]', u'[[Extended Play|EP]]'), (u'[[MC]]', u'[[Musikkassette|MC]]'), (u'[[Single]]', u'[[Single (Musik)|Single]]'), - ] + ], + 'exceptions': { + 'inside-tags': [ + 'hyperlink', + ] + } + }, # format of dates of birth and death, for de: # python replace.py -fix:datum -ref:Vorlage:Personendaten @@ -257,11 +305,13 @@ #(u'†[[(\d)', u'† [[\1'), (u'[[(\d+. (?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)) (\d{1,4})]]', u'[[\1]] [[\2]]'), ], - 'exceptions': [ - u'[[20. Juli 1944]]', - u'[[17. 
Juni 1953]]', - u'[[11. September 2001]]', - ] + 'exceptions': { + 'inside': [ + r'[[20. Juli 1944]]', + r'[[17. Juni 1953]]', + r'[[11. September 2001]]', + ], + } }, 'isbn': { 'regex': True, @@ -276,8 +326,17 @@ # or spaces between digits and separators. # Note that these regular expressions also match valid ISBNs, but # these won't be changed. - (ur'ISBN (\d+) *[- –.] *(\d+) *[- –.] *(\d+) *[- –.] *(\d+) *[- –.] *(\d)(?!\d)', r'ISBN \1-\2-\3-\4-\5'), # ISBN13 + (ur'ISBN (978|979) *[- –.] *(\d+) *[- –.] *(\d+) *[- –.] *(\d+) *[- –.] *(\d)(?!\d)', r'ISBN \1-\2-\3-\4-\5'), # ISBN13 (r'ISBN (\d+) *[- –.] *(\d+) *[- –.] *(\d+) *[- –.] *(\d|X|x)(?!\d)', r'ISBN \1-\2-\3-\4'), # ISBN10 ], + 'exceptions': { + 'inside-tags': [ + 'comment', + ], + 'inside': [ + r'ISBN (\d(-?)){12}\d', # matches valid ISBN-13s + r'ISBN (\d(-?)){9}[\dXx]', # matches valid ISBN-10s + ], + } }, }
Modified: trunk/pywikipedia/replace.py =================================================================== --- trunk/pywikipedia/replace.py 2007-09-13 13:40:12 UTC (rev 4269) +++ trunk/pywikipedia/replace.py 2007-09-13 13:41:54 UTC (rev 4270) @@ -23,9 +23,19 @@
-nocase Use case insensitive regular expressions.
--except:XYZ Ignore pages which contain XYZ. If the -regex argument is +-excepttitle:XYZ Skip pages with titles that contain XYZ. If the -regex argument + is given, XYZ will be regarded as a regular expression. + +-excepttext:XYZ Skip pages which contain the text XYZ. If the -regex argument is given, XYZ will be regarded as a regular expression.
+-exceptinside:XYZ Skip occurrences of the to-be-replaced text which lie within XYZ. + If the -regex argument is given, XYZ will be regarded as a regular + expression. + +-exceptinsidetag:XYZ Skip occurrences of the to-be-replaced text which lie within an + XYZ tag. + -summary:XYZ Set the summary message text for the edit to XYZ, bypassing the predefined message texts with original and replacements inserted.
python replace.py -xml:foobar.xml "Errror" "Error" -namespace:0
-If you have a page called 'John Doe' and want to convert HTML tags to wiki -syntax, use: +If you have a page called 'John Doe' and want to fix the format of ISBNs, use:
- python replace.py -page:John_Doe -fix:HTML + python replace.py -page:John_Doe -fix:isbn + +This command will change 'referer' to 'referrer', but not in pages which +talk about HTTP, where the typo has become part of the standard: + + python replace.py referer referrer -file:typos.txt -excepttext:HTTP """ # # (C) Daniel Herding, 2004 @@ -83,7 +97,7 @@
from __future__ import generators import sys, re -import wikipedia, pagegenerators,catlib, config +import wikipedia, pagegenerators, catlib, config
# Imports predefined replacements tasks from fixes.py import fixes @@ -135,35 +149,51 @@ """ Arguments: * xmlFilename - The dump's path, either absolute or relative - * replacements - A list of 2-tuples of original text (as a compiled - regular expression) and replacement text (as a - string). - * exceptions - A list of compiled regular expression; pages which - contain text that matches one of these won't be - changed. + * replacements - A list of 2-tuples of original text (as a + compiled regular expression) and replacement + text (as a string). + * exceptions - A dictionary which defines when to ignore an + occurrence. See docu of the ReplaceRobot + constructor below. """
self.xmlFilename = xmlFilename self.replacements = replacements self.exceptions = exceptions
+ self.excsInside = [] + if self.exceptions.has_key('inside-tags'): + self.excsInside += self.exceptions['inside-tags'] + if self.exceptions.has_key('inside'): + self.excsInside += self.exceptions['inside'] + def __iter__(self): import xmlreader mysite = wikipedia.getSite() dump = xmlreader.XmlDump(self.xmlFilename) for entry in dump.parse(): - skip_page = False - for exception in self.exceptions: - if exception.search(entry.text): - skip_page = True - break - if not skip_page: - # TODO: leave out pages that only have old inside nowiki, comments, math + if not self.isTitleExcepted(entry.title) and not self.isTextExcepted(entry.text): + new_text = entry.text for old, new in self.replacements: - if old.search(entry.text): + new_text = wikipedia.replaceExcept(new_text, old, new, self.excsInside) + if new_text != entry.text: yield wikipedia.Page(mysite, entry.title) break
+ def isTitleExcepted(self, title): + if self.exceptions.has_key('title'): + for exc in self.exceptions['title']: + if exc.search(title): + return True + False + + def isTextExcepted(self, text): + if self.exceptions.has_key('text-contains'): + for exc in self.exceptions['text-contains']: + if exc.search(text): + return True + return False + class ReplaceRobot: """ A bot that can do text replacements. @@ -176,13 +206,30 @@ * replacements - A list of 2-tuples of original text (as a compiled regular expression) and replacement text (as a string). - * exceptions - A list of compiled regular expression; pages which - contain text that matches one of these won't be - changed. + * exceptions - A dictionary which defines when not to change an + occurence. See below. * acceptall - If True, the user won't be prompted before changes are made. * allowoverlap - If True, when matches overlap, all of them are replaced. * addedCat - If set to a value, add this category to every page touched. + + Structure of the exceptions dictionary: + This dictionary can have these keys: + + title + A list of regular expressions. All pages with titles that + are matched by one of these regular expressions are skipped. + text-contains + A list of regular expressions. All pages with text that + contains a part which is matched by one of these regular + expressions are skipped. + inside + A list of regular expressions. All occurences are skipped which + lie within a text region which is matched by one of these + regular expressions. + inside-tags + A list of strings. These strings must be keys from the + exceptionRegexes dictionary in wikipedia.replaceExcept(). """ self.generator = generator self.replacements = replacements @@ -192,25 +239,39 @@ self.recursive = recursive self.addedCat = addedCat
- def checkExceptions(self, original_text): + def isTitleExcepted(self, title): """ - If one of the exceptions applies for the given text, returns the - substring which matches the exception. Otherwise it returns None. + Iff one of the exceptions applies for the given title, returns True. """ - for exception in self.exceptions: - hit = exception.search(original_text) - if hit: - return hit.group(0) - return None + if self.exceptions.has_key('title'): + for exc in self.exceptions['title']: + if exc.search(title): + return True + return False
+ def isTextExcepted(self, original_text): + """ + Iff one of the exceptions applies for the given page contents, returns True. + """ + if self.exceptions.has_key('text-contains'): + for exc in self.exceptions['text-contains']: + if exc.search(original_text): + return True + return False + def doReplacements(self, original_text): """ Returns the text which is generated by applying all replacements to the given text. """ new_text = original_text + exceptions = [] + if self.exceptions.has_key('inside-tags'): + exceptions += self.exceptions['inside-tags'] + if self.exceptions.has_key('inside'): + exceptions += self.exceptions['inside'] for old, new in self.replacements: - new_text = wikipedia.replaceExcept(new_text, old, new, ['nowiki', 'comment', 'math', 'pre'], allowoverlap = self.allowoverlap) + new_text = wikipedia.replaceExcept(new_text, old, new, exceptions, allowoverlap = self.allowoverlap) return new_text
def run(self): @@ -231,11 +292,11 @@ continue except wikipedia.IsRedirectPage: original_text = page.get(get_redirect=True) - match = self.checkExceptions(original_text) - # skip all pages that contain certain texts - if match: - wikipedia.output(u'Skipping %s because it contains %s' % (page.aslink(), match)) + if self.isTitleExcepted(page.title()): + wikipedia.output(u'Skipping %s because the title is on the exceptions list.' % page.aslink()) else: + if self.isTextExcepted(original_text): + wikipedia.output(u'Skipping %s because it contains text that is on the exceptions list.' % page.aslink()) new_text = self.doReplacements(original_text) if new_text == original_text: wikipedia.output('No changes were necessary in %s' % page.aslink()) @@ -293,7 +354,12 @@ # A list of 2-tuples of original text and replacement text. replacements = [] # Don't edit pages which contain certain texts. - exceptions = [] + exceptions = { + 'title': [], + 'text-contains': [], + 'inside': [], + 'inside-tags': [], + } # Should the elements of 'replacements' and 'exceptions' be interpreted # as regular expressions? regex = False @@ -340,8 +406,14 @@ PageTitles.append(wikipedia.input(u'Which page do you want to chage?')) else: PageTitles.append(arg[6:]) - elif arg.startswith('-except:'): - exceptions.append(arg[8:]) + elif arg.startswith('-excepttitle:'): + exceptions['title'].append(arg[13:]) + elif arg.startswith('-excepttext:'): + exceptions['text-contains'].append(arg[12:]) + elif arg.startswith('-exceptinside:'): + exceptions['inside'].append(arg[14:]) + elif arg.startswith('-exceptinsidetag:'): + exceptions['inside-tags'].append(arg[17:]) elif arg.startswith('-fix:'): fix = arg[5:] elif arg == '-always': @@ -367,7 +439,7 @@ else: commandline_replacements.append(arg)
- if (len(commandline_replacements)%2): + if (len(commandline_replacements) % 2): raise wikipedia.Error, 'require even number of replacements.' elif (len(commandline_replacements) == 2 and fix == None): replacements.append((commandline_replacements[0], commandline_replacements[1])) @@ -375,10 +447,9 @@ wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg ) % ' (-' + commandline_replacements[0] + ' +' + commandline_replacements[1] + ')') elif (len(commandline_replacements) > 1): if (fix == None): - #replacements.extend(commandline_replacements) - for i in xrange (0,len(commandline_replacements),2): + for i in xrange (0, len(commandline_replacements), 2): replacements.append((commandline_replacements[i], - commandline_replacements[i+1])) + commandline_replacements[i + 1])) if summary_commandline == None: pairs = [(commandline_replacements[i], commandline_replacements[i + 1]) for i in range(0, len(commandline_replacements), 2)] replacementsDescription = '(' + ', '.join([('-' + pair[0] + ' +' + pair[1]) for pair in pairs]) + ')' @@ -433,15 +504,16 @@ else: oldR = re.compile(old, re.UNICODE) replacements[i] = oldR, new - for i in range(len(exceptions)): - exception = exceptions[i] - if not regex: - exception = re.escape(exception) - if caseInsensitive: - exceptionR = re.compile(exception, re.UNICODE | re.IGNORECASE) - else: - exceptionR = re.compile(exception, re.UNICODE) - exceptions[i] = exceptionR + for exceptionCategory in ['title', 'text-contains', 'inside']: + if exceptions.has_key(exceptionCategory): + patterns = exceptions[exceptionCategory] + if not regex: + patterns = [re.escape(pattern) for pattern in patterns] + if caseInsensitive: + patterns = [re.compile(pattern, re.UNICODE | re.IGNORECASE) for pattern in patterns] + else: + patterns = [re.compile(pattern, re.UNICODE) for pattern in patterns] + exceptions[exceptionCategory] = patterns
if xmlFilename: gen = XmlDumpReplacePageGenerator(xmlFilename, replacements, exceptions)
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2007-09-13 13:40:12 UTC (rev 4269) +++ trunk/pywikipedia/wikipedia.py 2007-09-13 13:41:54 UTC (rev 4270) @@ -2572,6 +2572,11 @@ marker - a string, it will be added to the last replacement, if nothing is changed, it is added at the end """ + # Hyperlink regex is defined in weblinkchecker.py. Only import + # when required. + if 'hyperlink' in exceptions: + import weblinkchecker + exceptionRegexes = { 'comment': re.compile(r'(?s)<!--.*?-->'), 'includeonly': re.compile(r'(?is)<includeonly>.*?</includeonly>'), @@ -2597,6 +2602,8 @@ # depth, we'd need recursion which can't be done in Python's re. # After all, the language of correct parenthesis words is not regular. 'template': re.compile(r'(?s){{(({{(({{.*?}})|.)*}})|.)*}}'), + 'hyperlink': weblinkchecker.compileLinkR(), + 'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'), }
# if we got a string, compile it as a regular expression @@ -2606,10 +2613,15 @@ else: old = re.compile(old)
- #noTouch = '|'.join([exceptions[name] for name in exceptList]) - #noTouchR = re.compile(noTouch) - # How much of the text we have looked at so far - dontTouchRegexes = [exceptionRegexes[name] for name in exceptions] + dontTouchRegexes = [] + for exc in exceptions: + if isinstance(exc, str) or isinstance(exc, unicode): + # assume it's a reference to the exceptionRegexes dictionary + # defined above. + dontTouchRegexes.append(exceptionRegexes[exc]) + else: + # assume it's a regular expression + dontTouchRegexes.append(exc) index = 0 markerpos = len(text) while True: