[Pywikipedia-l] SVN: [4254] trunk/pywikipedia/wikipedia.py - pywikibot

12 Sep 2007

Revision: 4254
Author:   wikipedian
Date:     2007-09-12 12:29:22 +0000 (Wed, 12 Sep 2007)
Log Message:
-----------
heavily simplified the code, using named groups etc.
renamed identifiers to make it more understandable
Modified Paths:
--------------
    trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================

--- trunk/pywikipedia/wikipedia.py	2007-09-12 10:54:24 UTC (rev 4253)
+++ trunk/pywikipedia/wikipedia.py	2007-09-12 12:29:22 UTC (rev 4254)
@@ -2042,67 +2042,71 @@
         # Copyright (c) Orgullomoore, Bryan
-        # TODO: document and simplify the code, use understandable variable names
+        # TODO: document and simplify the code
         site = self.site()
         text = self.get()
         new_text = text
-        def create_regex(s):
-            s = re.escape(s)
-            return ur'(?:[%s%s]%s)' % (s[0].upper(), s[0].lower(), s[1:])
-        def create_regex_i(s):
+        def caseInsensitivePattern(s):
             """
-            Creates a pattern that matches the string (unescaped), case-insensitively.
-            Somehow an awkward way of doing this.
+            Creates a pattern that matches the string case-insensitively.
             """
+            s = re.escape(s)
             return ur'(?:%s)' % u''.join([u'[%s%s]' % (c.upper(), c.lower()) for c in s])
-        namespaces = ('Image', 'Media') + site.namespace(6, all = True) + site.namespace(-2, all = True)
+        def capitalizationPattern(s):
+            """
+            Given a string, creates a pattern that matches the string, with
+            the first letter case-insensitive if capitalization is switched
+            on on the site you're working on.
+            """
+            s = re.escape(s)
+            if self.site().nocapitalize:
+                return s
+            else:
+                return ur'(?:[%s%s]%s)' % (s[0].upper(), s[0].lower(), s[1:])
+
+        namespaces = set(('Image', 'Media') + site.namespace(6, all = True) + site.namespace(-2, all = True))
         # note that the colon is already included here
-        r_namespace = ur'\s*(?:%s)\s*:\s*' % u'|'.join(map(create_regex_i, namespaces))
-        r_image = u'(%s)' % create_regex(image).replace(r'\_', '[ _]')
+        namespacePattern = ur'\s*(?:%s)\s*:\s*' % u'|'.join(map(caseInsensitivePattern, namespaces))
-        def simple_replacer(match, groupNumber = 1):
+        imagePattern = u'(%s)' % capitalizationPattern(image).replace(r'\_', '[ _]')
+
+        def filename_replacer(match):
             if replacement == None:
                 return u''
             else:
-                groups = list(match.groups())
-                groups[groupNumber] = replacement
-                return u''.join(groups)
+                old = match.group()
+                return old[:match.start('filename')] + replacement + old[match.end('filename'):]
         # The group params contains parameters such as thumb and 200px, as well
         # as the image caption. The caption can contain wiki links, but each
         # link has to be closed properly.
-        r_param = r'(?:\|(?:(?!\[\[).|\[\[.*?\]\])*?)'
-        rImage = re.compile(ur'(\[\[)(?P<namespace>%s)%s(?P<params>%s*?)(\]\])' % (r_namespace, r_image, r_param))
+        paramPattern = r'(?:\|(?:(?!\[\[).|\[\[.*?\]\])*?)'
+        rImage = re.compile(ur'\[\[(?P<namespace>%s)(?P<filename>%s)(?P<params>%s*?)\]\]' % (namespacePattern, imagePattern, paramPattern))
+        if replacement == None:
+            new_text = rImage.sub('', new_text)
+        else:
+            new_text = rImage.sub('[[\g<namespace>%s\g<params>]]' % replacement, new_text)
-        while True:
-            m = rImage.search(new_text)
-            if not m:
-                break
-            new_text = new_text[:m.start()] +  simple_replacer(m, 2) + new_text[m.end():]
-
-
         # Remove the image from galleries
-        r_galleries = ur'(?s)(<%s>)(?s)(.*?)(</%s>)' % (create_regex_i('gallery'),
-            create_regex_i('gallery'))
-        r_gallery_item = ur'(?m)^((?:%s)?)%s(\s*(?:\|.*?)?\s*)$' % (r_namespace, r_image)
+        galleryR = re.compile(r'(?is)<gallery>(?P<items>.*?)</gallery>')
+        galleryItemR = re.compile(r'(?m)^%s?(?P<filename>%s)\s*(?P<label>\|.*?)?\s*$' % (namespacePattern, imagePattern))
         def gallery_replacer(match):
-            return ur'%s%s%s' % (match.group(1),
-                re.sub(r_gallery_item, simple_replacer, match.group(2)),
-                match.group(3))
-        new_text = re.sub(r_galleries, gallery_replacer, new_text)
+            return ur'<gallery>%s<gallery>' % galleryItemR.sub(filename_replacer, match.group('items'))
+        new_text = galleryR.sub(gallery_replacer, new_text)
         if (text == new_text) or (not safe):
             # All previous steps did not work, so the image is
             # likely embedded in a complicated template.
-            r_templates = ur'(?s)({{.*?}})'
-            r_complicated = u'(?s)((?:%s)?)%s' % (r_namespace, r_image)
+            # Note: this regular expression can't handle nested templates.
+            templateR = re.compile(ur'(?s){{(?<contents>.*?}}')
+            fileReferenceR = re.compile(u'%s(?P<filename>(?:%s)?)' % (namespacePattern, imagePattern))
             def template_replacer(match):
-                return re.sub(r_complicated, simple_replacer, match.group(1))
-            new_text = re.sub(r_templates, template_replacer, new_text)
+                return fileReferenceR.sub(filename_replacer, match.group('contents'))
+            new_text = templateR.sub(template_replacer, new_text)
         if put:
             if text != new_text: