Revision: 4254 Author: wikipedian Date: 2007-09-12 12:29:22 +0000 (Wed, 12 Sep 2007)
Log Message: ----------- heavily simplified the code, using named groups etc. renamed identifiers to make it more understandable
Modified Paths: -------------- trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2007-09-12 10:54:24 UTC (rev 4253) +++ trunk/pywikipedia/wikipedia.py 2007-09-12 12:29:22 UTC (rev 4254) @@ -2042,67 +2042,71 @@
# Copyright (c) Orgullomoore, Bryan
- # TODO: document and simplify the code, use understandable variable names + # TODO: document and simplify the code site = self.site()
text = self.get() new_text = text
- def create_regex(s): - s = re.escape(s) - return ur'(?:[%s%s]%s)' % (s[0].upper(), s[0].lower(), s[1:]) - def create_regex_i(s): + def caseInsensitivePattern(s): """ - Creates a pattern that matches the string (unescaped), case-insensitively. - Somehow an awkward way of doing this. + Creates a pattern that matches the string case-insensitively. """ + s = re.escape(s) return ur'(?:%s)' % u''.join([u'[%s%s]' % (c.upper(), c.lower()) for c in s])
- namespaces = ('Image', 'Media') + site.namespace(6, all = True) + site.namespace(-2, all = True) + def capitalizationPattern(s): + """ + Given a string, creates a pattern that matches the string, with + the first letter case-insensitive if capitalization is switched + on on the site you're working on. + """ + s = re.escape(s) + if self.site().nocapitalize: + return s + else: + return ur'(?:[%s%s]%s)' % (s[0].upper(), s[0].lower(), s[1:]) + + namespaces = set(('Image', 'Media') + site.namespace(6, all = True) + site.namespace(-2, all = True)) # note that the colon is already included here - r_namespace = ur'\s*(?:%s)\s*:\s*' % u'|'.join(map(create_regex_i, namespaces)) - r_image = u'(%s)' % create_regex(image).replace(r'_', '[ _]') + namespacePattern = ur'\s*(?:%s)\s*:\s*' % u'|'.join(map(caseInsensitivePattern, namespaces))
- def simple_replacer(match, groupNumber = 1): + imagePattern = u'(%s)' % capitalizationPattern(image).replace(r'_', '[ _]') + + def filename_replacer(match): if replacement == None: return u'' else: - groups = list(match.groups()) - groups[groupNumber] = replacement - return u''.join(groups) + old = match.group() + return old[:match.start('filename')] + replacement + old[match.end('filename'):]
# The group params contains parameters such as thumb and 200px, as well # as the image caption. The caption can contain wiki links, but each # link has to be closed properly. - r_param = r'(?:\|(?:(?!\[\[).|\[\[.*?\]\])*?)' - rImage = re.compile(ur'(\[\[)(?P<namespace>%s)%s(?P<params>%s*?)(\]\])' % (r_namespace, r_image, r_param)) + paramPattern = r'(?:\|(?:(?!\[\[).|\[\[.*?\]\])*?)' + rImage = re.compile(ur'\[\[(?P<namespace>%s)(?P<filename>%s)(?P<params>%s*?)\]\]' % (namespacePattern, imagePattern, paramPattern)) + if replacement == None: + new_text = rImage.sub('', new_text) + else: + new_text = rImage.sub('[[\g<namespace>%s\g<params>]]' % replacement, new_text)
- while True: - m = rImage.search(new_text) - if not m: - break - new_text = new_text[:m.start()] + simple_replacer(m, 2) + new_text[m.end():] - - # Remove the image from galleries - r_galleries = ur'(?s)(<%s>)(?s)(.*?)(</%s>)' % (create_regex_i('gallery'), - create_regex_i('gallery')) - r_gallery_item = ur'(?m)^((?:%s)?)%s(\s*(?:\|.*?)?\s*)$' % (r_namespace, r_image) + galleryR = re.compile(r'(?is)<gallery>(?P<items>.*?)</gallery>') + galleryItemR = re.compile(r'(?m)^%s?(?P<filename>%s)\s*(?P<label>\|.*?)?\s*$' % (namespacePattern, imagePattern)) def gallery_replacer(match): - return ur'%s%s%s' % (match.group(1), - re.sub(r_gallery_item, simple_replacer, match.group(2)), - match.group(3)) - new_text = re.sub(r_galleries, gallery_replacer, new_text) + return ur'<gallery>%s<gallery>' % galleryItemR.sub(filename_replacer, match.group('items')) + new_text = galleryR.sub(gallery_replacer, new_text)
if (text == new_text) or (not safe): # All previous steps did not work, so the image is # likely embedded in a complicated template. - r_templates = ur'(?s)({{.*?}})' - r_complicated = u'(?s)((?:%s)?)%s' % (r_namespace, r_image) + # Note: this regular expression can't handle nested templates. + templateR = re.compile(ur'(?s){{(?<contents>.*?}}') + fileReferenceR = re.compile(u'%s(?P<filename>(?:%s)?)' % (namespacePattern, imagePattern))
def template_replacer(match): - return re.sub(r_complicated, simple_replacer, match.group(1)) - new_text = re.sub(r_templates, template_replacer, new_text) + return fileReferenceR.sub(filename_replacer, match.group('contents')) + new_text = templateR.sub(template_replacer, new_text)
if put: if text != new_text: