Revision: 3951 Author: wikipedian Date: 2007-08-02 14:56:28 +0000 (Thu, 02 Aug 2007)
Log Message: ----------- heavily simplified Page.replaceImage()
Modified Paths: -------------- trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2007-08-02 12:11:29 UTC (rev 3950) +++ trunk/pywikipedia/wikipedia.py 2007-08-02 14:56:28 UTC (rev 3951) @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +## -*- coding: utf-8 -*- """ Library to get and put pages on a MediaWiki.
@@ -2003,62 +2003,32 @@ return ur'(?:[%s%s]%s)' % (s[0].upper(), s[0].lower(), s[1:]) def create_regex_i(s): return ur'(?:%s)' % u''.join([u'[%s%s]' % (c.upper(), c.lower()) for c in s]) - + namespaces = ('Image', 'Media') + site.namespace(6, all = True) + site.namespace(-2, all = True) + # note that the colon is already included here r_namespace = ur'\s*(?:%s)\s*:\s*' % u'|'.join(map(create_regex_i, namespaces)) r_image = u'(%s)' % create_regex(image).replace(r'\_', '[ _]')
- def simple_replacer(match): + def simple_replacer(match, groupNumber = 1): if replacement == None: return u'' else: groups = list(match.groups()) - groups[1] = replacement + groups[groupNumber] = replacement return u''.join(groups) - - # Previously links in image descriptions will cause - # unexpected behaviour: [[Image:image.jpg|thumb|[[link]] in description]] - # will truncate at the first occurence of ]]. This cannot be - # fixed using one regular expression. - # This means that all ]] after the start of the image - # must be located. If it then does not have an associated - # [[, this one is the closure of the image. - - r_simple_s = u'(\[\[%s)%s' % (r_namespace, r_image) - r_s = '\[\[' - r_e = '\]\]' - # First determine where wikilinks start and end - image_starts = [match.start() for match in re.finditer(r_simple_s, text)] - link_starts = [match.start() for match in re.finditer(r_s, text)] - link_ends = [match.end() for match in re.finditer(r_e, text)] - - r_simple = u'(\[\[%s)%s(.*)' % (r_namespace, r_image) - replacements = [] - for image_start in image_starts: - current_link_starts = [link_start for link_start in link_starts - if link_start > image_start] - current_link_ends = [link_end for link_end in link_ends - if link_end > image_start] - end = image_start - if current_link_ends: end = current_link_ends[0] - - while current_link_starts and current_link_ends: - start = current_link_starts.pop(0) - end = current_link_ends.pop(0) - if end <= start and end > image_start: - # Found the end of the image - break - - # Add the replacement to the todo list. Doing the - # replacement right know would alter the indices. - replacements.append((new_text[image_start:end], - re.sub(r_simple, simple_replacer, - new_text[image_start:end]))) - - # Perform the replacements - for old, new in replacements: - if old: new_text = new_text.replace(old, new) - + + # The group params contains parameters such as thumb and 200px, as well + # as the image caption.
The caption can contain wiki links, but each + # link has to be closed properly. + r_param = r'(?:\|(?:(?!\[\[).|\[\[.*?\]\])*?)' + rImage = re.compile(ur'(\[\[)(?P<namespace>%s)%s(?P<params>%s*?)(\]\])' % (r_namespace, r_image, r_param)) + + while True: + m = rImage.search(new_text) + if not m: + break + new_text = new_text[:m.start()] + simple_replacer(m, 2) + new_text[m.end():] + # Remove the image from galleries r_galleries = ur'(?s)(<%s>)(?s)(.*?)(</%s>)' % (create_regex_i('gallery'), create_regex_i('gallery'))