Revision: 5790 Author: wikipedian Date: 2008-08-07 10:10:46 +0000 (Thu, 07 Aug 2008)
Log Message: ----------- Reverted r5788 by alnokta/Betacommand.
Instead, added an 'interwiki' exception regex to wikipedia.replaceExcept(), so interwiki links can be protected through the existing option. You can now run replace.py with this parameter: -exceptinsidetag:interwiki
Please try to keep things simple and don't add so many parameters to the scripts, especially when they're inconsistent with the existing ones.
Modified Paths: -------------- trunk/pywikipedia/replace.py trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/replace.py =================================================================== --- trunk/pywikipedia/replace.py 2008-08-07 00:25:37 UTC (rev 5789) +++ trunk/pywikipedia/replace.py 2008-08-07 10:10:46 UTC (rev 5790) @@ -35,7 +35,7 @@
-requiretitle:XYZ Only do pages with titles that contain XYZ. If the -regex argument is given, XYZ will be regarded as a regular - expression. + expression.
-excepttext:XYZ Skip pages which contain the text XYZ. If the -regex argument is given, XYZ will be regarded as a regular @@ -212,8 +212,7 @@ and not self.isTextExcepted(entry.text): new_text = entry.text for old, new in self.replacements: - new_text = wikipedia.replaceExcept( - new_text, old, new, self.excsInside) + new_text = wikipedia.replaceExcept(new_text, old, new, self.excsInside, self.site) if new_text != entry.text: yield wikipedia.Page(self.site, entry.title) except KeyboardInterrupt: @@ -251,7 +250,7 @@ """ def __init__(self, generator, replacements, exceptions={}, acceptall=False, allowoverlap=False, recursive=False, - addedCat=None, sleep=None, exceptinterwiki=False): + addedCat=None, sleep=None): """ Arguments: * generator - A generator that yields Page objects. @@ -292,7 +291,6 @@ self.acceptall = acceptall self.allowoverlap = allowoverlap self.recursive = recursive - self.exceptinterwiki = exceptinterwiki if addedCat: site = wikipedia.getSite() cat_ns = site.category_namespaces()[0] @@ -339,13 +337,8 @@ for old, new in self.replacements: if self.sleep != None: time.sleep(self.sleep) - if self.exceptinterwiki: - interwikis = wikipedia.getLanguageLinks(new_text) - new_text = wikipedia.removeLanguageLinks(new_text) new_text = wikipedia.replaceExcept(new_text, old, new, exceptions, allowoverlap=self.allowoverlap) - if self.exceptinterwiki: - new_text = wikipedia.replaceLanguageLinks(new_text,interwikis) return new_text
def run(self): @@ -498,8 +491,6 @@ allowoverlap = False # Do not recurse replacement recursive = False - #add flag to ignore interwiki links - exceptinterwiki = False # This factory is responsible for processing command line arguments # that are also used by other scripts and that determine on which pages # to work on. @@ -551,8 +542,6 @@ sleep = float(arg[7:]) elif arg == '-always': acceptall = True - elif arg == '-exceptinterwiki': - exceptinterwiki = True elif arg == '-recursive': recursive = True elif arg == '-nocase': @@ -711,7 +700,7 @@ pageNumber=20, lookahead=100) else: preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=60) - bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall, allowoverlap, recursive, add_cat, sleep, exceptinterwiki) + bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall, allowoverlap, recursive, add_cat, sleep) bot.run()
if __name__ == "__main__":
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2008-08-07 00:25:37 UTC (rev 5789) +++ trunk/pywikipedia/wikipedia.py 2008-08-07 10:10:46 UTC (rev 5790) @@ -3143,7 +3143,7 @@ # All return the modified text as a unicode object
def replaceExcept(text, old, new, exceptions, caseInsensitive=False, - allowoverlap=False, marker = ''): + allowoverlap=False, marker = '', site = None): """ Return text with 'old' replaced by 'new', ignoring specified types of text.
@@ -3169,6 +3169,9 @@ # Hyperlink regex is defined in weblinkchecker.py import weblinkchecker
+ if site is None: + site = getSite() + exceptionRegexes = { 'comment': re.compile(r'(?s)<!--.*?-->'), # section headers @@ -3203,7 +3206,10 @@ 'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'), # this matches internal wikilinks, but also interwiki, categories, and # images. - 'link': re.compile(r'\[\[(?P<title>[^\]\|]*)(\|[^\]]*)?\]\]') + 'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'), + 'interwiki': re.compile(r'(?i)\[\[(%s)\s?:[^\]]*\]\][\s]*' + % '|'.join(site.validLanguageLinks() + site.family.obsolete.keys())), + }
# if we got a string, compile it as a regular expression
pywikipedia-l@lists.wikimedia.org