Revision: 5790
Author: wikipedian
Date: 2008-08-07 10:10:46 +0000 (Thu, 07 Aug 2008)
Log Message:
-----------
Reverted r5788 by alnokta/Betacommand.
Instead, added an interwiki regex to wikipedia.replaceExcept(). You can now run replace.py with this parameter:
-exceptinsidetag:interwiki
Please try to keep things simple and avoid adding new parameters to the scripts,
especially when they are inconsistent with the existing ones.
Modified Paths:
--------------
trunk/pywikipedia/replace.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/replace.py
===================================================================
--- trunk/pywikipedia/replace.py 2008-08-07 00:25:37 UTC (rev 5789)
+++ trunk/pywikipedia/replace.py 2008-08-07 10:10:46 UTC (rev 5790)
@@ -35,7 +35,7 @@
-requiretitle:XYZ Only do pages with titles that contain XYZ. If the -regex
argument is given, XYZ will be regarded as a regular
- expression.
+ expression.
-excepttext:XYZ Skip pages which contain the text XYZ. If the -regex
argument is given, XYZ will be regarded as a regular
@@ -212,8 +212,7 @@
and not self.isTextExcepted(entry.text):
new_text = entry.text
for old, new in self.replacements:
- new_text = wikipedia.replaceExcept(
- new_text, old, new, self.excsInside)
+ new_text = wikipedia.replaceExcept(new_text, old, new, self.excsInside, self.site)
if new_text != entry.text:
yield wikipedia.Page(self.site, entry.title)
except KeyboardInterrupt:
@@ -251,7 +250,7 @@
"""
def __init__(self, generator, replacements, exceptions={},
acceptall=False, allowoverlap=False, recursive=False,
- addedCat=None, sleep=None, exceptinterwiki=False):
+ addedCat=None, sleep=None):
"""
Arguments:
* generator - A generator that yields Page objects.
@@ -292,7 +291,6 @@
self.acceptall = acceptall
self.allowoverlap = allowoverlap
self.recursive = recursive
- self.exceptinterwiki = exceptinterwiki
if addedCat:
site = wikipedia.getSite()
cat_ns = site.category_namespaces()[0]
@@ -339,13 +337,8 @@
for old, new in self.replacements:
if self.sleep != None:
time.sleep(self.sleep)
- if self.exceptinterwiki:
- interwikis = wikipedia.getLanguageLinks(new_text)
- new_text = wikipedia.removeLanguageLinks(new_text)
new_text = wikipedia.replaceExcept(new_text, old, new, exceptions,
allowoverlap=self.allowoverlap)
- if self.exceptinterwiki:
- new_text = wikipedia.replaceLanguageLinks(new_text,interwikis)
return new_text
def run(self):
@@ -498,8 +491,6 @@
allowoverlap = False
# Do not recurse replacement
recursive = False
- #add flag to ignore interwiki links
- exceptinterwiki = False
# This factory is responsible for processing command line arguments
# that are also used by other scripts and that determine on which pages
# to work on.
@@ -551,8 +542,6 @@
sleep = float(arg[7:])
elif arg == '-always':
acceptall = True
- elif arg == '-exceptinterwiki':
- exceptinterwiki = True
elif arg == '-recursive':
recursive = True
elif arg == '-nocase':
@@ -711,7 +700,7 @@
pageNumber=20, lookahead=100)
else:
preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=60)
- bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall, allowoverlap, recursive, add_cat, sleep, exceptinterwiki)
+ bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall, allowoverlap, recursive, add_cat, sleep)
bot.run()
if __name__ == "__main__":
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2008-08-07 00:25:37 UTC (rev 5789)
+++ trunk/pywikipedia/wikipedia.py 2008-08-07 10:10:46 UTC (rev 5790)
@@ -3143,7 +3143,7 @@
# All return the modified text as a unicode object
def replaceExcept(text, old, new, exceptions, caseInsensitive=False,
- allowoverlap=False, marker = ''):
+ allowoverlap=False, marker = '', site = None):
"""
Return text with 'old' replaced by 'new', ignoring specified types of text.
@@ -3169,6 +3169,9 @@
# Hyperlink regex is defined in weblinkchecker.py
import weblinkchecker
+ if site is None:
+ site = getSite()
+
exceptionRegexes = {
'comment': re.compile(r'(?s)<!--.*?-->'),
# section headers
@@ -3203,7 +3206,10 @@
'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
# this matches internal wikilinks, but also interwiki, categories, and
# images.
- 'link': re.compile(r'\[\[(?P<title>[^\]\|]*)(\|[^\]]*)?\]\]')
+ 'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
+ 'interwiki': re.compile(r'(?i)\[\[(%s)\s?:[^\]]*\]\][\s]*'
+ % '|'.join(site.validLanguageLinks() + site.family.obsolete.keys())),
+
}
# if we got a string, compile it as a regular expression