Revision: 6186
Author: russblau
Date: 2008-12-22 17:05:56 +0000 (Mon, 22 Dec 2008)
Log Message:
-----------
Branching replace.py for port to rewrite
Added Paths:
-----------
branches/rewrite/pywikibot/scripts/replace.py
Copied: branches/rewrite/pywikibot/scripts/replace.py (from rev 6185, trunk/pywikipedia/replace.py)
===================================================================
--- branches/rewrite/pywikibot/scripts/replace.py (rev 0)
+++ branches/rewrite/pywikibot/scripts/replace.py 2008-12-22 17:05:56 UTC (rev 6186)
@@ -0,0 +1,710 @@
+# -*- coding: utf-8 -*-
+"""
+This bot will make direct text replacements. It will retrieve information on
+which pages might need changes either from an XML dump or a text file, or only
+change a single page.
+
+These command line parameters can be used to specify which pages to work on:
+
+&params;
+
+-xml Retrieve information from a local XML dump (pages-articles
+ or pages-meta-current, see http://download.wikimedia.org).
+ Argument can also be given as "-xml:filename".
+
+-page Only edit a specific page.
+ Argument can also be given as "-page:pagetitle". You can
+ give this parameter multiple times to edit multiple pages.
+
+Furthermore, the following command line parameters are supported:
+
+-regex Make replacements using regular expressions. If this argument
+ isn't given, the bot will make simple text replacements.
+
+-nocase Use case insensitive regular expressions.
+
+-xmlstart (Only works with -xml) Skip all articles in the XML dump
+ before the one specified (may also be given as
+ -xmlstart:Article).
+
+-addcat:cat_name Adds "cat_name" category to every altered page.
+
+-excepttitle:XYZ Skip pages with titles that contain XYZ. If the -regex
+ argument is given, XYZ will be regarded as a regular
+ expression.
+
+-requiretitle:XYZ Only do pages with titles that contain XYZ. If the -regex
+ argument is given, XYZ will be regarded as a regular
+ expression.
+
+-excepttext:XYZ Skip pages which contain the text XYZ. If the -regex
+ argument is given, XYZ will be regarded as a regular
+ expression.
+
+-exceptinside:XYZ Skip occurrences of the to-be-replaced text which lie
+ within XYZ. If the -regex argument is given, XYZ will be
+ regarded as a regular expression.
+
+-exceptinsidetag:XYZ Skip occurrences of the to-be-replaced text which lie
+ within an XYZ tag.
+
+-summary:XYZ Set the summary message text for the edit to XYZ, bypassing
+ the predefined message texts with original and replacements
+ inserted.
+
+-sleep:123 If you use -fix you can check multiple regex at the same time
+ in every page. This can lead to a great waste of CPU because
+ the bot will check every regex without waiting using all the
+ resources. This will slow it down between a regex and another
+ in order not to waste too much CPU.
+
+-fix:XYZ Perform one of the predefined replacements tasks, which are
+ given in the dictionary 'fixes' defined inside the file
+ fixes.py.
+ The -regex and -nocase argument and given replacements will
+ be ignored if you use -fix.
+ Currently available predefined fixes are:
+&fixes-help;
+
+-namespace:n Number or name of namespace to process. The parameter can be
+ used multiple times. It works in combination with all other
+ parameters, except for the -start parameter. If you e.g.
+ want to iterate over all categories starting at M, use
+ -start:Category:M.
+
+-always Don't prompt you for each replacement
+
+-recursive Recurse replacement as long as possible. Be careful, this
+ might lead to an infinite loop.
+
+-allowoverlap When occurrences of the pattern overlap, replace all of them.
+ Be careful, this might lead to an infinite loop.
+
+other: First argument is the old text, second argument is the new
+ text. If the -regex argument is given, the first argument
+ will be regarded as a regular expression, and the second
+ argument might contain expressions like \\1 or \g<name>.
+
+Examples:
+
+If you want to change templates from the old syntax, e.g. {{msg:Stub}}, to the
+new syntax, e.g. {{Stub}}, download an XML dump file (pages-articles) from
+http://download.wikimedia.org, then use this command:
+
+ python replace.py -xml -regex "{{msg:(.*?)}}" "{{\\1}}"
+
+If you have a dump called foobar.xml and want to fix typos in articles, e.g.
+Errror -> Error, use this:
+
+ python replace.py -xml:foobar.xml "Errror" "Error" -namespace:0
+
+If you have a page called 'John Doe' and want to fix the format of ISBNs, use:
+
+ python replace.py -page:John_Doe -fix:isbn
+
+This command will change 'referer' to 'referrer', but not in pages which
+talk about HTTP, where the typo has become part of the standard:
+
+ python replace.py referer referrer -file:typos.txt -excepttext:HTTP
+"""
+#
+# (C) Daniel Herding & the Pywikipediabot Team, 2004-2008
+#
+# Distributed under the terms of the MIT license.
+#
+
+from __future__ import generators
+import sys, re, time
+import wikipedia, pagegenerators, catlib, config
+import editarticle
+import webbrowser
+
+# Imports predefined replacements tasks from fixes.py
+import fixes
+
# This is required for the text that is shown when you run this script
# with the parameter -help.
docuReplacements = {
    # '&params;' is the standard pagegenerators placeholder; the key had
    # been corrupted by HTML-entity decoding ('&para' rendered as U+00B6).
    '&params;': pagegenerators.parameterHelp,
    '&fixes-help;': fixes.help,
}

__version__ = '$Id$'
+
+# Summary messages in different languages
+# NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes'
+# below.
# Edit summary templates, keyed by language code.  Every entry must contain
# exactly one %s placeholder, which is filled with a description of the
# replacements made; a missing %s would make "template % description" raise
# TypeError ("not all arguments converted") for that language.
msg = {
    'ar': u'%s روبوت : استبدال تلقائي للنص',
    'cs': u'Robot automaticky nahradil text: %s',
    'de': u'Bot: Automatisierte Textersetzung %s',
    'el': u'Ρομπότ: Αυτόματη αντικατάσταση κειμένου %s',
    'en': u'Robot: Automated text replacement %s',
    'es': u'Robot: Reemplazo automático de texto %s',
    'fa': u'ربات: تغییر خودکار متن %s',
    'fr': u'Bot : Remplacement de texte automatisé %s',
    'he': u'בוט: החלפת טקסט אוטומטית %s',
    'hu': u'Robot: Automatikus szövegcsere %s',
    'ia': u'Robot: Reimplaciamento automatic de texto %s',
    'id': u'Bot: Penggantian teks otomatis %s',
    'is': u'Vélmenni: breyti texta %s',
    'it': u'Bot: Sostituzione automatica %s',
    'ja': u'ロボットによる: 文字置き換え %s',
    'ka': u'რობოტი: ტექსტის ავტომატური შეცვლა %s',
    'kk': u'Бот: Мәтінді өздікті алмастырды: %s',
    'ksh': u'Bot: hät outomatesch Täx jetuusch: %s',
    'lt': u'robotas: Automatinis teksto keitimas %s',
    'nds': u'Bot: Text automaatsch utwesselt: %s',
    'nds-nl': u'Bot: autematisch tekse vervungen %s',
    'nl': u'Bot: automatisch tekst vervangen %s',
    'nn': u'robot: automatisk teksterstatning: %s',
    'no': u'robot: automatisk teksterstatning: %s',
    'pl': u'Robot automatycznie zamienia tekst %s',
    # was missing the %s placeholder, which made the summary formatting
    # ("template % description") fail for Russian
    'ru': u'Робот: Автоматизированная замена текста %s',
    'sr': u'Бот: Аутоматска замена текста %s',
    'sv': u'Bot: Automatisk textersättning: %s',
    'zh': u'機器人:執行文字代換作業 %s',
}
+
+
class XmlDumpReplacePageGenerator:
    """
    Iterator that will yield Pages that might contain text to replace.

    These pages will be retrieved from a local XML dump file.
    Arguments:
        * xmlFilename  - The dump's path, either absolute or relative
        * xmlStart     - Skip all articles in the dump before this one
        * replacements - A list of 2-tuples of original text (as a
                         compiled regular expression) and replacement
                         text (as a string).
        * exceptions   - A dictionary which defines when to ignore an
                         occurrence. See docu of the ReplaceRobot
                         constructor below.

    """
    def __init__(self, xmlFilename, xmlStart, replacements, exceptions):
        self.xmlFilename = xmlFilename
        self.replacements = replacements
        self.exceptions = exceptions
        self.xmlStart = xmlStart
        # while True, dump entries are skipped until xmlStart is reached
        self.skipping = bool(xmlStart)

        # 'inside' and 'inside-tags' exceptions apply to individual
        # occurrences rather than whole pages, so collect them once here.
        self.excsInside = []
        if 'inside-tags' in self.exceptions:
            self.excsInside += self.exceptions['inside-tags']
        if 'inside' in self.exceptions:
            self.excsInside += self.exceptions['inside']
        import xmlreader
        self.site = wikipedia.getSite()
        dump = xmlreader.XmlDump(self.xmlFilename)
        self.parser = dump.parse()

    def __iter__(self):
        try:
            for entry in self.parser:
                if self.skipping:
                    if entry.title != self.xmlStart:
                        continue
                    self.skipping = False
                if not self.isTitleExcepted(entry.title) \
                        and not self.isTextExcepted(entry.text):
                    new_text = entry.text
                    for old, new in self.replacements:
                        new_text = wikipedia.replaceExcept(
                            new_text, old, new, self.excsInside, self.site)
                    # only yield pages on which at least one replacement fires
                    if new_text != entry.text:
                        yield wikipedia.Page(self.site, entry.title)
        except KeyboardInterrupt:
            try:
                if not self.skipping:
                    wikipedia.output(
                        u'To resume, use "-xmlstart:%s" on the command line.'
                        % entry.title)
            except NameError:
                # interrupted before the first dump entry was read
                pass

    def isTitleExcepted(self, title):
        """Return True iff one of the exceptions applies to the title."""
        if 'title' in self.exceptions:
            for exc in self.exceptions['title']:
                if exc.search(title):
                    return True
        if 'require-title' in self.exceptions:
            for req in self.exceptions['require-title']:
                if not req.search(title):  # if not all requirements are met:
                    return True
        return False

    def isTextExcepted(self, text):
        """Return True iff one of the exceptions applies to the page text."""
        if 'text-contains' in self.exceptions:
            for exc in self.exceptions['text-contains']:
                if exc.search(text):
                    return True
        return False
+
+
+class ReplaceRobot:
+ """
+ A bot that can do text replacements.
+ """
+ def __init__(self, generator, replacements, exceptions={},
+ acceptall=False, allowoverlap=False, recursive=False,
+ addedCat=None, sleep=None):
+ """
+ Arguments:
+ * generator - A generator that yields Page objects.
+ * replacements - A list of 2-tuples of original text (as a
+ compiled regular expression) and replacement
+ text (as a string).
+ * exceptions - A dictionary which defines when not to change an
+ occurence. See below.
+ * acceptall - If True, the user won't be prompted before changes
+ are made.
+ * allowoverlap - If True, when matches overlap, all of them are
+ replaced.
+ * addedCat - If set to a value, add this category to every page
+ touched.
+
+ Structure of the exceptions dictionary:
+ This dictionary can have these keys:
+
+ title
+ A list of regular expressions. All pages with titles that
+ are matched by one of these regular expressions are skipped.
+ text-contains
+ A list of regular expressions. All pages with text that
+ contains a part which is matched by one of these regular
+ expressions are skipped.
+ inside
+ A list of regular expressions. All occurences are skipped which
+ lie within a text region which is matched by one of these
+ regular expressions.
+ inside-tags
+ A list of strings. These strings must be keys from the
+ exceptionRegexes dictionary in wikipedia.replaceExcept().
+
+ """
+ self.generator = generator
+ self.replacements = replacements
+ self.exceptions = exceptions
+ self.acceptall = acceptall
+ self.allowoverlap = allowoverlap
+ self.recursive = recursive
+ if addedCat:
+ site = wikipedia.getSite()
+ cat_ns = site.category_namespaces()[0]
+ self.addedCat = wikipedia.Page(site,
+ cat_ns + ':' + addedCat)
+ self.sleep = sleep
+
+ def isTitleExcepted(self, title):
+ """
+ Iff one of the exceptions applies for the given title, returns True.
+ """
+ if self.exceptions.has_key('title'):
+ for exc in self.exceptions['title']:
+ if exc.search(title):
+ return True
+ if self.exceptions.has_key('require-title'):
+ for req in self.exceptions['require-title']:
+ if not req.search(title):
+ return True
+ return False
+
+ def isTextExcepted(self, original_text):
+ """
+ Iff one of the exceptions applies for the given page contents,
+ returns True.
+ """
+ if self.exceptions.has_key('text-contains'):
+ for exc in self.exceptions['text-contains']:
+ if exc.search(original_text):
+ return True
+ return False
+
+ def doReplacements(self, original_text):
+ """
+ Returns the text which is generated by applying all replacements to
+ the given text.
+ """
+ new_text = original_text
+ exceptions = []
+ if self.exceptions.has_key('inside-tags'):
+ exceptions += self.exceptions['inside-tags']
+ if self.exceptions.has_key('inside'):
+ exceptions += self.exceptions['inside']
+ for old, new in self.replacements:
+ if self.sleep != None:
+ time.sleep(self.sleep)
+ new_text = wikipedia.replaceExcept(new_text, old, new, exceptions,
+ allowoverlap=self.allowoverlap)
+ return new_text
+
+ def run(self):
+ """
+ Starts the robot.
+ """
+ # Run the generator which will yield Pages which might need to be
+ # changed.
+ for page in self.generator:
+ if self.isTitleExcepted(page.title()):
+ wikipedia.output(
+ u'Skipping %s because the title is on the exceptions list.'
+ % page.aslink())
+ continue
+ try:
+ # Load the page's text from the wiki
+ original_text = page.get(get_redirect=True)
+ if not page.canBeEdited():
+ wikipedia.output(u"You can't edit page %s"
+ % page.aslink())
+ continue
+ except wikipedia.NoPage:
+ wikipedia.output(u'Page %s not found' % page.aslink())
+ continue
+ new_text = original_text
+ while True:
+ if self.isTextExcepted(new_text):
+ wikipedia.output(
+ u'Skipping %s because it contains text that is on the exceptions list.'
+ % page.aslink())
+ break
+ new_text = self.doReplacements(new_text)
+ if new_text == original_text:
+ wikipedia.output('No changes were necessary in %s'
+ % page.aslink())
+ break
+ if self.recursive:
+ newest_text = self.doReplacements(new_text)
+ while (newest_text!=new_text):
+ new_text = newest_text
+ newest_text = self.doReplacements(new_text)
+ if hasattr(self, "addedCat"):
+ cats = page.categories(nofollow_redirects=True)
+ if self.addedCat not in cats:
+ cats.append(self.addedCat)
+ new_text = wikipedia.replaceCategoryLinks(new_text,
+ cats)
+ # Show the title of the page we're working on.
+ # Highlight the title in purple.
+ wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
+ % page.title())
+ wikipedia.showDiff(original_text, new_text)
+ if self.acceptall:
+ break
+ choice = wikipedia.inputChoice(
+ u'Do you want to accept these changes?',
+ ['Yes', 'No', 'Edit', 'open in Browser', 'All', "Quit"],
+ ['y', 'N', 'e', 'b', 'a', 'q'], 'N')
+ if choice == 'e':
+ editor = editarticle.TextEditor()
+ as_edited = editor.edit(original_text)
+ # if user didn't press Cancel
+ if as_edited and as_edited != new_text:
+ new_text = as_edited
+ continue
+ if choice == 'b':
+ webbrowser.open("http://%s%s" % (
+ page.site().hostname(),
+ page.site().nice_get_address(page.title())
+ ))
+ wikipedia.input("Press Enter when finished in browser.")
+ original_text = page.get(get_redirect=True, force=True)
+ new_text = original_text
+ continue
+ if choice == 'q':
+ return
+ if choice == 'a':
+ self.acceptall = True
+ if choice == 'y':
+ page.put_async(new_text)
+ # choice must be 'N'
+ break
+ if self.acceptall and new_text != original_text:
+ try:
+ page.put(new_text)
+ except wikipedia.EditConflict:
+ wikipedia.output(u'Skipping %s because of edit conflict'
+ % (page.title(),))
+ except wikipedia.SpamfilterError, e:
+ wikipedia.output(
+ u'Cannot change %s because of blacklist entry %s'
+ % (page.title(), e.url))
+ except wikipedia.PageNotSaved, error:
+ wikipedia.output(u'Error putting page: %s'
+ % (error.args,))
+ except wikipedia.LockedPage:
+ wikipedia.output(u'Skipping %s (locked page)'
+ % (page.title(),))
+
def prepareRegexForMySQL(pattern):
    """Convert a Python regex pattern into a MySQL RLIKE pattern string.

    Maps the Python shorthand character classes \\s, \\d and \\w onto the
    POSIX bracket expressions MySQL understands, and escapes single quotes
    so the result can be embedded in a single-quoted SQL string literal.
    """
    # raw strings: '\s' etc. are literal backslash sequences, and non-raw
    # '\s' is an invalid escape in modern Python
    pattern = pattern.replace(r'\s', '[:space:]')
    pattern = pattern.replace(r'\d', '[:digit:]')
    pattern = pattern.replace(r'\w', '[:alnum:]')
    # escape single quotes for embedding in a quoted SQL string
    pattern = pattern.replace("'", "\\'")
    return pattern
+
+
def main():
    """Parse command-line arguments, build the replacement list and page
    generator, and run ReplaceRobot over the selected pages.
    """
    add_cat = None
    gen = None
    # summary message; stays None unless -summary: is given on the command line
    summary_commandline = None
    # Array which will collect commandline parameters.
    # First element is original text, second element is replacement text.
    commandline_replacements = []
    # A list of 2-tuples of original text and replacement text.
    replacements = []
    # Don't edit pages which contain certain texts.
    exceptions = {
        'title': [],
        'text-contains': [],
        'inside': [],
        'inside-tags': [],
        'require-title': [],  # using a separate requirements dict needs some
    }                         # major refactoring of code.

    # Should the elements of 'replacements' and 'exceptions' be interpreted
    # as regular expressions?
    regex = False
    # Predefined fixes from dictionary 'fixes' (see above).
    fix = None
    # the dump's path, either absolute or relative, which will be used
    # if -xml flag is present
    xmlFilename = None
    useSql = False
    PageTitles = []
    # will become True when the user presses a ('yes to all') or uses the
    # -always flag.
    acceptall = False
    # Will become True if the user inputs the commandline parameter -nocase
    caseInsensitive = False
    # Which namespaces should be processed?
    # default to [] which means all namespaces will be processed
    namespaces = []
    # Do all hits when they overlap
    allowoverlap = False
    # Do not recurse replacement
    recursive = False
    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()
    # Load default summary message.
    # BUG WARNING: This is probably incompatible with the -lang parameter.
    wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg))
    # Between a regex and another (using -fix) sleep some time (not to waste
    # too much CPU)
    sleep = None

    # Read commandline parameters.
    for arg in wikipedia.handleArgs():
        if arg == '-regex':
            regex = True
        # NOTE: '-xmlstart' must be tested before '-xml', because
        # arg.startswith('-xml') would also match '-xmlstart...'.
        elif arg.startswith('-xmlstart'):
            if len(arg) == 9:
                xmlStart = wikipedia.input(
                    u'Please enter the dumped article to start with:')
            else:
                xmlStart = arg[10:]
        elif arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = wikipedia.input(
                    u'Please enter the XML dump\'s filename:')
            else:
                xmlFilename = arg[5:]
        elif arg =='-sql':
            useSql = True
        elif arg.startswith('-page'):
            if len(arg) == 5:
                PageTitles.append(wikipedia.input(
                    u'Which page do you want to change?'))
            else:
                PageTitles.append(arg[6:])
        elif arg.startswith('-excepttitle:'):
            exceptions['title'].append(arg[13:])
        elif arg.startswith('-requiretitle:'):
            exceptions['require-title'].append(arg[14:])
        elif arg.startswith('-excepttext:'):
            exceptions['text-contains'].append(arg[12:])
        elif arg.startswith('-exceptinside:'):
            exceptions['inside'].append(arg[14:])
        elif arg.startswith('-exceptinsidetag:'):
            exceptions['inside-tags'].append(arg[17:])
        elif arg.startswith('-fix:'):
            fix = arg[5:]
        elif arg.startswith('-sleep:'):
            sleep = float(arg[7:])
        elif arg == '-always':
            acceptall = True
        elif arg == '-recursive':
            recursive = True
        elif arg == '-nocase':
            caseInsensitive = True
        elif arg.startswith('-addcat:'):
            add_cat = arg[8:]
        elif arg.startswith('-namespace:'):
            # accept both numeric namespace ids and namespace names
            try:
                namespaces.append(int(arg[11:]))
            except ValueError:
                namespaces.append(arg[11:])
        elif arg.startswith('-summary:'):
            wikipedia.setAction(arg[9:])
            summary_commandline = True
        elif arg.startswith('-allowoverlap'):
            allowoverlap = True
        else:
            # anything not recognized above is either a pagegenerators
            # argument (handled by the factory) or part of an old/new
            # replacement text pair
            generator = genFactory.handleArg(arg)
            if generator:
                gen = generator
            else:
                commandline_replacements.append(arg)

    if (len(commandline_replacements) % 2):
        raise wikipedia.Error, 'require even number of replacements.'
    elif (len(commandline_replacements) == 2 and fix == None):
        replacements.append((commandline_replacements[0],
                             commandline_replacements[1]))
        if summary_commandline == None:
            wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg )
                                % (' (-' + commandline_replacements[0] + ' +'
                                   + commandline_replacements[1] + ')'))
    elif (len(commandline_replacements) > 1):
        if (fix == None):
            # multiple old/new pairs were given on the command line
            for i in xrange (0, len(commandline_replacements), 2):
                replacements.append((commandline_replacements[i],
                                     commandline_replacements[i + 1]))
            if summary_commandline == None:
                pairs = [( commandline_replacements[i],
                           commandline_replacements[i + 1] )
                         for i in range(0, len(commandline_replacements), 2)]
                replacementsDescription = '(%s)' % ', '.join(
                    [('-' + pair[0] + ' +' + pair[1]) for pair in pairs])
                wikipedia.setAction(
                    wikipedia.translate(wikipedia.getSite(), msg )
                    % replacementsDescription)
        else:
            raise wikipedia.Error(
                'Specifying -fix with replacements is undefined')
    elif fix == None:
        # no replacements given at all: ask the user interactively
        old = wikipedia.input(u'Please enter the text that should be replaced:')
        new = wikipedia.input(u'Please enter the new text:')
        change = '(-' + old + ' +' + new
        replacements.append((old, new))
        while True:
            old = wikipedia.input(
u'Please enter another text that should be replaced, or press Enter to start:')
            if old == '':
                change = change + ')'
                break
            new = wikipedia.input(u'Please enter the new text:')
            change = change + ' & -' + old + ' +' + new
            replacements.append((old, new))
        if not summary_commandline == True:
            default_summary_message = wikipedia.translate(wikipedia.getSite(), msg) % change
            wikipedia.output(u'The summary message will default to: %s'
                             % default_summary_message)
            summary_message = wikipedia.input(
u'Press Enter to use this default message, or enter a description of the\nchanges your bot will make:')
            if summary_message == '':
                summary_message = default_summary_message
            wikipedia.setAction(summary_message)

    else:
        # Perform one of the predefined actions.
        try:
            fix = fixes.fixes[fix]
        except KeyError:
            wikipedia.output(u'Available predefined fixes are: %s'
                             % fixes.fixes.keys())
            return
        if fix.has_key('regex'):
            regex = fix['regex']
        if fix.has_key('msg'):
            wikipedia.setAction(
                wikipedia.translate(wikipedia.getSite(), fix['msg']))
        if fix.has_key('exceptions'):
            exceptions = fix['exceptions']
        replacements = fix['replacements']

    # already compile all regular expressions here to save time later
    for i in range(len(replacements)):
        old, new = replacements[i]
        if not regex:
            old = re.escape(old)
        if caseInsensitive:
            oldR = re.compile(old, re.UNICODE | re.IGNORECASE)
        else:
            oldR = re.compile(old, re.UNICODE)
        replacements[i] = oldR, new

    # compile the page-level and occurrence-level exception patterns too
    for exceptionCategory in ['title', 'require-title', 'text-contains', 'inside']:
        if exceptions.has_key(exceptionCategory):
            patterns = exceptions[exceptionCategory]
            if not regex:
                patterns = [re.escape(pattern) for pattern in patterns]
            if caseInsensitive:
                patterns = [re.compile(pattern, re.UNICODE | re.IGNORECASE)
                            for pattern in patterns]
            else:
                patterns = [re.compile(pattern, re.UNICODE)
                            for pattern in patterns]
            exceptions[exceptionCategory] = patterns

    if xmlFilename:
        # xmlStart is only bound if -xmlstart was parsed above; default to
        # None so the generator starts at the beginning of the dump
        try:
            xmlStart
        except NameError:
            xmlStart = None
        gen = XmlDumpReplacePageGenerator(xmlFilename, xmlStart,
                                          replacements, exceptions)
    elif useSql:
        whereClause = 'WHERE (%s)' % ' OR '.join(
            ["old_text RLIKE '%s'" % prepareRegexForMySQL(old.pattern)
             for (old, new) in replacements])
        # NOTE(review): 'exceptions' is a dict here, so this iterates its
        # keys (plain strings), which have no .pattern attribute -- using
        # -sql together with exception patterns looks broken; confirm
        # before relying on it.
        if exceptions:
            exceptClause = 'AND NOT (%s)' % ' OR '.join(
                ["old_text RLIKE '%s'" % prepareRegexForMySQL(exc.pattern)
                 for exc in exceptions])
        else:
            exceptClause = ''
        query = u"""
SELECT page_namespace, page_title
FROM page
JOIN text ON (page_id = old_id)
%s
%s
LIMIT 200""" % (whereClause, exceptClause)
        gen = pagegenerators.MySQLPageGenerator(query)

    elif PageTitles:
        pages = [wikipedia.Page(wikipedia.getSite(), PageTitle)
                 for PageTitle in PageTitles]
        gen = iter(pages)

    if not gen:
        # syntax error, show help text from the top of this file
        wikipedia.showHelp('replace')
        return
    if namespaces != []:
        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
    if xmlFilename:
        # XML parsing can be quite slow, so use smaller batches and
        # longer lookahead.
        preloadingGen = pagegenerators.PreloadingGenerator(gen,
                                                           pageNumber=20, lookahead=100)
    else:
        preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=60)
    bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall, allowoverlap, recursive, add_cat, sleep)
    bot.run()
+
if __name__ == "__main__":
    try:
        main()
    finally:
        # always release the throttle/lock state, even if main() raises
        wikipedia.stopme()
Revision: 6185
Author: russblau
Date: 2008-12-22 17:04:08 +0000 (Mon, 22 Dec 2008)
Log Message:
-----------
documentation cleanup and logger ids
Modified Paths:
--------------
branches/rewrite/pywikibot/README-conversion.txt
branches/rewrite/pywikibot/bot.py
branches/rewrite/pywikibot/comms/http.py
branches/rewrite/pywikibot/comms/threadedhttp.py
branches/rewrite/pywikibot/data/api.py
branches/rewrite/pywikibot/family.py
branches/rewrite/pywikibot/login.py
branches/rewrite/pywikibot/page.py
branches/rewrite/pywikibot/site.py
Modified: branches/rewrite/pywikibot/README-conversion.txt
===================================================================
--- branches/rewrite/pywikibot/README-conversion.txt 2008-12-22 15:42:59 UTC (rev 6184)
+++ branches/rewrite/pywikibot/README-conversion.txt 2008-12-22 17:04:08 UTC (rev 6185)
@@ -2,14 +2,17 @@
Pywikipediabot framework to version 2.
Most importantly, note that the version 2 framework *only* supports wikis
-using MediaWiki v.1.12 or higher software. If you need to access a wiki that
+using MediaWiki v.1.14 or higher software. If you need to access a wiki that
uses older software, you should continue using version 1 for this purpose.
-The "root" namespace used in the project has changed from "wikipedia"
+The root namespace used in the project has changed from "wikipedia"
to "pywikibot". References to wikipedia need to be changed globally to
pywikibot. Unless noted in this document, other names have not changed; for
example, wikipedia.Page can be replaced by pywikibot.Page throughout any
-bot.
+bot. An effort has been made to design the interface to be as backwards-
+compatible as possible, so that in most cases it should be possible to convert
+scripts to the new interface simply by changing import statements and doing
+global search-and-replace on module names, as discussed in this document.
With pywikipedia scripts were importing "wikipedia" or "pagegenerators"
libraries; pywikibot is now written as a standard package, and other modules
@@ -17,24 +20,36 @@
most commonly-used names are imported into the pywikibot namespace, so that
module names don't need to be used unless specified in the documentation.
-(To use it, just import "pywikibot", assuming that pywikibot/ is in sys.path)
+Make sure that the directory that contains the "pywikibot" subdirectory (or
+folder) is in sys.path.
+The following changes, at a minimum, need to be made to allow scripts to run:
+
+ change "import wikipedia" to "import pywikibot"
+ change "import pagegenerators" to "from pywikibot import pagegenerators"
+ change "import config" to "from pywikibot import config"
+ change "import catlib" to "from pywikibot import catlib"
+ change "wikipedia." to "pywikibot."
+
== Python libraries ==
[Note: the goal will be to package pywikibot with setuptools easy_install,
so that these dependencies will be loaded automatically when the package is
installed, and users won't need to worry about this...]
-To run pywikibot, you will need the httplib2, simplejson, and setuptools packages--
+To run pywikibot, you will need the httplib2, simplejson, and setuptools
+packages--
* httplib2 : http://code.google.com/p/httplib2/
* setuptools : http://pypi.python.org/pypi/setuptools/
* simplejson : http://svn.red-bean.com/bob/simplejson/tags/simplejson-1.7.1/docs/index.html
-or, if you already have setuptools installed, just execute 'easy_install httplib2'
-and 'easy_install simplejson'
+or, if you already have setuptools installed, just execute
+'easy_install httplib2' and 'easy_install simplejson'
-If you run into errors involving httplib2.urlnorm, update httplib2 to
-0.4.0 (Ubuntu package python-httlib2 for example, is outdated)
+If you run into errors involving httplib2.urlnorm, update httplib2 to 0.4.0
+(Ubuntu package python-httplib2, for example, is outdated). Note that
+httplib2 will run under Python 2.6, but will emit DeprecationWarnings (which
+are annoying but don't affect the ability to use the package).
== Page objects ==
Modified: branches/rewrite/pywikibot/bot.py
===================================================================
--- branches/rewrite/pywikibot/bot.py 2008-12-22 15:42:59 UTC (rev 6184)
+++ branches/rewrite/pywikibot/bot.py 2008-12-22 17:04:08 UTC (rev 6185)
@@ -176,8 +176,10 @@
# ERROR - user error messages
# CRITICAL - fatal error messages
# Accordingly, do ''not'' use print statements in bot code; instead,
- # send output to the pywikibot.output() function which will route it
- # to the logging module.
+ # send output to logging.log(level, text) or one of its equivalents.
+ # For backwards-compatibility, pywikibot.output is supported, which
+ # directs output to logging.info() or other levels as appropriate, but
+ # its use in new code is deprecated.
logging.addLevelName(VERBOSE, "VERBOSE")
# for messages to be displayed on terminal at "verbose" setting
@@ -213,7 +215,7 @@
else:
file_handler.setLevel(VERBOSE)
form = logging.Formatter(
- fmt="%(asctime)s %(filename)-18s:%(lineno)-4d "
+ fmt="%(asctime)s %(filename)18s, %(lineno)d: "
"%(levelname)-8s %(message)s",
datefmt="%Y-%m-%d %H:%M:%S"
)
Modified: branches/rewrite/pywikibot/comms/http.py
===================================================================
--- branches/rewrite/pywikibot/comms/http.py 2008-12-22 15:42:59 UTC (rev 6184)
+++ branches/rewrite/pywikibot/comms/http.py 2008-12-22 17:04:08 UTC (rev 6185)
@@ -31,7 +31,7 @@
import cookielib
import threadedhttp
-logger = logging.getLogger("comm")
+logger = logging.getLogger("comms.http")
# global variables
Modified: branches/rewrite/pywikibot/comms/threadedhttp.py
===================================================================
--- branches/rewrite/pywikibot/comms/threadedhttp.py 2008-12-22 15:42:59 UTC (rev 6184)
+++ branches/rewrite/pywikibot/comms/threadedhttp.py 2008-12-22 17:04:08 UTC (rev 6185)
@@ -30,7 +30,7 @@
import cookielib
import sys
-logger = logging.getLogger("comm")
+logger = logging.getLogger("comms.threadedhttp")
# easy_install safeguarded dependencies
Modified: branches/rewrite/pywikibot/data/api.py
===================================================================
--- branches/rewrite/pywikibot/data/api.py 2008-12-22 15:42:59 UTC (rev 6184)
+++ branches/rewrite/pywikibot/data/api.py 2008-12-22 17:04:08 UTC (rev 6185)
@@ -23,7 +23,7 @@
from pywikibot import login
from pywikibot.exceptions import *
-logger = logging.getLogger()
+logger = logging.getLogger("data.api")
lagpattern = re.compile(r"Waiting for [\d.]+: (?P<lag>\d+) seconds? lagged")
Modified: branches/rewrite/pywikibot/family.py
===================================================================
--- branches/rewrite/pywikibot/family.py 2008-12-22 15:42:59 UTC (rev 6184)
+++ branches/rewrite/pywikibot/family.py 2008-12-22 17:04:08 UTC (rev 6185)
@@ -9,7 +9,7 @@
from datetime import datetime, timedelta
-logger = logging.getLogger("wiki")
+logger = logging.getLogger("wiki.family")
# Parent class for all wiki families
Modified: branches/rewrite/pywikibot/login.py
===================================================================
--- branches/rewrite/pywikibot/login.py 2008-12-22 15:42:59 UTC (rev 6184)
+++ branches/rewrite/pywikibot/login.py 2008-12-22 17:04:08 UTC (rev 6185)
@@ -50,7 +50,7 @@
from pywikibot import config
from pywikibot.exceptions import *
-logger = logging.getLogger()
+logger = logging.getLogger("wiki.login")
# On some wikis you are only allowed to run a bot if there is a link to
Modified: branches/rewrite/pywikibot/page.py
===================================================================
--- branches/rewrite/pywikibot/page.py 2008-12-22 15:42:59 UTC (rev 6184)
+++ branches/rewrite/pywikibot/page.py 2008-12-22 17:04:08 UTC (rev 6185)
@@ -23,7 +23,7 @@
import unicodedata
import urllib
-logger = logging.getLogger("wiki")
+logger = logging.getLogger("wiki.page")
reNamespace = re.compile("^(.+?) *: *(.*)$")
Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py 2008-12-22 15:42:59 UTC (rev 6184)
+++ branches/rewrite/pywikibot/site.py 2008-12-22 17:04:08 UTC (rev 6185)
@@ -28,7 +28,7 @@
import threading
import urllib
-logger = logging.getLogger("wiki")
+logger = logging.getLogger("wiki.site")
class PageInUse(pywikibot.Error):
"""Page cannot be reserved for writing due to existing lock."""
Revision: 6184
Author: russblau
Date: 2008-12-22 15:42:59 +0000 (Mon, 22 Dec 2008)
Log Message:
-----------
Merge purodha's recent changes from trunk, fix long lines and spacing
Modified Paths:
--------------
branches/rewrite/pywikibot/textlib.py
Property Changed:
----------------
branches/rewrite/pywikibot/textlib.py
Modified: branches/rewrite/pywikibot/textlib.py
===================================================================
--- branches/rewrite/pywikibot/textlib.py 2008-12-22 15:42:33 UTC (rev 6183)
+++ branches/rewrite/pywikibot/textlib.py 2008-12-22 15:42:59 UTC (rev 6184)
@@ -93,7 +93,9 @@
# images.
'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
'interwiki': re.compile(r'(?i)\[\[(%s)\s?:[^\]]*\]\][\s]*'
- % '|'.join(site.validLanguageLinks() + site.family.obsolete.keys())),
+ % '|'.join(site.validLanguageLinks()
+ + site.family.obsolete.keys())
+ ),
}
@@ -132,25 +134,28 @@
excMatch.start() < nextExceptionMatch.start()):
nextExceptionMatch = excMatch
- if nextExceptionMatch is not None and nextExceptionMatch.start() <= match.start():
- # an HTML comment or text in nowiki tags stands before the next valid match. Skip.
+ if nextExceptionMatch is not None \
+ and nextExceptionMatch.start() <= match.start():
+ # an HTML comment or text in nowiki tags stands before the next
+ # valid match. Skip.
index = nextExceptionMatch.end()
else:
# We found a valid match. Replace it.
if callable(new):
- # the parameter new can be a function which takes the match as a parameter.
+ # the parameter new can be a function which takes the match
+ # as a parameter.
replacement = new(match)
else:
# it is not a function, but a string.
- # it is a little hack to make \n work. It would be better to fix it
- # previously, but better than nothing.
+ # it is a little hack to make \n work. It would be better
+ # to fix it previously, but better than nothing.
new = new.replace('\\n', '\n')
# We cannot just insert the new string, as it may contain regex
# group references such as \2 or \g<name>.
- # On the other hand, this approach does not work because it can't
- # handle lookahead or lookbehind (see bug #1731008):
+ # On the other hand, this approach does not work because it
+ # can't handle lookahead or lookbehind (see bug #1731008):
#replacement = old.sub(new, text[match.start():match.end()])
#text = text[:match.start()] + replacement + text[match.end():]
@@ -162,8 +167,11 @@
groupMatch = groupR.search(replacement)
if not groupMatch:
break
- groupID = groupMatch.group('name') or int(groupMatch.group('number'))
- replacement = replacement[:groupMatch.start()] + match.group(groupID) + replacement[groupMatch.end():]
+ groupID = (groupMatch.group('name')
+ or int(groupMatch.group('number')))
+ replacement = (replacement[:groupMatch.start()]
+ + match.group(groupID)
+ + replacement[groupMatch.end():])
text = text[:match.start()] + replacement + text[match.end():]
# continue the search on the remaining text
@@ -210,14 +218,45 @@
For the tags parameter, see removeDisabledParts() above.
"""
# Find a marker that is not already in the text.
- marker = '@@'
- while marker in text:
- marker += '@'
+ marker = findmarker(text, '@@', '@')
text = text[:index] + marker + text[index:]
text = removeDisabledParts(text, tags)
return (marker not in text)
+def findmarker(text, startwith = u'@', append = u'@'):
+ # find a string which is not part of text
+ if len(append) <= 0:
+ append = u'@'
+ mymarker = startwith
+ while mymarker in text:
+ mymarker += append
+ return mymarker
+
+
+def expandmarker(text, marker = '', separator = ''):
+ # set to remove any number of separator occurrences plus arbitrary
+ # whitespace before, after, and between them,
+ # by allowing to include them into marker.
+ if separator:
+ firstinmarker = text.find(marker)
+ firstinseparator = firstinmarker
+ lenseparator = len(separator)
+ striploopcontinue = True
+ while firstinseparator > 0 and striploopcontinue:
+ striploopcontinue = False
+ if ( (firstinseparator >= lenseparator) and
+ (separator ==
+ text[firstinseparator-lenseparator:firstinseparator])):
+ firstinseparator -= lenseparator
+ striploopcontinue = True
+ elif text[firstinseparator-1] < ' ':
+ firstinseparator -= 1
+ striploopcontinue = True
+ marker = text[firstinseparator:firstinmarker] + marker
+ return marker
+
+
# Functions dealing with interwiki language links
# Note - MediaWiki supports two kinds of interwiki links; interlanguage and
@@ -289,11 +328,32 @@
interwikiR = re.compile(r'\[\[(%s)\s?:[^\]]*\]\][\s]*'
% languages, re.IGNORECASE)
text = replaceExcept(text, interwikiR, '',
- ['nowiki', 'comment', 'math', 'pre', 'source'], marker=marker)
+ ['nowiki', 'comment', 'math', 'pre', 'source'],
+ marker=marker)
return text.strip()
-def replaceLanguageLinks(oldtext, new, site = None):
+def removeLanguageLinksAndSeparator(text, site = None, marker = '', separator = ''):
+ """
+    Return text with all interlanguage links, plus any preceding whitespace
+    and separator occurrences removed.
+
+ If a link to an unknown language is encountered, a warning is printed.
+ If a marker is defined, that string is placed at the location of the
+    last occurrence of an interwiki link (at the end if there are no
+    interwiki links).
+
+ """
+ if separator:
+ mymarker = findmarker(text, u'@L@')
+ newtext = removeLanguageLinks(text, site, mymarker)
+ mymarker = expandmarker(newtext, mymarker, separator)
+ return newtext.replace(mymarker, marker)
+ else:
+ return removeLanguageLinks(text, site, marker)
+
+
+def replaceLanguageLinks(oldtext, new, site = None, addOnly = False):
"""Replace interlanguage links in the text with a new set of links.
'new' should be a dict with the Site objects as keys, and Page objects
@@ -302,31 +362,45 @@
"""
# Find a marker that is not already in the text.
- marker = '@@'
- while marker in oldtext:
- marker += '@'
+ marker = findmarker( oldtext, u'@@')
if site == None:
site = pywikibot.getSite()
+ separator = site.family.interwiki_text_separator
+ cseparator = site.family.category_text_separator
+ separatorstripped = separator.strip()
+ cseparatorstripped = cseparator.strip()
+ if addOnly:
+ s2 = oldtext
+ else:
+ s2 = removeLanguageLinksAndSeparator(oldtext, site=site, marker=marker,
+ separator=separatorstripped)
s = interwikiFormat(new, insite = site)
- s2 = removeLanguageLinks(oldtext, site = site, marker = marker)
if s:
separator = site.family.interwiki_text_separator
if site.language() in site.family.interwiki_attop:
newtext = s + separator + s2.replace(marker,'').strip()
else:
# calculate what was after the language links on the page
- firstafter = s2.find(marker) + len(marker)
+ firstafter = s2.find(marker)
+ if firstafter < 0:
+ firstafter = len(s2)
+ else:
+ firstafter += len(marker)
# Any text in 'after' part that means we should keep it after?
if "</noinclude>" in s2[firstafter:]:
- newtext = s2[:firstafter] + s + s2[firstafter:]
+ if separatorstripped:
+ s = separator + s
+ newtext = s2[:firstafter].replace(marker,'') + s \
+ + s2[firstafter:]
elif site.language() in site.family.categories_last:
cats = getCategoryLinks(s2, site = site)
- s2 = removeCategoryLinks(s2.replace(marker,'').strip(),
- site) + separator + s
- newtext = replaceCategoryLinks(s2, cats, site=site)
+ s2 = removeCategoryLinksAndSeparator(
+ s2.replace(marker, '', cseparatorstripped).strip(),
+ site) + separator + s
+ newtext = replaceCategoryLinks(s2, cats, site=site,
+ addOnly=True)
else:
newtext = s2.replace(marker,'').strip() + separator + s
- newtext = newtext.replace(marker,'')
else:
newtext = s2.replace(marker,'')
return newtext
@@ -385,7 +459,8 @@
del sites[sites.index(site)]
firstsites = firstsites + [site]
sites = firstsites + sites
- if insite.interwiki_putfirst_doubled(sites): #some implementations return False
+ if insite.interwiki_putfirst_doubled(sites):
+ #some (all?) implementations return False
sites = insite.interwiki_putfirst_doubled(sites) + sites
return sites
@@ -420,7 +495,7 @@
"""Return text with all category links removed.
Put the string marker after the last replacement (at the end of the text
- if there is no replacement).
+ if there is no replacement).
"""
# This regular expression will find every link that is possibly an
@@ -429,13 +504,34 @@
# ASCII letters and hyphens.
catNamespace = '|'.join(site.category_namespaces())
categoryR = re.compile(r'\[\[\s*(%s)\s*:.*?\]\]\s*' % catNamespace, re.I)
- text = replaceExcept(text, categoryR, '', ['nowiki', 'comment', 'math', 'pre', 'source'], marker = marker)
+ text = replaceExcept(text, categoryR, '',
+ ['nowiki', 'comment', 'math', 'pre', 'source'],
+ marker=marker)
if marker:
#avoid having multiple linefeeds at the end of the text
- text = re.sub('\s*%s' % re.escape(marker), '\r\n' + marker, text.strip())
+ text = re.sub('\s*%s' % re.escape(marker), '\r\n' + marker,
+ text.strip())
return text.strip()
+def removeCategoryLinksAndSeparator(text, site=None, marker='', separator=''):
+ """
+    Return text with all category links, plus any preceding whitespace
+    and separator occurrences removed.
+
+ Put the string marker after the last replacement (at the end of the text
+ if there is no replacement).
+
+ """
+ if separator:
+ mymarker = findmarker(text, u'@C@')
+ newtext = removeCategoryLinks(text, site, mymarker)
+ mymarker = expandmarker(newtext, mymarker, separator)
+ return newtext.replace(mymarker, marker)
+ else:
+ return removeCategoryLinks(text, site, marker)
+
+
def replaceCategoryInPlace(oldtext, oldcat, newcat, site=None):
"""Replace the category oldcat with the category newcat and return
the modified text.
@@ -453,7 +549,7 @@
# title might not be capitalized correctly on the wiki
if title[0].isalpha() and not site.nocapitalize:
title = "[%s%s]" % (title[0].upper(), title[0].lower()) + title[1:]
- # spaces and underscores in page titles are interchangeable, and collapsible
+ # spaces and underscores in page titles are interchangeable and collapsible
title = title.replace(r"\ ", "[ _]+").replace(r"\_", "[ _]+")
categoryR = re.compile(r'\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^]]+)?\]\])'
% (catNamespace, title), re.I)
@@ -470,31 +566,34 @@
def replaceCategoryLinks(oldtext, new, site = None, addOnly = False):
"""Replace the category links given in the wikitext given
- in oldtext by the new links given in new.
+ in oldtext by the new links given in new.
- 'new' should be a list of Category objects.
+ 'new' should be a list of Category objects.
- If addOnly is True, the old category won't be deleted and
- the category(s) given will be added
- (and so they won't replace anything).
+    If addOnly is True, the old category won't be deleted and the
+    category(s) given will be added (and so they won't replace anything).
+
"""
-
# Find a marker that is not already in the text.
- marker = '@@'
- while marker in oldtext:
- marker += '@'
-
+ marker = findmarker( oldtext, u'@@')
if site is None:
site = pywikibot.getSite()
if site.sitename() == 'wikipedia:de' and "{{Personendaten" in oldtext:
- raise Error('The PyWikipediaBot is no longer allowed to touch categories on the German Wikipedia on pages that contain the person data template because of the non-standard placement of that template. See http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/bis_2006…')
-
- s = categoryFormat(new, insite = site)
+ raise Error("""\
+The PyWikipediaBot is no longer allowed to touch categories on the German
+Wikipedia on pages that contain the Personendaten template because of the
+non-standard placement of that template.
+See http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/bis_2006…""")
+ separator = site.family.category_text_separator
+ iseparator = site.family.interwiki_text_separator
+ separatorstripped = separator.strip()
+ iseparatorstripped = iseparator.strip()
if addOnly:
s2 = oldtext
else:
- s2 = removeCategoryLinks(oldtext, site = site, marker = marker)
-
+ s2 = removeCategoryLinksAndSeparator(oldtext, site=site, marker=marker,
+ separator=separatorstripped)
+ s = categoryFormat(new, insite = site)
if s:
separator = site.family.category_text_separator
if site.language() in site.family.category_attop:
@@ -502,20 +601,28 @@
else:
# calculate what was after the categories links on the page
firstafter = s2.find(marker)
- # Any text in 'after' part that means we should keep it after?
+ if firstafter < 0:
+ firstafter = len(s2)
+ else:
+ firstafter += len(marker)
+ # Is there text in the 'after' part that means we should keep it
+ # after?
if "</noinclude>" in s2[firstafter:]:
- newtext = s2[:firstafter] + s + s2[firstafter:]
+ if separatorstripped:
+ s = separator + s
+ newtext = (s2[:firstafter].replace(marker,'') + s
+ + s2[firstafter:])
elif site.language() in site.family.categories_last:
newtext = s2.replace(marker,'').strip() + separator + s
else:
interwiki = getLanguageLinks(s2)
- s2 = removeLanguageLinks(s2.replace(marker,''), site
- ) + separator + s
- newtext = replaceLanguageLinks(s2, interwiki, site)
- newtext = newtext.replace(marker,'')
+ s2 = removeLanguageLinksAndSeparator(
+ s2.replace(marker,''), site, '', iseparatorstripped
+ ) + separator + s
+ newtext = replaceLanguageLinks(s2, interwiki, site=site,
+ addOnly=True)
else:
- s2 = s2.replace(marker,'')
- return s2
+ newtext = s2.replace(marker,'')
return newtext.strip()
@@ -559,7 +666,9 @@
# not allowed inside links. For example, in this wiki text:
# ''Please see http://www.example.org.''
# .'' shouldn't be considered as part of the link.
- regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd + '](?=[' + notAtEnd+ ']*\'\')|http[s]?://[^' + notInside + ']*[^' + notAtEnd + '])'
+ regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd \
+ + '](?=[' + notAtEnd+ ']*\'\')|http[s]?://[^' + notInside \
+ + ']*[^' + notAtEnd + '])'
if withoutBracketed:
regex = r'(?<!\[)' + regex
@@ -568,6 +677,7 @@
linkR = re.compile(regex)
return linkR
+
def extract_templates_and_params(text, get_redirect=False):
"""Return list of template calls found in text.
Property changes on: branches/rewrite/pywikibot/textlib.py
___________________________________________________________________
Added: svn:mergeinfo
+ /trunk/pywikipedia/wikipedia.py:6154-6179
Revision: 6183
Author: russblau
Date: 2008-12-22 15:42:33 +0000 (Mon, 22 Dec 2008)
Log Message:
-----------
nocapitalize implemented
Modified Paths:
--------------
branches/rewrite/pywikibot/site.py
Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py 2008-12-22 15:29:51 UTC (rev 6182)
+++ branches/rewrite/pywikibot/site.py 2008-12-22 15:42:33 UTC (rev 6183)
@@ -608,10 +608,10 @@
}
self.sitelock = threading.Lock()
self._msgcache = {}
+ self.nocapitalize = self.code in self.family.nocapitalize
return
# ANYTHING BELOW THIS POINT IS NOT YET IMPLEMENTED IN __init__()
- self.nocapitalize = self.__code in self.family.nocapitalize
# Calculating valid languages took quite long, so we calculate it once
# in initialization instead of each time it is used.
self._validlanguages = []