Revision: 4235
Author: wikipedian
Date: 2007-09-11 12:57:06 +0000 (Tue, 11 Sep 2007)
Log Message:
-----------
replaceExcept(): handle nested templates (up to level 2)
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-09-11 12:47:08 UTC (rev 4234)
+++ trunk/pywikipedia/wikipedia.py 2007-09-11 12:57:06 UTC (rev 4235)
@@ -2574,10 +2574,15 @@
'startspace': re.compile(r'(?m)^ (.*?)$'),
# tables often have whitespace that is used to improve wiki
# source code readability.
+ # TODO: handle nested tables.
'table': re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
# templates with parameters often have whitespace that is used to
# improve wiki source code readability.
- 'template': re.compile(r'(?s){{.*?}}'),
+ # 'template': re.compile(r'(?s){{.*?}}'),
+ # The regex above fails on nested templates. This regex can handle
+ # templates inside templates, but no deeper cascades.
+ 'template': re.compile(r'(?s){{(({{.*?}})?.*?)*}}'),
+
}
# if we got a string, compile it as a regular expression
Revision: 4233
Author: wikipedian
Date: 2007-09-11 12:38:49 +0000 (Tue, 11 Sep 2007)
Log Message:
-----------
skip existing references sections that are commented out
created wikipedia.isDisabled() to find out if a certain part of a page is commented out
Modified Paths:
--------------
trunk/pywikipedia/noreferences.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/noreferences.py
===================================================================
--- trunk/pywikipedia/noreferences.py 2007-09-11 12:29:26 UTC (rev 4232)
+++ trunk/pywikipedia/noreferences.py 2007-09-11 12:38:49 UTC (rev 4233)
@@ -171,10 +171,13 @@
sectionR = re.compile(r'\r\n=+ *%s *=+\r\n' % section)
match = sectionR.search(oldText)
if match:
- wikipedia.output(u'Adding references tag to existing %s section...\n' % section)
- newText = oldText[:match.end()] + u'\n<references/>\n' + oldText[match.end():]
- self.save(page, newText)
- return
+ if wikipedia.isDisabled(oldText, match.start()):
+ wikipedia.output('Existing %s section is commented out, skipping.' % section)
+ else:
+ wikipedia.output(u'Adding references tag to existing %s section...\n' % section)
+ newText = oldText[:match.end()] + u'\n<references/>\n' + oldText[match.end():]
+ self.save(page, newText)
+ return
# Create a new section for the references tag
for section in wikipedia.translate(wikipedia.getSite(), placeBeforeSections):
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-09-11 12:29:26 UTC (rev 4232)
+++ trunk/pywikipedia/wikipedia.py 2007-09-11 12:38:49 UTC (rev 4233)
@@ -2642,7 +2642,7 @@
text = text[:markerpos] + marker + text[markerpos:]
return text
-def removeDisabledParts(text, parts=['*']):
+def removeDisabledParts(text, tags = ['*']):
"""
Removes those parts of a wiki text where wiki markup is disabled, i.e.
* HTML comments
@@ -2659,11 +2659,26 @@
'nowiki': r'<nowiki>.*?</nowiki>',
'pre': r'<pre>.*?</pre>',
}
- if '*' in parts:
- parts = regexes.keys()
- toRemoveR = re.compile('|'.join([regexes[p] for p in parts]), re.IGNORECASE | re.DOTALL)
+ if '*' in tags:
+ tags = regexes.keys()
+ toRemoveR = re.compile('|'.join([regexes[tag] for tag in tags]), re.IGNORECASE | re.DOTALL)
return toRemoveR.sub('', text)
+def isDisabled(text, index, tags = ['*']):
+ """
+ Checks whether the text part at the given location is disabled, e.g.
+ by a comment or by nowiki tags.
+
+ For the tags parameter, see removeDisabledParts() above.
+ """
+ # Find a marker that is not already in the text.
+ marker = '@@'
+ while marker in text:
+ marker += '@'
+ text = text[:index] + marker + text[index:]
+ text = removeDisabledParts(text, tags)
+ return (marker not in text)
+
# Part of library dealing with interwiki links
def getLanguageLinks(text, insite = None, pageLink = "[[]]"):
Revision: 4232
Author: wikipedian
Date: 2007-09-11 12:29:26 +0000 (Tue, 11 Sep 2007)
Log Message:
-----------
first try to add the <references/> to an existing section
added docu etc.
Modified Paths:
--------------
trunk/pywikipedia/noreferences.py
Modified: trunk/pywikipedia/noreferences.py
===================================================================
--- trunk/pywikipedia/noreferences.py 2007-09-11 12:09:57 UTC (rev 4231)
+++ trunk/pywikipedia/noreferences.py 2007-09-11 12:29:26 UTC (rev 4232)
@@ -20,6 +20,8 @@
want to iterate over all categories starting at M, use
-start:Category:M.
+ -always Don't prompt you for each replacement.
+
All other parameters will be regarded as part of the title of a single page,
and the bot will only work on that single page.
@@ -52,7 +54,7 @@
# For example, on an English wiki, the script would place the "References"
# section in front of the "Further reading" section, if that existed.
# Otherwise, it would try to do it
-placeBeforeSection = {
+placeBeforeSections = {
'de': [ # no explicit policy on where to put the references
u'Literatur',
u'Weblinks',
@@ -67,9 +69,18 @@
}
# How the references section should look like.
-referencesSection = {
- 'de': u'\n== Einzelnachweise ==\n\n<references/>\n', # The "Einzelnachweise" title is disputed, some people prefer "Quellen", "Quellenangaben", "Fußnoten", etc.
- 'en': u'\n== References ==\n\n<references/>\n',
+referencesSections = {
+ 'de': [
+ u'Einzelnachweise', # The "Einzelnachweise" title is disputed, some people prefer the other variants
+ u'Quellen',
+ u'Quellenangaben',
+ u'Fußnoten',
+ ],
+ 'en': [ # not sure about which ones are preferred.
+ u'References',
+ u'Footnotes',
+ u'Notes',
+ ],
}
# Templates which include a <references/> tag. If there is no such template
@@ -116,7 +127,10 @@
except KeyError:
self.referencesTemplates = []
- def needsTreatment(self, page):
+ def lacksReferences(self, page):
+ """
+ Checks whether or not the page is lacking a references tag.
+ """
# Show the title of the page we're working on.
# Highlight the title in purple.
wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
@@ -144,48 +158,72 @@
wikipedia.output(u"Page %s is locked?!" % page.aslink())
return False
- def treat(self, page):
+ def addReferences(self, page):
+ """
+ Tries to add a references tag into an existing section where it fits
+ into. If there is no such section, creates a new section containing
+ the references tag.
+ """
oldText = page.get()
- for section in wikipedia.translate(wikipedia.getSite(), placeBeforeSection):
- sectionR = re.compile(r'\r\n=+ *%s *=+' % section)
+
+ # Is there an existing section where we can add the references tag?
+ for section in wikipedia.translate(wikipedia.getSite(), referencesSections):
+ sectionR = re.compile(r'\r\n=+ *%s *=+\r\n' % section)
match = sectionR.search(oldText)
if match:
+ wikipedia.output(u'Adding references tag to existing %s section...\n' % section)
+ newText = oldText[:match.end()] + u'\n<references/>\n' + oldText[match.end():]
+ self.save(page, newText)
+ return
+
+ # Create a new section for the references tag
+ for section in wikipedia.translate(wikipedia.getSite(), placeBeforeSections):
+ # Find out where to place the new section
+ sectionR = re.compile(r'\r\n=+ *%s *=+\r\n' % section)
+ match = sectionR.search(oldText)
+ if match:
wikipedia.output(u'Adding references section...\n')
pos = match.start()
- newSection = wikipedia.translate(wikipedia.getSite(), referencesSection)
+ newSection = u'\n== %s ==\n\n<references/>\n' % wikipedia.translate(wikipedia.getSite(), referencesSections)[0]
newText = oldText[:match.start()] + newSection + oldText[match.start():]
- wikipedia.showDiff(oldText, newText)
- if not self.always:
- choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No', 'Always yes'], ['y', 'N', 'a'], 'N')
- if choice == 'n':
- return
- elif choice == 'a':
- self.always = True
+ self.save(page, newText)
+ return
+ # TODO: Think of a clever way of handling this.
+ wikipedia.output(u'Found no section that can be preceeded by a new references section. Please add a references section.')
- if self.always:
- try:
- page.put(newText)
- except wikipedia.EditConflict:
- wikipedia.output(u'Skipping %s because of edit conflict' % (page.title(),))
- except wikipedia.SpamfilterError, e:
- wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url))
- except wikipedia.LockedPage:
- wikipedia.output(u'Skipping %s (locked page)' % (page.title(),))
- else:
- # Save the page in the background. No need to catch exceptions.
- page.put_async(newText)
+ def save(self, page, newText):
+ """
+ Saves the page to the wiki, if the user accepts the changes made.
+ """
+ wikipedia.showDiff(page.get(), newText)
+ if not self.always:
+ choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No', 'Always yes'], ['y', 'N', 'a'], 'N')
+ if choice == 'n':
return
- # TODO: Think of a more clever way of doing this.
- wikipedia.output(u'Found no section that can be preceeded by a new references section. Please fix it manually.')
-
+ elif choice == 'a':
+ self.always = True
+ if self.always:
+ try:
+ page.put(newText)
+ except wikipedia.EditConflict:
+ wikipedia.output(u'Skipping %s because of edit conflict' % (page.title(),))
+ except wikipedia.SpamfilterError, e:
+ wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url))
+ except wikipedia.LockedPage:
+ wikipedia.output(u'Skipping %s (locked page)' % (page.title(),))
+ else:
+ # Save the page in the background. No need to catch exceptions.
+ page.put_async(newText)
+ return
+
def run(self):
comment = wikipedia.translate(wikipedia.getSite(), msg)
wikipedia.setAction(comment)
for page in self.generator:
- if self.needsTreatment(page):
- self.treat(page)
+ if self.lacksReferences(page):
+ self.addReferences(page)
def main():
#page generator
Revision: 4228
Author: wikipedian
Date: 2007-09-11 11:56:03 +0000 (Tue, 11 Sep 2007)
Log Message:
-----------
new script to add missing <references/> tags
Added Paths:
-----------
trunk/pywikipedia/noreferences.py
Added: trunk/pywikipedia/noreferences.py
===================================================================
--- trunk/pywikipedia/noreferences.py (rev 0)
+++ trunk/pywikipedia/noreferences.py 2007-09-11 11:56:03 UTC (rev 4228)
@@ -0,0 +1,241 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""
+This script goes over multiple pages, searches for pages where <references/>
+is missing although a <ref> tag is present, and in that case adds a new
+references section.
+
+These command line parameters can be used to specify which pages to work on:
+
+&params;
+
+ -xml Retrieve information from a local XML dump (pages-articles
+ or pages-meta-current, see http://download.wikimedia.org).
+ Argument can also be given as "-xml:filename".
+
+ -namespace:n Number of namespace to process. The parameter can be used
+ multiple times. It works in combination with all other
+ parameters, except for the -start parameter. If you e.g.
+ want to iterate over all categories starting at M, use
+ -start:Category:M.
+
+All other parameters will be regarded as part of the title of a single page,
+and the bot will only work on that single page.
+
+It is strongly recommended not to run this script over the entire article
+namespace (using the -start parameter), as that would consume too much
+bandwidth. Instead, use the -xml parameter, or use another way to generate
+a list of affected articles.
+"""
+
+__version__='$Id: selflink.py 4187 2007-09-03 11:37:19Z wikipedian $'
+
+import wikipedia, pagegenerators, catlib
+import editarticle
+import re, sys
+
+# This is required for the text that is shown when you run this script
+# with the parameter -help.
+docuReplacements = {
+ '&params;': pagegenerators.parameterHelp,
+}
+
+# Summary messages in different languages
+msg = {
+ 'de':u'Bot: Trage fehlendes <references /> nach',
+ 'en':u'Robot: Adding missing <references /> tag',
+}
+
+# References sections are usually placed before further reading / external
+# link sections. This dictionary defines these sections, sorted by priority.
+# For example, on an English wiki, the script would place the "References"
+# section in front of the "Further reading" section, if that existed.
+# Otherwise, it would try to do it
+placeBeforeSection = {
+ 'de': [ # no explicit policy on where to put the references
+ u'Literatur',
+ u'Weblinks',
+ u'Siehe auch'
+ ],
+ 'en': [ # no explicit policy on where to put the references
+ u'Further reading',
+ u'External links',
+ u'See also',
+ u'Notes'
+ ],
+}
+
+# How the references section should look like.
+referencesSection = {
+ 'de': u'\n== Einzelnachweise ==\n\n<references/>\n', # The "Einzelnachweise" title is disputed, some people prefer "Quellen", "Quellenangaben", "Fußnoten", etc.
+ 'en': u'\n== References ==\n\n<references/>\n',
+}
+
+# Templates which include a <references/> tag. If there is no such template
+# on your wiki, you don't have to enter anything here.
+referencesTemplates = {
+ 'wikipedia': {
+ 'en': [u'Reflist'],
+ },
+}
+
+class XmlDumpNoReferencesPageGenerator:
+ """
+ Generator which will yield Pages that might lack a references tag.
+ These pages will be retrieved from a local XML dump file
+ (pages-articles or pages-meta-current).
+ """
+ def __init__(self, xmlFilename):
+ """
+ Arguments:
+ * xmlFilename - The dump's path, either absolute or relative
+ """
+ self.xmlFilename = xmlFilename
+ self.refR = re.compile('</ref>')
+ self.referencesR = re.compile('<references */>')
+
+ def __iter__(self):
+ import xmlreader
+ mysite = wikipedia.getSite()
+ dump = xmlreader.XmlDump(self.xmlFilename)
+ for entry in dump.parse():
+ if self.refR.search(entry.text) and not self.referencesR.search(entry.text):
+ yield wikipedia.Page(mysite, entry.title)
+
+class NoReferencesBot:
+
+ def __init__(self, generator, always = False):
+ self.generator = generator
+ self.always = always
+
+ self.refR = re.compile('</ref>')
+ self.referencesR = re.compile('<references */>')
+ try:
+ self.referencesTemplates = referencesTemplates[wikipedia.getSite().family.name][wikipedia.getSite().lang]
+ except KeyError:
+ self.referencesTemplates = []
+
+ def needsTreatment(self, page):
+ # Show the title of the page we're working on.
+ # Highlight the title in purple.
+ wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
+ try:
+ oldText = page.get()
+ oldTextCleaned = wikipedia.removeDisabledParts(oldText)
+ if not self.refR.search(oldTextCleaned):
+ wikipedia.output(u'No changes necessary: no ref tags found.')
+ return False
+ elif self.referencesR.search(oldTextCleaned):
+ wikipedia.output(u'No changes necessary: references tags found.')
+ return False
+ else:
+ for template in page.templates():
+ if template in self.referencesTemplates:
+ wikipedia.output(u'No changes necessary: references template found.')
+ return False
+ wikipedia.output(u'Found ref without references.')
+ return True
+ except wikipedia.NoPage:
+ wikipedia.output(u"Page %s does not exist?!" % page.aslink())
+ except wikipedia.IsRedirectPage:
+ wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
+ except wikipedia.LockedPage:
+ wikipedia.output(u"Page %s is locked?!" % page.aslink())
+ return False
+
+ def treat(self, page):
+ oldText = page.get()
+ for section in wikipedia.translate(wikipedia.getSite(), placeBeforeSection):
+ sectionR = re.compile(r'\r\n=+ +%s +=+' % section)
+ match = sectionR.search(oldText)
+ if match:
+ wikipedia.output(u'Adding references section...\n')
+ pos = match.start()
+ newSection = wikipedia.translate(wikipedia.getSite(), referencesSection)
+ newText = oldText[:match.start()] + newSection + oldText[match.start():]
+ wikipedia.showDiff(oldText, newText)
+ if not self.always:
+ choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No', 'Always yes'], ['y', 'N', 'a'], 'N')
+ if choice == 'n':
+ return
+ elif choice == 'a':
+ self.always = True
+
+ if self.always:
+ try:
+ page.put(newText)
+ except wikipedia.EditConflict:
+ wikipedia.output(u'Skipping %s because of edit conflict' % (page.title(),))
+ except wikipedia.SpamfilterError, e:
+ wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url))
+ except wikipedia.LockedPage:
+ wikipedia.output(u'Skipping %s (locked page)' % (page.title(),))
+ else:
+ # Save the page in the background. No need to catch exceptions.
+ page.put_async(newText)
+ return
+ # TODO: Think of a more clever way of doing this.
+ wikipedia.output(u'Found no section that can be preceeded by a new references section. Please fix it manually.')
+
+
+ def run(self):
+ comment = wikipedia.translate(wikipedia.getSite(), msg)
+ wikipedia.setAction(comment)
+
+ for page in self.generator:
+ if self.needsTreatment(page):
+ self.treat(page)
+
+def main():
+ #page generator
+ gen = None
+ # This temporary array is used to read the page title if one single
+ # page to work on is specified by the arguments.
+ pageTitle = []
+ # Which namespaces should be processed?
+ # default to [] which means all namespaces will be processed
+ namespaces = []
+ # If True, never ask before changing a page (set by -always)
+ always = False
+ # This factory is responsible for processing command line arguments
+ # that are also used by other scripts and that determine which pages
+ # to work on.
+ genFactory = pagegenerators.GeneratorFactory()
+
+ for arg in wikipedia.handleArgs():
+ if arg.startswith('-xml'):
+ if len(arg) == 4:
+ xmlFilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
+ else:
+ xmlFilename = arg[5:]
+ gen = XmlDumpNoReferencesPageGenerator(xmlFilename)
+ elif arg.startswith('-namespace:'):
+ namespaces.append(int(arg[11:]))
+ elif arg == '-always':
+ always = True
+ else:
+ generator = genFactory.handleArg(arg)
+ if generator:
+ gen = generator
+ else:
+ pageTitle.append(arg)
+
+ if pageTitle:
+ page = wikipedia.Page(wikipedia.getSite(), ' '.join(pageTitle))
+ gen = iter([page])
+ if not gen:
+ wikipedia.showHelp('noreferences')
+ else:
+ if namespaces != []:
+ gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
+ preloadingGen = pagegenerators.PreloadingGenerator(gen)
+ bot = NoReferencesBot(preloadingGen, always = always)
+ bot.run()
+
+if __name__ == "__main__":
+ try:
+ main()
+ finally:
+ wikipedia.stopme()
+
Revision: 4227
Author: wikipedian
Date: 2007-09-11 11:48:50 +0000 (Tue, 11 Sep 2007)
Log Message:
-----------
added -help parameter to help (somewhat useless, but who knows...)
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-09-11 10:44:08 UTC (rev 4226)
+++ trunk/pywikipedia/wikipedia.py 2007-09-11 11:48:50 UTC (rev 4227)
@@ -4743,6 +4743,8 @@
wikipedia, wiktionary, wikitravel, ...
This will override the configuration in user-config.py.
+-help Shows this help text.
+
-log Enable the logfile. Logs will be stored in the logs
subdirectory.