Revision: 8576
Author: xqt
Date: 2010-09-19 17:10:15 +0000 (Sun, 19 Sep 2010)
Log Message:
-----------
update cc from trunk
Modified Paths:
--------------
branches/rewrite/scripts/cosmetic_changes.py
Modified: branches/rewrite/scripts/cosmetic_changes.py
===================================================================
--- branches/rewrite/scripts/cosmetic_changes.py 2010-09-19 16:42:57 UTC (rev 8575)
+++ branches/rewrite/scripts/cosmetic_changes.py 2010-09-19 17:10:15 UTC (rev 8576)
@@ -32,7 +32,8 @@
all of them, but be careful if you do.
"""
__version__ = '$Id$'
-import pywikibot, isbn
+import pywikibot
+import isbn
from pywikibot import pagegenerators
import sys
import re
@@ -49,6 +50,7 @@
# Summary message when using this module as a stand-alone script
msg_standalone = {
+ 'commons': u'Bot: [[Commons talk:Tools/pywiki file description cleanup|desc page fmt]]',
'als':u'Bötli: chleineri Änderige',
'ar': u'روبوت: تغييرات تجميلية',
'be-x-old': u'Робат: касмэтычныя зьмены',
@@ -111,6 +113,7 @@
# Summary message that will be appended to the normal message when
# cosmetic changes are made on the fly
msg_append = {
+ 'commons': u'; [[Commons talk:Tools/pywiki file description cleanup|desc page fmt]]',
'als':u'; chleineri Änderige',
'ar': u'; تغييرات تجميلية',
'be-x-old': u'; касмэтычныя зьмены',
@@ -170,33 +173,97 @@
'zh': u'; 細部更改',
}
+nn_iw_msg = u'<!--interwiki (no, sv, da first; then other languages alphabetically by name)-->'
+
+# This is from interwiki.py;
+# move it to family file and implement global instances
+moved_links = {
+ 'ca' : (u'ús de la plantilla', u'/ús'),
+ 'cs' : (u'dokumentace', u'/doc'),
+ 'de' : (u'dokumentation', u'/Meta'),
+ 'en' : ([u'documentation',
+ u'template documentation',
+ u'template doc',
+ u'doc',
+ u'documentation, template'], u'/doc'),
+ 'es' : ([u'documentación', u'documentación de plantilla'],
u'/doc'),
+ 'fr' : (u'/documentation', u'/Documentation'),
+ 'hu' : (u'sablondokumentáció', u'/doc'),
+ 'id' : (u'template doc', u'/doc'),
+ 'ja' : (u'documentation', u'/doc'),
+ 'ka' : (u'თარგის ინფო', u'/ინფო'),
+ 'ko' : (u'documentation', u'/설명문서'),
+ 'ms' : (u'documentation', u'/doc'),
+ 'pl' : (u'dokumentacja', u'/opis'),
+ 'pt' : ([u'documentação', u'/doc'], u'/doc'),
+ 'ro' : (u'documentaţie', u'/doc'),
+ 'ru' : (u'doc', u'/doc'),
+ 'sv' : (u'dokumentation', u'/dok'),
+ 'vi' : (u'documentation', u'/doc'),
+ 'zh' : ([u'documentation', u'doc'], u'/doc'),
+}
+
+# Template which should be replaced or removed.
+# Use a list with two entries. The first entry will be replaced by the second.
+# Examples:
+# For removing {{Foo}}, the list must be:
+# (u'Foo', None),
+#
+# The following also works:
+# (u'Foo', ''),
+#
+# For replacing {{Foo}} with {{Bar}} the list must be:
+# (u'Foo', u'Bar'),
+#
+# This also removes all template parameters of {{Foo}}
+# For replacing {{Foo}} with {{Bar}} but keep the template
+# parameters in its original order, please use:
+# (u'Foo', u'Bar\g<parameters>'),
+
+deprecatedTemplates = {
+ 'wikipedia': {
+ 'de': [
+ (u'Belege', u'Belege fehlen\g<parameters>'),
+ (u'Quelle', u'Belege fehlen\g<parameters>'),
+ (u'Quellen', u'Belege fehlen\g<parameters>'),
+ (u'Quellen fehlen', u'Belege fehlen\g<parameters>'),
+ ],
+ }
+}
+
class CosmeticChangesToolkit:
- def __init__(self, site, debug=False, redirect=False, namespace=None):
+ def __init__(self, site, debug=False, redirect=False, namespace=None,
pageTitle=None):
self.site = site
self.debug = debug
self.redirect = redirect
self.namespace = namespace
self.template = (self.namespace == 10)
self.talkpage = self.namespace >= 0 and self.namespace % 2 == 1
+ self.title = pageTitle
def change(self, text):
"""
Given a wiki source code text, return the cleaned up version.
"""
oldText = text
+ if self.site.sitename()== u'commons:commons' and self.namespace == 6:
+ text = self.commonsfiledesc(text)
text = self.fixSelfInterwiki(text)
- text = self.standardizeInterwiki(text)
- text = self.standardizeCategories(text)
+ text = self.standardizePageFooter(text)
text = self.cleanUpLinks(text)
text = self.cleanUpSectionHeaders(text)
text = self.putSpacesInLists(text)
text = self.translateAndCapitalizeNamespaces(text)
+ text = self.replaceDeprecatedTemplates(text)
text = self.resolveHtmlEntities(text)
text = self.validXhtml(text)
text = self.removeUselessSpaces(text)
text = self.removeNonBreakingSpaceBeforePercent(text)
text = self.fixSyntaxSave(text)
text = self.fixHtml(text)
+ text = self.fixStyle(text)
+ text = self.fixTypo(text)
+ text = self.fixArabicLetters(text)
try:
text = isbn.hyphenateIsbnNumbers(text)
except isbn.InvalidIsbnException, error:
@@ -210,21 +277,13 @@
Interwiki links to the site itself are displayed like local links.
Remove their language code prefix.
"""
- interwikiR = re.compile(r'\[\[%s\s?:([^\[\]\n]*)\]\]' % self.site.lang)
- text = interwikiR.sub(r'[[\1]]', text)
- return text
-
- def standardizeInterwiki(self, text):
- """
- Makes sure that interwiki links are put to the correct position and
- into the right order.
- """
if not self.talkpage and pywikibot.calledModuleName() <>
'interwiki':
- interwikiLinks = pywikibot.getLanguageLinks(text, insite = self.site)
- text = pywikibot.replaceLanguageLinks(text, interwikiLinks, site = self.site,
template = self.template)
+ interwikiR = re.compile(r'\[\[%s\s?:([^\[\]\n]*)\]\]' %
self.site.lang)
+ text = interwikiR.sub(r'[[\1]]', text)
return text
- def standardizeCategories(self, text):
+
+ def standardizePageFooter(self, text):
"""
Makes sure that categories are put to the correct position, but
does not sort them.
@@ -256,7 +315,12 @@
continue
namespaces = list(family.namespace(self.site.lang, nsNumber, all = True))
thisNs = namespaces.pop(0)
-
+ if nsNumber == 6 and family.name == 'wikipedia' and \
+ self.site.lang in ('en', 'fr'):
+ # do not change "Image" on en-wiki and fr-wiki
+ for image in [u'Image', u'image']:
+ if image in namespaces:
+ namespaces.remove(image)
# skip main (article) namespace
if thisNs and namespaces:
text = pywikibot.replaceExcept(text, r'\[\[\s*(' +
'|'.join(namespaces) + ') *:(?P<nameAndLabel>.*?)\]\]',
r'[[' + thisNs + ':\g<nameAndLabel>]]', exceptions)
@@ -435,10 +499,26 @@
and French Wikipedia. It might be that it is not wanted on other wikis.
If there are any complaints, please file a bug report.
"""
- if not self.redirect:
- text = pywikibot.replaceExcept(text,
r'(?m)^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)',
'\g<bullet> \g<char>', ['comment', 'math',
'nowiki', 'pre'])
+ exceptions = ['comment', 'math', 'nowiki', 'pre',
'source', 'timeline']
+ if not self.redirect and pywikibot.calledModuleName() <>
'capitalize_redirects':
+ text = pywikibot.replaceExcept(text,
r'(?m)^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)',
'\g<bullet> \g<char>', exceptions)
return text
+ def replaceDeprecatedTemplates(self, text):
+ exceptions = ['comment', 'math', 'nowiki',
'pre']
+ if self.site.family.name in deprecatedTemplates and self.site.lang in
deprecatedTemplates[self.site.family.name]:
+ for template in deprecatedTemplates[self.site.family.name][self.site.lang]:
+ old = template[0]
+ new = template[1]
+ if new == None:
+ new = ''
+ else:
+ new = '{{'+new+'}}'
+ if not self.site.nocapitalize:
+ old = '[' + old[0].upper() + old[0].lower() + ']' +
old[1:]
+ text = pywikibot.replaceExcept(text, r'\{\{([mM][sS][gG]:)?' +
old + '(?P<parameters>\|[^}]+|)}}', new, exceptions)
+ return text
+
#from fixes.py
def fixSyntaxSave(self, text):
exceptions = ['nowiki', 'comment', 'math', 'pre',
'source', 'startspace']
@@ -469,11 +549,131 @@
# horizontal line with attributes; can't be done with wiki syntax
# so we only make it XHTML compliant
text = pywikibot.replaceExcept(text, r'(?i)<hr ([^>/]+?)>',
r'<hr \1 />', exceptions)
+ # a header where only spaces are in the same line
+ for level in range(1, 7):
+ equals = '\\1%s \\2 %s\\3' % ("="*level,
"="*level)
+ text = pywikibot.replaceExcept(text,
+ r'(?i)([\r\n]) *<h%d> *([^<]+?)
*</h%d> *([\r\n])'%(level, level),
+ r'%s'%equals, exceptions)
+ #remove empty <ref/>-tag
+ text = pywikibot.replaceExcept(text, r'(?i)<ref\s*/>', r'',
exceptions)
# TODO: maybe we can make the bot replace <p> tags with \r\n's.
return text
+ def fixStyle(self, text):
+ exceptions = ['nowiki', 'comment', 'math', 'pre',
'source', 'startspace']
+ # convert prettytable to wikitable class
+ if self.site.language in ('de', 'en'):
+ text = pywikibot.replaceExcept(text,
ur'(class="[^"]*)prettytable([^"]*")',
ur'\1wikitable\2', exceptions)
+ return text
+
+ def fixTypo(self, text):
+ exceptions = ['nowiki', 'comment', 'math', 'pre',
'source', 'startspace', 'gallery', 'hyperlink',
'interwiki', 'link']
+ # change <number> ccm -> <number> cm³
+ text = pywikibot.replaceExcept(text, ur'(\d)\s* ccm',
ur'\1 cm³', exceptions)
+ text = pywikibot.replaceExcept(text, ur'(\d)\s*ccm',
ur'\1 cm³', exceptions)
+ # Solve wrong Nº sign with °C or °F
+ # additional exception requested on fr-wiki for this stuff
+ pattern = re.compile(u'«.*?»', re.UNICODE)
+ exceptions.append(pattern)
+ text = pywikibot.replaceExcept(text, ur'(\d)\s* [º°]([CF])',
ur'\1 °\2', exceptions)
+ text = pywikibot.replaceExcept(text, ur'(\d)\s*[º°]([CF])',
ur'\1 °\2', exceptions)
+ text = pywikibot.replaceExcept(text, ur'º([CF])', ur'°\1',
exceptions)
+ return text
+
+ def fixArabicLetters(self, text):
+ if self.site.lang=='ckb':
+ exceptions = [
+ 'gallery',
+ 'hyperlink',
+ 'interwiki',
+ # but changes letters inside wikilinks
+ #'link',
+ 'math',
+ 'pre',
+ 'template',
+ 'timeline',
+ 'ref',
+ 'source',
+ 'startspace',
+ 'inputbox',
+ ]
+ # do not change inside file links
+ namespaces = list(self.site.namespace(6, all = True))
+ pattern = re.compile(u'\[\[(' + '|'.join(namespaces) +
'):.+?\..+?\]\]', re.UNICODE)
+ exceptions.append(pattern)
+ text = pywikibot.replaceExcept(text, u',', u'،', exceptions)
+ text = pywikibot.replaceExcept(text, ur'ه([.،_<\]\s])',
ur'ە\1', exceptions)
+ text = pywikibot.replaceExcept(text, u'ه', u'ە',
exceptions)
+ text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions)
+ text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions)
+ text = pywikibot.replaceExcept(text, ur'[ىي]', u'ی',
exceptions)
+ # replace persian digits
+ for i in range(0,10):
+ text = pywikibot.replaceExcept(text, u'۰۱۲۳۴۵۶۷۸۹'[i],
u'٠١٢٣٤٥٦٧٨٩'[i], exceptions)
+ # do not change digits in class, style and table params
+ pattern = re.compile(u'=".*?"', re.UNICODE)
+ exceptions.append(pattern)
+ # do not change digits inside html-tags
+ pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE)
+ exceptions.append(pattern)
+ for i in range(0,10):
+ text = pywikibot.replaceExcept(text, str(i), u'٠١٢٣٤٥٦٧٨٩'[i],
exceptions)
+ return text
+
+ # Retrieved from "http://commons.wikimedia.org/wiki/Commons:Tools/pywiki_file_description_cleanup"
+ def commonsfiledesc(self, text):
+ # section headers to {{int:}} versions
+ exceptions = ['comment', 'includeonly', 'math',
'noinclude', 'nowiki',
+ 'pre', 'source', 'ref',
'timeline']
+ text = pywikibot.replaceExcept(text,
+ r"([\r\n]|^)\=\= *Summary *\=\=",
+ r"\1== {{int:filedesc}} ==",
+ exceptions, True)
+ text = pywikibot.replaceExcept(
+ text,
+ r"([\r\n])\=\= *\[\[Commons:Copyright tags\|Licensing\]\]: *\=\=",
+ r"\1== {{int:license}} ==", exceptions, True)
+ text = pywikibot.replaceExcept(
+ text,
+ r"([\r\n])\=\= *(Licensing|License information|{{int:license-header}})
*\=\=",
+ r"\1== {{int:license}} ==", exceptions, True)
+
+ # frequent field values to {{int:}} versions
+ text = pywikibot.replaceExcept(
+ text,
+ r'([\r\n]\|[Ss]ource *\= *)(?:[Oo]wn work by uploader|[Oo]wn
work|[Ee]igene [Aa]rbeit) *([\r\n])',
+ r'\1{{own}}\2', exceptions, True)
+ text = pywikibot.replaceExcept(
+ text,
+ r'(\| *Permission *\=) *(?:[Ss]ee below|[Ss]iehe unten) *([\r\n])',
+ r'\1\2', exceptions, True)
+
+ # added to transwikied pages
+ text = pywikibot.replaceExcept(text, r'__NOTOC__', '',
exceptions, True)
+
+ # tracker element for js upload form
+ text = pywikibot.replaceExcept(
+ text,
+ r'<!-- *{{ImageUpload\|(?:full|basic)}} *-->',
+ '', exceptions[1:], True)
+ text = pywikibot.replaceExcept(text, r'{{ImageUpload\|(?:basic|full)}}',
+ '', exceptions, True)
+
+ # duplicated section headers
+ text = pywikibot.replaceExcept(
+ text,
+ r'([\r\n]|^)\=\= *{{int:filedesc}} *\=\=(?:[\r\n ]*)\=\=
*{{int:filedesc}} *\=\=',
+ r'\1== {{int:filedesc}} ==', exceptions, True)
+ text = pywikibot.replaceExcept(
+ text,
+ r'([\r\n]|^)\=\= *{{int:license}} *\=\=(?:[\r\n ]*)\=\= *{{int:license}}
*\=\=',
+ r'\1== {{int:license}} ==', exceptions, True)
+ return text
+
class CosmeticChangesBot:
- def __init__(self, generator, acceptall = False, comment=u'Robot: Cosmetic changes'):
+ def __init__(self, generator, acceptall = False,
+ comment=u'Robot: Cosmetic changes'):
self.generator = generator
self.acceptall = acceptall
self.comment = comment
@@ -483,13 +683,17 @@
try:
# Show the title of the page we're working on.
# Highlight the title in purple.
- pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
- ccToolkit = CosmeticChangesToolkit(page.site, debug = True, namespace =
page.namespace())
+ pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
+ % page.title())
+ ccToolkit = CosmeticChangesToolkit(page.site, debug=True,
+ namespace=page.namespace(),
+ pageTitle=page.title())
changedText = ccToolkit.change(page.get())
if changedText.strip() != page.get().strip():
if not self.acceptall:
- choice = pywikibot.inputChoice(u'Do you want to accept these
changes?',
- ['Yes', 'No',
'All', 'Quit'], ['y', 'N', 'a', 'q'],
'N')
+ choice = pywikibot.inputChoice(
+ u'Do you want to accept these changes?',
+ ['Yes', 'No', 'All', 'Quit'],
['y', 'N', 'a', 'q'], 'N')
if choice == 'a':
self.acceptall = True
elif choice == 'q':
@@ -498,15 +702,19 @@
if self.acceptall or choice == 'y':
page.put(changedText, comment=self.comment)
else:
- pywikibot.output('No changes were necessary in %s' %
page.title())
+ pywikibot.output('No changes were necessary in %s'
+ % page.title())
except pywikibot.NoPage:
- pywikibot.output("Page %s does not exist?!" % page.aslink())
+ pywikibot.output("Page %s does not exist?!"
+ % page.title(asLink=True))
except pywikibot.IsRedirectPage:
- pywikibot.output("Page %s is a redirect; skipping." %
page.aslink())
+ pywikibot.output("Page %s is a redirect; skipping."
+ % page.title(asLink=True))
except pywikibot.LockedPage:
- pywikibot.output("Page %s is locked?!" % page.aslink())
+ pywikibot.output("Page %s is locked?!" % page.title(asLink=True))
except pywikibot.EditConflict:
- pywikibot.output("An edit conflict has occured at %s." %
page.aslink())
+ pywikibot.output("An edit conflict has occured at %s."
+ % page.title(asLink=True))
def run(self):
try:
@@ -540,16 +748,6 @@
if editSummary == '':
# Load default summary message.
editSummary = pywikibot.translate(pywikibot.getSite(), msg_standalone)
-
- # Disabled this check. Although the point is still valid, there
- # is now a warning and a prompt (see below).
- #if pywikibot.getSite() == pywikibot.getSite('nl','wikipedia'):
- #print "Deze bot is op WikipediaNL niet gewenst."
- #print "Het toevoegen van cosmetic changes bij andere wijzigingen is toegestaan,"
- #print "maar cosmetic_changes als stand-alone bot niet."
- #print "Zoek alstublieft een nuttig gebruik voor uw bot."
- #sys.exit()
-
if pageTitle:
site = pywikibot.getSite()
gen = iter([pywikibot.Page(pywikibot.Link(t, site)) for t in pageTitle])
@@ -558,11 +756,14 @@
if not gen:
pywikibot.showHelp()
elif not always:
- answer = pywikibot.inputChoice(warning + '\nDo you really want to continue?', ['yes', 'no'], ['y', 'N'], 'N')
+ answer = pywikibot.inputChoice(
+ warning + '\nDo you really want to continue?',
+ ['yes', 'no'], ['y', 'N'], 'N')
if answer == 'y':
preloadingGen = pagegenerators.PreloadingGenerator(gen)
- bot = CosmeticChangesBot(preloadingGen, acceptall=always, comment=editSummary)
+ bot = CosmeticChangesBot(preloadingGen, acceptall=always,
+ comment=editSummary)
bot.run()
if __name__ == "__main__":