Revision: 8576
Author: xqt
Date: 2010-09-19 17:10:15 +0000 (Sun, 19 Sep 2010)
Log Message:
-----------
update cc from trunk
Modified Paths:
--------------
    branches/rewrite/scripts/cosmetic_changes.py
Modified: branches/rewrite/scripts/cosmetic_changes.py =================================================================== --- branches/rewrite/scripts/cosmetic_changes.py 2010-09-19 16:42:57 UTC (rev 8575) +++ branches/rewrite/scripts/cosmetic_changes.py 2010-09-19 17:10:15 UTC (rev 8576) @@ -32,7 +32,8 @@ all of them, but be careful if you do. """ __version__ = '$Id$' -import pywikibot, isbn +import pywikibot +import isbn from pywikibot import pagegenerators import sys import re @@ -49,6 +50,7 @@
# Summary message when using this module as a stand-alone script msg_standalone = { + 'commons': u'Bot: [[Commons talk:Tools/pywiki file description cleanup|desc page fmt]]', 'als':u'Bötli: chleineri Änderige', 'ar': u'روبوت: تغييرات تجميلية', 'be-x-old': u'Робат: касмэтычныя зьмены', @@ -111,6 +113,7 @@ # Summary message that will be appended to the normal message when # cosmetic changes are made on the fly msg_append = { + 'commons': u'; [[Commons talk:Tools/pywiki file description cleanup|desc page fmt]]', 'als':u'; chleineri Änderige', 'ar': u'; تغييرات تجميلية', 'be-x-old': u'; касмэтычныя зьмены', @@ -170,33 +173,97 @@ 'zh': u'; 細部更改', }
+nn_iw_msg = u'<!--interwiki (no, sv, da first; then other languages alphabetically by name)-->' + +# This is from interwiki.py; +# move it to family file and implement global instances +moved_links = { + 'ca' : (u'ús de la plantilla', u'/ús'), + 'cs' : (u'dokumentace', u'/doc'), + 'de' : (u'dokumentation', u'/Meta'), + 'en' : ([u'documentation', + u'template documentation', + u'template doc', + u'doc', + u'documentation, template'], u'/doc'), + 'es' : ([u'documentación', u'documentación de plantilla'], u'/doc'), + 'fr' : (u'/documentation', u'/Documentation'), + 'hu' : (u'sablondokumentáció', u'/doc'), + 'id' : (u'template doc', u'/doc'), + 'ja' : (u'documentation', u'/doc'), + 'ka' : (u'თარგის ინფო', u'/ინფო'), + 'ko' : (u'documentation', u'/설명문서'), + 'ms' : (u'documentation', u'/doc'), + 'pl' : (u'dokumentacja', u'/opis'), + 'pt' : ([u'documentação', u'/doc'], u'/doc'), + 'ro' : (u'documentaţie', u'/doc'), + 'ru' : (u'doc', u'/doc'), + 'sv' : (u'dokumentation', u'/dok'), + 'vi' : (u'documentation', u'/doc'), + 'zh' : ([u'documentation', u'doc'], u'/doc'), +} + +# Template which should be replaced or removed. +# Use a list with two entries. The first entry will be replaced by the second. 
+# Examples: +# For removing {{Foo}}, the list must be: +# (u'Foo', None), +# +# The following also works: +# (u'Foo', ''), +# +# For replacing {{Foo}} with {{Bar}} the list must be: +# (u'Foo', u'Bar'), +# +# This also removes all template parameters of {{Foo}} +# For replacing {{Foo}} with {{Bar}} but keep the template +# parameters in its original order, please use: +# (u'Foo', u'Bar\g<parameters>'), + +deprecatedTemplates = { + 'wikipedia': { + 'de': [ + (u'Belege', u'Belege fehlen\g<parameters>'), + (u'Quelle', u'Belege fehlen\g<parameters>'), + (u'Quellen', u'Belege fehlen\g<parameters>'), + (u'Quellen fehlen', u'Belege fehlen\g<parameters>'), + ], + } +} + class CosmeticChangesToolkit: - def __init__(self, site, debug=False, redirect=False, namespace=None): + def __init__(self, site, debug=False, redirect=False, namespace=None, pageTitle=None): self.site = site self.debug = debug self.redirect = redirect self.namespace = namespace self.template = (self.namespace == 10) self.talkpage = self.namespace >= 0 and self.namespace % 2 == 1 + self.title = pageTitle
def change(self, text): """ Given a wiki source code text, return the cleaned up version. """ oldText = text + if self.site.sitename()== u'commons:commons' and self.namespace == 6: + text = self.commonsfiledesc(text) text = self.fixSelfInterwiki(text) - text = self.standardizeInterwiki(text) - text = self.standardizeCategories(text) + text = self.standardizePageFooter(text) text = self.cleanUpLinks(text) text = self.cleanUpSectionHeaders(text) text = self.putSpacesInLists(text) text = self.translateAndCapitalizeNamespaces(text) + text = self.replaceDeprecatedTemplates(text) text = self.resolveHtmlEntities(text) text = self.validXhtml(text) text = self.removeUselessSpaces(text) text = self.removeNonBreakingSpaceBeforePercent(text) text = self.fixSyntaxSave(text) text = self.fixHtml(text) + text = self.fixStyle(text) + text = self.fixTypo(text) + text = self.fixArabicLetters(text) try: text = isbn.hyphenateIsbnNumbers(text) except isbn.InvalidIsbnException, error: @@ -210,21 +277,13 @@ Interwiki links to the site itself are displayed like local links. Remove their language code prefix. """ - interwikiR = re.compile(r'[[%s\s?:([^[]\n]*)]]' % self.site.lang) - text = interwikiR.sub(r'[[\1]]', text) - return text - - def standardizeInterwiki(self, text): - """ - Makes sure that interwiki links are put to the correct position and - into the right order. - """ if not self.talkpage and pywikibot.calledModuleName() <> 'interwiki': - interwikiLinks = pywikibot.getLanguageLinks(text, insite = self.site) - text = pywikibot.replaceLanguageLinks(text, interwikiLinks, site = self.site, template = self.template) + interwikiR = re.compile(r'[[%s\s?:([^[]\n]*)]]' % self.site.lang) + text = interwikiR.sub(r'[[\1]]', text) return text
- def standardizeCategories(self, text): + + def standardizePageFooter(self, text): """ Makes sure that categories are put to the correct position, but does not sort them. @@ -256,7 +315,12 @@ continue namespaces = list(family.namespace(self.site.lang, nsNumber, all = True)) thisNs = namespaces.pop(0) - + if nsNumber == 6 and family.name == 'wikipedia' and \ + self.site.lang in ('en', 'fr'): + # do not change "Image" on en-wiki and fr-wiki + for image in [u'Image', u'image']: + if image in namespaces: + namespaces.remove(image) # skip main (article) namespace if thisNs and namespaces: text = pywikibot.replaceExcept(text, r'[[\s*(' + '|'.join(namespaces) + ') *:(?P<nameAndLabel>.*?)]]', r'[[' + thisNs + ':\g<nameAndLabel>]]', exceptions) @@ -435,10 +499,26 @@ and French Wikipedia. It might be that it is not wanted on other wikis. If there are any complaints, please file a bug report. """ - if not self.redirect: - text = pywikibot.replaceExcept(text, r'(?m)^(?P<bullet>[:;]*(*+|#+)[:;*#]*)(?P<char>[^\s*#:;].+?)', '\g<bullet> \g<char>', ['comment', 'math', 'nowiki', 'pre']) + exceptions = ['comment', 'math', 'nowiki', 'pre', 'source', 'timeline'] + if not self.redirect and pywikibot.calledModuleName() <> 'capitalize_redirects': + text = pywikibot.replaceExcept(text, r'(?m)^(?P<bullet>[:;]*(*+|#+)[:;*#]*)(?P<char>[^\s*#:;].+?)', '\g<bullet> \g<char>', exceptions) return text
+ def replaceDeprecatedTemplates(self, text): + exceptions = ['comment', 'math', 'nowiki', 'pre'] + if self.site.family.name in deprecatedTemplates and self.site.lang in deprecatedTemplates[self.site.family.name]: + for template in deprecatedTemplates[self.site.family.name][self.site.lang]: + old = template[0] + new = template[1] + if new == None: + new = '' + else: + new = '{{'+new+'}}' + if not self.site.nocapitalize: + old = '[' + old[0].upper() + old[0].lower() + ']' + old[1:] + text = pywikibot.replaceExcept(text, r'{{([mM][sS][gG]:)?' + old + '(?P<parameters>|[^}]+|)}}', new, exceptions) + return text + #from fixes.py def fixSyntaxSave(self, text): exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace'] @@ -469,11 +549,131 @@ # horizontal line with attributes; can't be done with wiki syntax # so we only make it XHTML compliant text = pywikibot.replaceExcept(text, r'(?i)<hr ([^>/]+?)>', r'<hr \1 />', exceptions) + # a header where only spaces are in the same line + for level in range(1, 7): + equals = '\1%s \2 %s\3' % ("="*level, "="*level) + text = pywikibot.replaceExcept(text, + r'(?i)([\r\n]) *<h%d> *([^<]+?) *</h%d> *([\r\n])'%(level, level), + r'%s'%equals, exceptions) + #remove empty <ref/>-tag + text = pywikibot.replaceExcept(text, r'(?i)<ref\s*/>', r'', exceptions) # TODO: maybe we can make the bot replace <p> tags with \r\n's. return text
+ def fixStyle(self, text): + exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace'] + # convert prettytable to wikitable class + if self.site.language in ('de', 'en'): + text = pywikibot.replaceExcept(text, ur'(class="[^"]*)prettytable([^"]*")', ur'\1wikitable\2', exceptions) + return text + + def fixTypo(self, text): + exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace', 'gallery', 'hyperlink', 'interwiki', 'link'] + # change <number> ccm -> <number> cm³ + text = pywikibot.replaceExcept(text, ur'(\d)\s* ccm', ur'\1 cm³', exceptions) + text = pywikibot.replaceExcept(text, ur'(\d)\s*ccm', ur'\1 cm³', exceptions) + # Solve wrong Nº sign with °C or °F + # additional exception requested on fr-wiki for this stuff + pattern = re.compile(u'«.*?»', re.UNICODE) + exceptions.append(pattern) + text = pywikibot.replaceExcept(text, ur'(\d)\s* [º°]([CF])', ur'\1 °\2', exceptions) + text = pywikibot.replaceExcept(text, ur'(\d)\s*[º°]([CF])', ur'\1 °\2', exceptions) + text = pywikibot.replaceExcept(text, ur'º([CF])', ur'°\1', exceptions) + return text + + def fixArabicLetters(self, text): + if self.site.lang=='ckb': + exceptions = [ + 'gallery', + 'hyperlink', + 'interwiki', + # but changes letters inside wikilinks + #'link', + 'math', + 'pre', + 'template', + 'timeline', + 'ref', + 'source', + 'startspace', + 'inputbox', + ] + # do not change inside file links + namespaces = list(self.site.namespace(6, all = True)) + pattern = re.compile(u'[[(' + '|'.join(namespaces) + '):.+?..+?]]', re.UNICODE) + exceptions.append(pattern) + text = pywikibot.replaceExcept(text, u',', u'،', exceptions) + text = pywikibot.replaceExcept(text, ur'ه([.،_<]\s])', ur'ە\1', exceptions) + text = pywikibot.replaceExcept(text, u'ه', u'ە', exceptions) + text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions) + text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions) + text = pywikibot.replaceExcept(text, ur'[ىي]', u'ی', exceptions) + # replace persian digits 
+ for i in range(0,10): + text = pywikibot.replaceExcept(text, u'۰۱۲۳۴۵۶۷۸۹'[i], u'٠١٢٣٤٥٦٧٨٩'[i], exceptions) + # do not change digits in class, style and table params + pattern = re.compile(u'=".*?"', re.UNICODE) + exceptions.append(pattern) + # do not change digits inside html-tags + pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE) + exceptions.append(pattern) + for i in range(0,10): + text = pywikibot.replaceExcept(text, str(i), u'٠١٢٣٤٥٦٧٨٩'[i], exceptions) + return text + + # Retrieved from "http://commons.wikimedia.org/wiki/Commons:Tools/pywiki_file_description_clea..." + def commonsfiledesc(self, text): + # section headers to {{int:}} versions + exceptions = ['comment', 'includeonly', 'math', 'noinclude', 'nowiki', + 'pre', 'source', 'ref', 'timeline'] + text = pywikibot.replaceExcept(text, + r"([\r\n]|^)== *Summary *==", + r"\1== {{int:filedesc}} ==", + exceptions, True) + text = pywikibot.replaceExcept( + text, + r"([\r\n])== *[[Commons:Copyright tags|Licensing]]: *==", + r"\1== {{int:license}} ==", exceptions, True) + text = pywikibot.replaceExcept( + text, + r"([\r\n])== *(Licensing|License information|{{int:license-header}}) *==", + r"\1== {{int:license}} ==", exceptions, True) + + # frequent field values to {{int:}} versions + text = pywikibot.replaceExcept( + text, + r'([\r\n]|[Ss]ource *= *)(?:[Oo]wn work by uploader|[Oo]wn work|[Ee]igene [Aa]rbeit) *([\r\n])', + r'\1{{own}}\2', exceptions, True) + text = pywikibot.replaceExcept( + text, + r'(| *Permission *=) *(?:[Ss]ee below|[Ss]iehe unten) *([\r\n])', + r'\1\2', exceptions, True) + + # added to transwikied pages + text = pywikibot.replaceExcept(text, r'__NOTOC__', '', exceptions, True) + + # tracker element for js upload form + text = pywikibot.replaceExcept( + text, + r'<!-- *{{ImageUpload\|(?:full|basic)}} *-->', + '', exceptions[1:], True) + text = pywikibot.replaceExcept(text, r'{{ImageUpload|(?:basic|full)}}', + '', exceptions, True) + + # duplicated section headers + text = 
pywikibot.replaceExcept( + text, + r'([\r\n]|^)== *{{int:filedesc}} *==(?:[\r\n ]*)== *{{int:filedesc}} *==', + r'\1== {{int:filedesc}} ==', exceptions, True) + text = pywikibot.replaceExcept( + text, + r'([\r\n]|^)== *{{int:license}} *==(?:[\r\n ]*)== *{{int:license}} *==', + r'\1== {{int:license}} ==', exceptions, True) + return text + class CosmeticChangesBot: - def __init__(self, generator, acceptall = False, comment=u'Robot: Cosmetic changes'): + def __init__(self, generator, acceptall = False, + comment=u'Robot: Cosmetic changes'): self.generator = generator self.acceptall = acceptall self.comment = comment @@ -483,13 +683,17 @@ try: # Show the title of the page we're working on. # Highlight the title in purple. - pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title()) - ccToolkit = CosmeticChangesToolkit(page.site, debug = True, namespace = page.namespace()) + pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" + % page.title()) + ccToolkit = CosmeticChangesToolkit(page.site, debug=True, + namespace=page.namespace(), + pageTitle=page.title()) changedText = ccToolkit.change(page.get()) if changedText.strip() != page.get().strip(): if not self.acceptall: - choice = pywikibot.inputChoice(u'Do you want to accept these changes?', - ['Yes', 'No', 'All', 'Quit'], ['y', 'N', 'a', 'q'], 'N') + choice = pywikibot.inputChoice( + u'Do you want to accept these changes?', + ['Yes', 'No', 'All', 'Quit'], ['y', 'N', 'a', 'q'], 'N') if choice == 'a': self.acceptall = True elif choice == 'q': @@ -498,15 +702,19 @@ if self.acceptall or choice == 'y': page.put(changedText, comment=self.comment) else: - pywikibot.output('No changes were necessary in %s' % page.title()) + pywikibot.output('No changes were necessary in %s' + % page.title()) except pywikibot.NoPage: - pywikibot.output("Page %s does not exist?!" % page.aslink()) + pywikibot.output("Page %s does not exist?!" 
+ % page.title(asLink=True)) except pywikibot.IsRedirectPage: - pywikibot.output("Page %s is a redirect; skipping." % page.aslink()) + pywikibot.output("Page %s is a redirect; skipping." + % page.title(asLink=True)) except pywikibot.LockedPage: - pywikibot.output("Page %s is locked?!" % page.aslink()) + pywikibot.output("Page %s is locked?!" % page.title(asLink=True)) except pywikibot.EditConflict: - pywikibot.output("An edit conflict has occured at %s." % page.aslink()) + pywikibot.output("An edit conflict has occured at %s." + % page.title(asLink=True))
def run(self): try: @@ -540,16 +748,6 @@ if editSummary == '': # Load default summary message. editSummary = pywikibot.translate(pywikibot.getSite(), msg_standalone) - - # Disabled this check. Although the point is still valid, there - # is now a warning and a prompt (see below). - #if pywikibot.getSite() == pywikibot.getSite('nl','wikipedia'): - #print "Deze bot is op WikipediaNL niet gewenst." - #print "Het toevoegen van cosmetic changes bij andere wijzigingen is toegestaan," - #print "maar cosmetic_changes als stand-alone bot niet." - #print "Zoek alstublieft een nuttig gebruik voor uw bot." - #sys.exit() - if pageTitle: site = pywikibot.getSite() gen = iter([pywikibot.Page(pywikibot.Link(t, site)) for t in pageTitle]) @@ -558,11 +756,14 @@ if not gen: pywikibot.showHelp() elif not always: - answer = pywikibot.inputChoice(warning + '\nDo you really want to continue?', ['yes', 'no'], ['y', 'N'], 'N') + answer = pywikibot.inputChoice( + warning + '\nDo you really want to continue?', + ['yes', 'no'], ['y', 'N'], 'N')
if answer == 'y': preloadingGen = pagegenerators.PreloadingGenerator(gen) - bot = CosmeticChangesBot(preloadingGen, acceptall=always, comment=editSummary) + bot = CosmeticChangesBot(preloadingGen, acceptall=always, + comment=editSummary) bot.run()
if __name__ == "__main__":
pywikipedia-svn@lists.wikimedia.org