http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10288
Revision: 10288
Author:   xqt
Date:     2012-06-03 14:35:52 +0000 (Sun, 03 Jun 2012)

Log Message:
-----------
update from trunk; some functions are disabled because of the framework changes
Modified Paths:
--------------
    branches/rewrite/scripts/cosmetic_changes.py
Modified: branches/rewrite/scripts/cosmetic_changes.py
===================================================================
--- branches/rewrite/scripts/cosmetic_changes.py	2012-06-03 14:05:32 UTC (rev 10287)
+++ branches/rewrite/scripts/cosmetic_changes.py	2012-06-03 14:35:52 UTC (rev 10288)
@@ -33,7 +33,7 @@
 """
 #
 # (C) xqt, 2009-2011
-# (C) Pywikipedia bot team, 2006-2010
+# (C) Pywikipedia bot team, 2006-2012
 #
 # Distributed under the terms of the MIT license.
 #
@@ -46,10 +46,11 @@
 import sys
 import re
-warning = """ATTENTION: You can run this script as a stand-alone for testing purposes. -However, the changes are that are made are only minor, and other users +warning = """ +ATTENTION: You can run this script as a stand-alone for testing purposes. +However, the changes that are made are only minor, and other users might get angry if you fill the version histories and watchlists with such -irrelevant changes.""" +irrelevant changes. Some wikis prohibit stand-alone running."""
 docuReplacements = {
     '&params;': pagegenerators.parameterHelp,

@@ -110,12 +111,14 @@
             (u'Belege', u'Belege fehlen\g<parameters>'),
             (u'Quelle', u'Belege fehlen\g<parameters>'),
             (u'Quellen', u'Belege fehlen\g<parameters>'),
+            (u'Quellen fehlen', u'Belege fehlen\g<parameters>'),
         ],
     }
 }
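For illustration, the new 'Quellen fehlen' entry makes replaceDeprecatedTemplates() rewrite that deprecated template name to 'Belege fehlen' while keeping its parameters. A minimal stand-alone sketch with plain re (the real code goes through pywikibot.replaceExcept, which also protects nowiki/comment regions; the sample text and the parameter pattern are invented for this sketch):

    import re
    sample = u'Intro {{Quellen fehlen|Grund=unbelegt}} Schluss'
    # <parameters> always participates (possibly empty), so \g<parameters>
    # is safe even for a parameterless template
    fixed = re.sub(r'\{\{Quellen fehlen(?P<parameters>(\|[^}]*)?)\}\}',
                   r'{{Belege fehlen\g<parameters>}}', sample)
    print(fixed)  # Intro {{Belege fehlen|Grund=unbelegt}} Schluss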
 class CosmeticChangesToolkit:
-    def __init__(self, site, debug=False, redirect=False, namespace=None, pageTitle=None):
+    def __init__(self, site, debug=False, redirect=False, namespace=None,
+                 pageTitle=None):
         self.site = site
         self.debug = debug
         self.redirect = redirect
@@ -133,20 +136,24 @@
         text = self.commonsfiledesc(text)
         text = self.fixSelfInterwiki(text)
         text = self.standardizePageFooter(text)
+        text = self.fixSyntaxSave(text)
         text = self.cleanUpLinks(text)
         text = self.cleanUpSectionHeaders(text)
         text = self.putSpacesInLists(text)
-        text = self.translateAndCapitalizeNamespaces(text)
+##        text = self.translateAndCapitalizeNamespaces(text) ##excluded since family.namespaces does not exist anymore
+##        text = self.translateMagicWords(text)
         text = self.replaceDeprecatedTemplates(text)
         text = self.resolveHtmlEntities(text)
         text = self.validXhtml(text)
         text = self.removeUselessSpaces(text)
         text = self.removeNonBreakingSpaceBeforePercent(text)
-        text = self.fixSyntaxSave(text)
+        text = self.fixHtml(text)
+        text = self.fixReferences(text)
         text = self.fixStyle(text)
         text = self.fixTypo(text)
-        text = self.fixArabicLetters(text)
+        if self.site.lang in ['ckb', 'fa']:
+            text = self.fixArabicLetters(text)
         try:
             text = isbn.hyphenateIsbnNumbers(text)
         except isbn.InvalidIsbnException, error:
@@ -191,19 +198,46 @@
             if not family.isDefinedNSLanguage(nsNumber, self.site.lang):
                 # Skip undefined namespaces
                 continue
-            namespaces = list(family.namespace(self.site.lang, nsNumber, all=True))
+            if nsNumber in (2, 3):
+                # Skip user namespace, maybe gender is used
+                continue
+            namespaces = list(self.site.namespace(nsNumber, all=True))
             thisNs = namespaces.pop(0)
-            if nsNumber == 6 and family.name == 'wikipedia' and \
-               self.site.lang in ('en', 'fr'):
-                # do not change "Image" on en-wiki and fr-wiki
-                for image in [u'Image', u'image']:
-                    if image in namespaces:
-                        namespaces.remove(image)
+            if nsNumber == 6 and family.name == 'wikipedia':
+                if self.site.lang in ('en', 'fr'):
+                    # do not change "Image" on en-wiki and fr-wiki
+                    for image in [u'Image', u'image']:
+                        if image in namespaces:
+                            namespaces.remove(image)
+                elif self.site.lang == 'pt':
+                    # bug #3346901 should be implemented
+                    continue

             # skip main (article) namespace
             if thisNs and namespaces:
-                text = pywikibot.replaceExcept(text, r'\[\[\s*(' + '|'.join(namespaces) + ') *:(?P<nameAndLabel>.*?)\]\]', r'[[' + thisNs + ':\g<nameAndLabel>]]', exceptions)
+                text = pywikibot.replaceExcept(
+                    text,
+                    r'\[\[\s*(' + '|'.join(namespaces) + \
+                    ') *:(?P<nameAndLabel>.*?)\]\]', r'[[' + thisNs + \
+                    ':\g<nameAndLabel>]]', exceptions)
         return text
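The namespace replacement above rewrites any listed alias to the canonical local name. A stand-alone sketch with plain re, so exception regions are not honored; the canonical name and alias list below are invented, the real values come from self.site.namespace():

    import re

    thisNs = u'Datei'                          # assumed canonical name (de-wiki)
    namespaces = [u'File', u'Image', u'Bild']  # assumed remaining aliases
    text = u'[[Image:Foo.jpg|thumb]] und [[Bild:Bar.png]]'
    text = re.sub(r'\[\[\s*(' + '|'.join(namespaces) +
                  r') *:(?P<nameAndLabel>.*?)\]\]',
                  r'[[' + thisNs + r':\g<nameAndLabel>]]', text)
    print(text)  # [[Datei:Foo.jpg|thumb]] und [[Datei:Bar.png]]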
+    def translateMagicWords(self, text):
+        """
+        Makes sure that localized magic words are used.
+        """
+        # not wanted at ru
+        # arz uses english stylish codes
+        if self.site.lang not in ['arz', 'ru']:
+            exceptions = ['nowiki', 'comment', 'math', 'pre']
+            for magicWord in ['img_thumbnail', 'img_left', 'img_center', 'img_right', 'img_none',
+                              'img_framed', 'img_frameless', 'img_border', 'img_upright', ]:
+                aliases = self.site.siteinfo('magicwords').get(magicWord)
+                if not aliases: continue
+                text = pywikibot.replaceExcept(
+                    text,
+                    r'\[\[(?P<left>.+?:.+?\..+?\|) *(' + '|'.join(aliases) + ') *(?P<right>(\|.*?)?\]\])',
+                    r'[[\g<left>' + aliases[0] + '\g<right>',
+                    exceptions)
+        return text
+
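A stand-alone sketch of this magic-word localization: every alias of an image parameter is rewritten to the first (preferred) alias. The alias list below is made up; the real one comes from self.site.siteinfo('magicwords'):

    import re

    # assumed aliases for img_thumbnail on a German-language wiki
    aliases = [u'mini', u'miniatur', u'thumb', u'thumbnail']
    text = u'[[Datei:Foo.jpg|thumbnail|Beschreibung]]'
    text = re.sub(r'\[\[(?P<left>.+?:.+?\..+?\|) *(' + '|'.join(aliases) +
                  r') *(?P<right>(\|.*?)?\]\])',
                  r'[[\g<left>' + aliases[0] + r'\g<right>', text)
    print(text)  # [[Datei:Foo.jpg|mini|Beschreibung]]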
     def cleanUpLinks(self, text):
         # helper function which works on one link and either returns it
         # unmodified, or returns a replacement.

@@ -238,17 +272,20 @@
             if not trailingChars:
                 titleLength = len(titleWithSection)
                 titleWithSection = titleWithSection.rstrip()
-                hadTrailingSpaces = (len(titleWithSection) != titleLength)
+                hadTrailingSpaces = (len(titleWithSection) !=
+                                     titleLength)

             # Convert URL-encoded characters to unicode
-            titleWithSection = pywikibot.url2unicode(titleWithSection, site = self.site)
+            titleWithSection = pywikibot.url2unicode(titleWithSection,
+                                                     site=self.site)
             if titleWithSection == '':
                 # just skip empty links.
                 return match.group()
             # Remove unnecessary initial and final spaces from label.
-            # Please note that some editors prefer spaces around pipes. (See [[en:Wikipedia:Semi-bots]]). We remove them anyway.
+            # Please note that some editors prefer spaces around pipes.
+            # (See [[en:Wikipedia:Semi-bots]]). We remove them anyway.
             if label is not None:
                 # Remove unnecessary leading spaces from label,
                 # but remember if we did this because we want
@@ -268,18 +305,26 @@
                 if trailingChars:
                     label += trailingChars
-            if titleWithSection == label or titleWithSection[0].lower() + titleWithSection[1:] == label:
+            if titleWithSection == label or \
+               titleWithSection[0].lower() + \
+               titleWithSection[1:] == label:
                 newLink = "[[%s]]" % label
-            # Check if we can create a link with trailing characters instead of a pipelink
-            elif len(titleWithSection) <= len(label) and label[:len(titleWithSection)] == titleWithSection and re.sub(trailR, '', label[len(titleWithSection):]) == '':
-                newLink = "[[%s]]%s" % (label[:len(titleWithSection)], label[len(titleWithSection):])
+            # Check if we can create a link with trailing characters
+            # instead of a pipelink
+            elif len(titleWithSection) <= len(label) and \
+                 label[:len(titleWithSection)] == titleWithSection and \
+                 re.sub(trailR, '',
+                        label[len(titleWithSection):]) == '':
+                newLink = "[[%s]]%s" % (label[:len(titleWithSection)],
+                                        label[len(titleWithSection):])
             else:
                 # Try to capitalize the first letter of the title.
                 # Maybe this feature is not useful for languages that
                 # don't capitalize nouns...
                 #if not self.site.nocapitalize:
                 if self.site.sitename() == 'wikipedia:de':
-                    titleWithSection = titleWithSection[0].upper() + titleWithSection[1:]
+                    titleWithSection = titleWithSection[0].upper() + \
+                                       titleWithSection[1:]
                 newLink = "[[%s|%s]]" % (titleWithSection, label)
             # re-add spaces that were pulled out of the link.
             # Examples:

@@ -298,15 +343,20 @@
             return match.group()
         trailR = re.compile(self.site.linktrail())
-        # The regular expression which finds links. Results consist of four groups:
-        # group title is the target page title, that is, everything before | or ].
-        # group section is the page section. It'll include the # to make life easier for us.
-        # group label is the alternative link title, that's everything between | and ].
-        # group linktrail is the link trail, that's letters after ]] which are part of the word.
-        # note that the definition of 'letter' varies from language to language.
-        linkR = re.compile(r'(?P<newline>[\n]*)\[\[(?P<titleWithSection>[^\]\|]+)(\|(?P<label>[^\]\|]*))?\]\](?P<linktrail>' + self.site.linktrail() + ')')
+        # The regular expression which finds links. Results consist of four groups:
+        # group <newline> depends on whether the link starts with a new line.
+        # group <titleWithSection> is the page title and section, that is,
+        # everything before | or ]. It'll include the # to make life easier for us.
+        # group <label> is the alternative link title between | and ].
+        # group <linktrail> is the link trail, i.e. the letters after ]] that are part of the word.
+        # note that the definition of 'letter' varies from language to language.
+        linkR = re.compile(
+            r'(?P<newline>[\n]*)\[\[(?P<titleWithSection>[^\]\|]+)(\|(?P<label>[^\]\|]*))?\]\](?P<linktrail>' + \
+            self.site.linktrail() + ')')
-        text = pywikibot.replaceExcept(text, linkR, handleOneLink, ['comment', 'math', 'nowiki', 'pre', 'startspace'])
+        text = pywikibot.replaceExcept(text, linkR, handleOneLink,
+                                       ['comment', 'math', 'nowiki', 'pre',
+                                        'startspace'])
         return text
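To see what linkR captures, here is a quick stand-alone probe of the four named groups, with [a-z]* assumed as the value of self.site.linktrail() (the English link trail):

    import re

    linkR = re.compile(r'(?P<newline>[\n]*)\[\[(?P<titleWithSection>[^\]\|]+)'
                       r'(\|(?P<label>[^\]\|]*))?\]\](?P<linktrail>[a-z]*)')
    m = linkR.search(u'See [[Train station#History|station]]s in town.')
    print(m.group('titleWithSection'))  # Train station#History
    print(m.group('label'))             # station
    print(m.group('linktrail'))         # s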
     def resolveHtmlEntities(self, text):

@@ -320,6 +370,8 @@
             124,  # Vertical bar (??) - used intentionally in navigation bar templates on de:
             160,  # Non-breaking space (&nbsp;) - not supported by Firefox textareas
             173,  # Soft-hyphen (&shy;) - enable editing
+            8206, # left-to-right mark (&ltr;)
+            8207, # right-to-left mark (&rtl;)
         ]
         # ignore &#39; see http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=23...
         #if self.site.lang == 'eo':
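A stand-alone sketch of entity resolution with an ignore list, in the script's Python 2 style (htmlentitydefs became html.entities in Python 3). The ignore list below only includes the codepoints visible in this hunk, which is an assumption about the full list:

    import re
    import htmlentitydefs          # html.entities on Python 3

    ignore = [124, 160, 173, 8206, 8207]   # assumed ignore list

    def decode_entity(match):
        code = htmlentitydefs.name2codepoint.get(match.group(1))
        if code is None or code in ignore:
            return match.group(0)  # unknown or ignored: keep as written
        return unichr(code)        # chr() on Python 3

    sample = u'Tom &ndash; Jerry, 100&nbsp;%'
    print(re.sub(r'&(\w+);', decode_entity, sample))
    # -> the dash is decoded, &nbsp; stays as an entity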
@@ -330,7 +382,8 @@
         return text

     def validXhtml(self, text):
-        text = pywikibot.replaceExcept(text, r'(?i)<br[ /]*>', r'<br />', ['comment', 'math', 'nowiki', 'pre'])
+        text = pywikibot.replaceExcept(text, r'(?i)<br[ /]*>', r'<br />',
+                                       ['comment', 'math', 'nowiki', 'pre'])
         return text
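The same <br> normalization with plain re, for reference (replaceExcept additionally skips the listed tag regions):

    import re
    print(re.sub(r'(?i)<br[ /]*>', r'<br />', u'a<BR>b<br/>c<br >d'))
    # -> a<br />b<br />c<br />d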
     def removeUselessSpaces(self, text):

@@ -405,19 +458,35 @@
     #from fixes.py
     def fixSyntaxSave(self, text):
-        exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace']
+        exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
+                      'startspace']
+        # link to the wiki working on
+        ## TODO: disable this for difflinks and titled links
+        ## http://de.wikipedia.org/w/index.php?title=Wikipedia%3aVandalismusmeldung&...
+##        text = pywikibot.replaceExcept(text,
+##                                       r'\[https?://%s\.%s\.org/wiki/(?P<link>\S+)\s+(?P<title>.+?)\s?\]'
+##                                       % (self.site.lang, self.site.family.name),
+##                                       r'[[\g<link>|\g<title>]]', exceptions)
         # external link in double brackets
-        text = pywikibot.replaceExcept(text, r'\[\[(?P<url>https?://[^\]]+?)\]\]', r'[\g<url>]', exceptions)
+        text = pywikibot.replaceExcept(text,
+                                       r'\[\[(?P<url>https?://[^\]]+?)\]\]',
+                                       r'[\g<url>]', exceptions)
         # external link starting with double bracket
-        text = pywikibot.replaceExcept(text, r'\[\[(?P<url>https?://.+?)\]', r'[\g<url>]', exceptions)
+        text = pywikibot.replaceExcept(text,
+                                       r'\[\[(?P<url>https?://.+?)\]',
+                                       r'[\g<url>]', exceptions)
         # external link and description separated by a dash, with
         # whitespace in front of the dash, so that it is clear that
         # the dash is not a legitimate part of the URL.
-        text = pywikibot.replaceExcept(text, r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]', r'[\g<url> \g<label>]', exceptions)
+        text = pywikibot.replaceExcept(text,
+                                       r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]',
+                                       r'[\g<url> \g<label>]', exceptions)
         # dash in external link, where the correct end of the URL can
         # be detected from the file extension. It is very unlikely that
         # this will cause mistakes.
-        text = pywikibot.replaceExcept(text, r'\[(?P<url>https?://[^\|\] ]+?(\.pdf|\.html|\.htm|\.php|\.asp|\.aspx|\.jsp)) *\| *(?P<label>[^\|\]]+?)\]', r'[\g<url> \g<label>]', exceptions)
+        text = pywikibot.replaceExcept(text,
+                                       r'\[(?P<url>https?://[^\|\] ]+?(\.pdf|\.html|\.htm|\.php|\.asp|\.aspx|\.jsp)) *\| *(?P<label>[^\|\]]+?)\]',
+                                       r'[\g<url> \g<label>]', exceptions)
         return text
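The first two bracket fixes, demonstrated stand-alone on an invented sample:

    import re

    text = u'[[http://example.org]] and [[http://example.org broken]'
    # external link in double brackets
    text = re.sub(r'\[\[(?P<url>https?://[^\]]+?)\]\]', r'[\g<url>]', text)
    # external link starting with double bracket
    text = re.sub(r'\[\[(?P<url>https?://.+?)\]', r'[\g<url>]', text)
    print(text)  # [http://example.org] and [http://example.org broken]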
     def fixHtml(self, text):

@@ -439,11 +508,20 @@
             text = pywikibot.replaceExcept(text, r'(?i)([\r\n]) *<h%d> *([^<]+?) *</h%d> *([\r\n])'%(level, level), r'%s'%equals, exceptions)
-        #remove empty <ref/>-tag
-        text = pywikibot.replaceExcept(text, r'(?i)<ref\s*/>', r'', exceptions)
         # TODO: maybe we can make the bot replace <p> tags with \r\n's.
         return text
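What the heading loop does, sketched for level 2 with an assumed replacement string (the commit's equals value is built outside this excerpt):

    import re
    sample = u'\n<h2> History </h2>\n'
    print(re.sub(r'(?i)([\r\n]) *<h2> *([^<]+?) *</h2> *([\r\n])',
                 r'\1== \2 ==\3', sample))
    # -> '\n== History ==\n'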
+    def fixReferences(self, text):
+        #http://en.wikipedia.org/wiki/User:AnomieBOT/source/tasks/OrphanReferenceFixe...
+        exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace']
+
+        # it should be name = " or name=" NOT name ="
+        text = re.sub(r'(?i)<ref +name(= *| *=)"', r'<ref name="', text)
+        #remove empty <ref/>-tag
+        text = pywikibot.replaceExcept(text, r'(?i)(<ref\s*/>|<ref *>\s*</ref>)', r'', exceptions)
+        text = pywikibot.replaceExcept(text, r'(?i)<ref\s+([^>]+?)\s*>\s*</ref>', r'<ref \1/>', exceptions)
+        return text
+
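The three reference clean-ups, demonstrated with plain re on an invented sample (replaceExcept would additionally protect nowiki/pre/source regions):

    import re

    text = u'a<ref name ="x">y</ref> b<ref/> c<ref name="z"></ref>'
    # normalize spacing in <ref name=...>
    text = re.sub(r'(?i)<ref +name(= *| *=)"', r'<ref name="', text)
    # remove empty <ref/> and <ref></ref> tags
    text = re.sub(r'(?i)(<ref\s*/>|<ref *>\s*</ref>)', r'', text)
    # collapse an empty named ref into a self-closing tag
    text = re.sub(r'(?i)<ref\s+([^>]+?)\s*>\s*</ref>', r'<ref \1/>', text)
    print(text)  # a<ref name="x">y</ref> b c<ref name="z"/>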
     def fixStyle(self, text):
         exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace']
         # convert prettytable to wikitable class

@@ -466,60 +544,55 @@
         return text

     def fixArabicLetters(self, text):
-        if self.site.lang=='ckb' or self.site.lang=='fa':
-            exceptions = [
-                'gallery',
-                'hyperlink',
-                'interwiki',
-                # but changes letters inside wikilinks
-                #'link',
-                'math',
-                'pre',
-                'template',
-                'timeline',
-                'ref',
-                'source',
-                'startspace',
-                'inputbox',
-            ]
-            # do not change inside file links
-            namespaces = list(self.site.namespace(6, all = True))
-            pattern = re.compile(u'\[\[(' + '|'.join(namespaces) + '):.+?\..+?\]\]',
-                                 re.UNICODE)
-            exceptions.append(pattern)
-            text = pywikibot.replaceExcept(text, u',', u'،', exceptions)
-            if self.site.lang=='ckb':
-                text = pywikibot.replaceExcept(text,
-                                               ur'ه([.،_<\]\s])',
-                                               ur'ە\1', exceptions)
-                text = pywikibot.replaceExcept(text, u'ه', u'ە', exceptions)
-                text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions)
-            text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions)
-            text = pywikibot.replaceExcept(text, ur'[ىي]', u'ی', exceptions)
-            # replace persian digits
-            for i in range(0,10):
-                if self.site.lang=='ckb':
-                    text = pywikibot.replaceExcept(text,
-                                                   u'۰۱۲۳۴۵۶۷۸۹'[i],
-                                                   u'٠١٢٣٤٥٦٧٨٩'[i], exceptions)
-                else:
-                    text = pywikibot.replaceExcept(text,
-                                                   u'٠١٢٣٤٥٦٧٨٩'[i],
-                                                   u'۰۱۲۳۴۵۶۷۸۹'[i], exceptions)
-            # do not change digits in class, style and table params
-            pattern = re.compile(u'=".*?"', re.UNICODE)
-            exceptions.append(pattern)
-            # do not change digits inside html-tags
-            pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE)
-            exceptions.append(pattern)
-            exceptions.append('table') #exclude tables for now
-            for i in range(0,10):
-                if self.site.lang=='ckb':
-                    text = pywikibot.replaceExcept(text, str(i),
-                                                   u'٠١٢٣٤٥٦٧٨٩'[i], exceptions)
-                else:
-                    text = pywikibot.replaceExcept(text, str(i),
-                                                   u'۰۱۲۳۴۵۶۷۸۹'[i], exceptions)
+        exceptions = [
+            'gallery',
+            'hyperlink',
+            'interwiki',
+            # but changes letters inside wikilinks
+            #'link',
+            'math',
+            'pre',
+            'template',
+            'timeline',
+            'ref',
+            'source',
+            'startspace',
+            'inputbox',
+        ]
+        # valid digits
+        digits = {
+            'ckb' : u'٠١٢٣٤٥٦٧٨٩',
+            'fa'  : u'۰۱۲۳۴۵۶۷۸۹'
+        }
+        new = digits.pop(self.site.lang)
+        # This only works if there are only two items in digits dict
+        old = digits[digits.keys()[0]]
+        # do not change inside file links
+        namespaces = list(self.site.namespace(6, all = True))
+        pattern = re.compile(u'\[\[(' + '|'.join(namespaces) + '):.+?\..+?\]\]',
+                             re.UNICODE)
+        exceptions.append(pattern)
+        text = pywikibot.replaceExcept(text, u',', u'،', exceptions)
+        if self.site.lang=='ckb':
+            text = pywikibot.replaceExcept(text,
+                                           ur'ه([.،_<\]\s])',
+                                           ur'ە\1', exceptions)
+            text = pywikibot.replaceExcept(text, u'ه', u'ە', exceptions)
+            text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions)
+        text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions)
+        text = pywikibot.replaceExcept(text, ur'[ىي]', u'ی', exceptions)
+        # replace persian digits
+        for i in range(0,10):
+            text = pywikibot.replaceExcept(text, old[i], new[i], exceptions)
+        # do not change digits in class, style and table params
+        pattern = re.compile(u'\w+=(".+?"|\d+)', re.UNICODE)
+        exceptions.append(pattern)
+        # do not change digits inside html-tags
+        pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE)
+        exceptions.append(pattern)
+        exceptions.append('table') #exclude tables for now
+        for i in range(0,10):
+            text = pywikibot.replaceExcept(text, str(i), new[i], exceptions)
         return text
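The digit-table trick above, isolated: pop() removes the current language's own digits, and the single entry left in the dict supplies the digits to convert. A sketch for an assumed 'fa' site (list() around keys() keeps it runnable on Python 3 as well):

    digits = {
        'ckb': u'٠١٢٣٤٥٦٧٨٩',    # Arabic-Indic digits
        'fa':  u'۰۱۲۳۴۵۶۷۸۹',    # Extended Arabic-Indic (Persian) digits
    }
    lang = 'fa'                          # assumed site language
    new = digits.pop(lang)               # digits wanted on this wiki
    old = digits[list(digits.keys())[0]] # the only entry left
    sample = u'١٢٣'                      # Arabic-Indic "123"
    for i in range(10):
        sample = sample.replace(old[i], new[i])
    print(sample)                        # -> ۱۲۳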
# Retrieved from "http://commons.wikimedia.org/wiki/Commons:Tools/pywiki_file_description_clea..."