http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10288
Revision: 10288
Author:   xqt
Date:     2012-06-03 14:35:52 +0000 (Sun, 03 Jun 2012)

Log Message:
-----------
update from trunk; some functions are disabled because of the framework changes
Modified Paths:
--------------
    branches/rewrite/scripts/cosmetic_changes.py
Modified: branches/rewrite/scripts/cosmetic_changes.py
===================================================================
--- branches/rewrite/scripts/cosmetic_changes.py	2012-06-03 14:05:32 UTC (rev 10287)
+++ branches/rewrite/scripts/cosmetic_changes.py	2012-06-03 14:35:52 UTC (rev 10288)
@@ -33,7 +33,7 @@
 """
 #
 # (C) xqt, 2009-2011
-# (C) Pywikipedia bot team, 2006-2010
+# (C) Pywikipedia bot team, 2006-2012
 #
 # Distributed under the terms of the MIT license.
 #
@@ -46,10 +46,11 @@
 import sys
 import re
-warning = """ATTENTION: You can run this script as a stand-alone for testing purposes. -However, the changes are that are made are only minor, and other users +warning = """ +ATTENTION: You can run this script as a stand-alone for testing purposes. +However, the changes that are made are only minor, and other users might get angry if you fill the version histories and watchlists with such -irrelevant changes.""" +irrelevant changes. Some wikis prohibit stand-alone running."""
 docuReplacements = {
     '&params;': pagegenerators.parameterHelp,

@@ -110,12 +111,14 @@
             (u'Belege', u'Belege fehlen\g<parameters>'),
             (u'Quelle', u'Belege fehlen\g<parameters>'),
             (u'Quellen', u'Belege fehlen\g<parameters>'),
+            (u'Quellen fehlen', u'Belege fehlen\g<parameters>'),
         ],
     }
 }
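For illustration, the new 'Quellen fehlen' entry makes replaceDeprecatedTemplates() rewrite that deprecated template name to 'Belege fehlen' while keeping its parameters. A minimal stand-alone sketch with plain re (the real code goes through pywikibot.replaceExcept, which also protects nowiki/comment regions; the sample text and the parameter pattern are invented for this sketch):

    import re
    sample = u'Intro {{Quellen fehlen|Grund=unbelegt}} Schluss'
    # <parameters> always participates (possibly empty), so \g<parameters>
    # is safe even for a parameterless template
    fixed = re.sub(r'\{\{Quellen fehlen(?P<parameters>(\|[^}]*)?)\}\}',
                   r'{{Belege fehlen\g<parameters>}}', sample)
    print(fixed)  # Intro {{Belege fehlen|Grund=unbelegt}} Schluss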
 class CosmeticChangesToolkit:
-    def __init__(self, site, debug=False, redirect=False, namespace=None, pageTitle=None):
+    def __init__(self, site, debug=False, redirect=False, namespace=None,
+                 pageTitle=None):
         self.site = site
         self.debug = debug
         self.redirect = redirect
@@ -133,20 +136,24 @@
         text = self.commonsfiledesc(text)
         text = self.fixSelfInterwiki(text)
         text = self.standardizePageFooter(text)
+        text = self.fixSyntaxSave(text)
         text = self.cleanUpLinks(text)
         text = self.cleanUpSectionHeaders(text)
         text = self.putSpacesInLists(text)
-        text = self.translateAndCapitalizeNamespaces(text)
+##        text = self.translateAndCapitalizeNamespaces(text) ##excluded since family.namespaces does not exist anymore
+##        text = self.translateMagicWords(text)
         text = self.replaceDeprecatedTemplates(text)
         text = self.resolveHtmlEntities(text)
         text = self.validXhtml(text)
         text = self.removeUselessSpaces(text)
         text = self.removeNonBreakingSpaceBeforePercent(text)
-        text = self.fixSyntaxSave(text)
+        text = self.fixHtml(text)
+        text = self.fixReferences(text)
         text = self.fixStyle(text)
         text = self.fixTypo(text)
-        text = self.fixArabicLetters(text)
+        if self.site.lang in ['ckb', 'fa']:
+            text = self.fixArabicLetters(text)
         try:
             text = isbn.hyphenateIsbnNumbers(text)
         except isbn.InvalidIsbnException, error:
@@ -191,19 +198,46 @@
             if not family.isDefinedNSLanguage(nsNumber, self.site.lang):
                 # Skip undefined namespaces
                 continue
-            namespaces = list(family.namespace(self.site.lang, nsNumber, all=True))
+            if nsNumber in (2, 3):
+                # Skip user namespace, maybe gender is used
+                continue
+            namespaces = list(self.site.namespace(nsNumber, all=True))
             thisNs = namespaces.pop(0)
-            if nsNumber == 6 and family.name == 'wikipedia' and \
-               self.site.lang in ('en', 'fr'):
-                # do not change "Image" on en-wiki and fr-wiki
-                for image in [u'Image', u'image']:
-                    if image in namespaces:
-                        namespaces.remove(image)
+            if nsNumber == 6 and family.name == 'wikipedia':
+                if self.site.lang in ('en', 'fr'):
+                    # do not change "Image" on en-wiki and fr-wiki
+                    for image in [u'Image', u'image']:
+                        if image in namespaces:
+                            namespaces.remove(image)
+                elif self.site.lang == 'pt':
+                    # bug #3346901 should be implemented
+                    continue

             # skip main (article) namespace
             if thisNs and namespaces:
-                text = pywikibot.replaceExcept(text, r'\[\[\s*(' + '|'.join(namespaces) + ') *:(?P<nameAndLabel>.*?)\]\]', r'[[' + thisNs + ':\g<nameAndLabel>]]', exceptions)
+                text = pywikibot.replaceExcept(
+                    text,
+                    r'\[\[\s*(' + '|'.join(namespaces) + \
+                    ') *:(?P<nameAndLabel>.*?)\]\]', r'[[' + thisNs + \
+                    ':\g<nameAndLabel>]]', exceptions)
         return text
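The namespace replacement above rewrites any listed alias to the canonical local name. A stand-alone sketch with plain re, so exception regions are not honored; the canonical name and alias list below are invented, the real values come from self.site.namespace():

    import re

    thisNs = u'Datei'                          # assumed canonical name (de-wiki)
    namespaces = [u'File', u'Image', u'Bild']  # assumed remaining aliases
    text = u'[[Image:Foo.jpg|thumb]] und [[Bild:Bar.png]]'
    text = re.sub(r'\[\[\s*(' + '|'.join(namespaces) +
                  r') *:(?P<nameAndLabel>.*?)\]\]',
                  r'[[' + thisNs + r':\g<nameAndLabel>]]', text)
    print(text)  # [[Datei:Foo.jpg|thumb]] und [[Datei:Bar.png]]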
+    def translateMagicWords(self, text):
+        """
+        Makes sure that localized magic words are used.
+        """
+        # not wanted at ru
+        # arz uses english stylish codes
+        if self.site.lang not in ['arz', 'ru']:
+            exceptions = ['nowiki', 'comment', 'math', 'pre']
+            for magicWord in ['img_thumbnail', 'img_left', 'img_center', 'img_right', 'img_none',
+                              'img_framed', 'img_frameless', 'img_border', 'img_upright', ]:
+                aliases = self.site.siteinfo('magicwords').get(magicWord)
+                if not aliases: continue
+                text = pywikibot.replaceExcept(
+                    text,
+                    r'\[\[(?P<left>.+?:.+?\..+?\|) *(' + '|'.join(aliases) + ') *(?P<right>(\|.*?)?\]\])',
+                    r'[[\g<left>' + aliases[0] + '\g<right>',
+                    exceptions)
+        return text
+
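A stand-alone sketch of this magic-word localization: every alias of an image parameter is rewritten to the first (preferred) alias. The alias list below is made up; the real one comes from self.site.siteinfo('magicwords'):

    import re

    # assumed aliases for img_thumbnail on a German-language wiki
    aliases = [u'mini', u'miniatur', u'thumb', u'thumbnail']
    text = u'[[Datei:Foo.jpg|thumbnail|Beschreibung]]'
    text = re.sub(r'\[\[(?P<left>.+?:.+?\..+?\|) *(' + '|'.join(aliases) +
                  r') *(?P<right>(\|.*?)?\]\])',
                  r'[[\g<left>' + aliases[0] + r'\g<right>', text)
    print(text)  # [[Datei:Foo.jpg|mini|Beschreibung]]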
     def cleanUpLinks(self, text):
         # helper function which works on one link and either returns it
         # unmodified, or returns a replacement.

@@ -238,17 +272,20 @@
             if not trailingChars:
                 titleLength = len(titleWithSection)
                 titleWithSection = titleWithSection.rstrip()
-                hadTrailingSpaces = (len(titleWithSection) != titleLength)
+                hadTrailingSpaces = (len(titleWithSection) !=
+                                     titleLength)

             # Convert URL-encoded characters to unicode
-            titleWithSection = pywikibot.url2unicode(titleWithSection, site = self.site)
+            titleWithSection = pywikibot.url2unicode(titleWithSection,
+                                                     site=self.site)
             if titleWithSection == '':
                 # just skip empty links.
                 return match.group()
             # Remove unnecessary initial and final spaces from label.
-            # Please note that some editors prefer spaces around pipes. (See [[en:Wikipedia:Semi-bots]]). We remove them anyway.
+            # Please note that some editors prefer spaces around pipes.
+            # (See [[en:Wikipedia:Semi-bots]]). We remove them anyway.
             if label is not None:
                 # Remove unnecessary leading spaces from label,
                 # but remember if we did this because we want
@@ -268,18 +305,26 @@
                 if trailingChars:
                     label += trailingChars
-            if titleWithSection == label or titleWithSection[0].lower() + titleWithSection[1:] == label:
+            if titleWithSection == label or \
+               titleWithSection[0].lower() + \
+               titleWithSection[1:] == label:
                 newLink = "[[%s]]" % label
-            # Check if we can create a link with trailing characters instead of a pipelink
-            elif len(titleWithSection) <= len(label) and label[:len(titleWithSection)] == titleWithSection and re.sub(trailR, '', label[len(titleWithSection):]) == '':
-                newLink = "[[%s]]%s" % (label[:len(titleWithSection)], label[len(titleWithSection):])
+            # Check if we can create a link with trailing characters
+            # instead of a pipelink
+            elif len(titleWithSection) <= len(label) and \
+                 label[:len(titleWithSection)] == titleWithSection and \
+                 re.sub(trailR, '',
+                        label[len(titleWithSection):]) == '':
+                newLink = "[[%s]]%s" % (label[:len(titleWithSection)],
+                                        label[len(titleWithSection):])
             else:
                 # Try to capitalize the first letter of the title.
                 # Maybe this feature is not useful for languages that
                 # don't capitalize nouns...
                 #if not self.site.nocapitalize:
                 if self.site.sitename() == 'wikipedia:de':
-                    titleWithSection = titleWithSection[0].upper() + titleWithSection[1:]
+                    titleWithSection = titleWithSection[0].upper() + \
+                                       titleWithSection[1:]
                 newLink = "[[%s|%s]]" % (titleWithSection, label)
             # re-add spaces that were pulled out of the link.
             # Examples:

@@ -298,15 +343,20 @@
             return match.group()
         trailR = re.compile(self.site.linktrail())
-        # The regular expression which finds links. Results consist of four groups:
-        # group title is the target page title, that is, everything before | or ].
-        # group section is the page section. It'll include the # to make life easier for us.
-        # group label is the alternative link title, that's everything between | and ].
-        # group linktrail is the link trail, that's letters after ]] which are part of the word.
-        # note that the definition of 'letter' varies from language to language.
-        linkR = re.compile(r'(?P<newline>[\n]*)\[\[(?P<titleWithSection>[^\]\|]+)(\|(?P<label>[^\]\|]*))?\]\](?P<linktrail>' + self.site.linktrail() + ')')
+        # The regular expression which finds links. Results consist of four groups:
+        # group <newline> depends on whether the link starts with a new line.
+        # group <titleWithSection> is the page title and section, that is,
+        # everything before | or ]. It'll include the # to make life easier for us.
+        # group <label> is the alternative link title between | and ].
+        # group <linktrail> is the link trail, i.e. the letters after ]] that are part of the word.
+        # note that the definition of 'letter' varies from language to language.
+        linkR = re.compile(
+            r'(?P<newline>[\n]*)\[\[(?P<titleWithSection>[^\]\|]+)(\|(?P<label>[^\]\|]*))?\]\](?P<linktrail>' + \
+            self.site.linktrail() + ')')
-        text = pywikibot.replaceExcept(text, linkR, handleOneLink, ['comment', 'math', 'nowiki', 'pre', 'startspace'])
+        text = pywikibot.replaceExcept(text, linkR, handleOneLink,
+                                       ['comment', 'math', 'nowiki', 'pre',
+                                        'startspace'])
         return text
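To see what linkR captures, here is a quick stand-alone probe of the four named groups, with [a-z]* assumed as the value of self.site.linktrail() (the English link trail):

    import re

    linkR = re.compile(r'(?P<newline>[\n]*)\[\[(?P<titleWithSection>[^\]\|]+)'
                       r'(\|(?P<label>[^\]\|]*))?\]\](?P<linktrail>[a-z]*)')
    m = linkR.search(u'See [[Train station#History|station]]s in town.')
    print(m.group('titleWithSection'))  # Train station#History
    print(m.group('label'))             # station
    print(m.group('linktrail'))         # s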
     def resolveHtmlEntities(self, text):

@@ -320,6 +370,8 @@
             124,  # Vertical bar (??) - used intentionally in navigation bar templates on de:
             160,  # Non-breaking space (&nbsp;) - not supported by Firefox textareas
             173,  # Soft-hyphen (&shy;) - enable editing
+            8206, # left-to-right mark (&ltr;)
+            8207, # right-to-left mark (&rtl;)
         ]
         # ignore &#39; see http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=23...
         #if self.site.lang == 'eo':
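A stand-alone sketch of entity resolution with an ignore list, in the script's Python 2 style (htmlentitydefs became html.entities in Python 3). The ignore list below only includes the codepoints visible in this hunk, which is an assumption about the full list:

    import re
    import htmlentitydefs          # html.entities on Python 3

    ignore = [124, 160, 173, 8206, 8207]   # assumed ignore list

    def decode_entity(match):
        code = htmlentitydefs.name2codepoint.get(match.group(1))
        if code is None or code in ignore:
            return match.group(0)  # unknown or ignored: keep as written
        return unichr(code)        # chr() on Python 3

    sample = u'Tom &ndash; Jerry, 100&nbsp;%'
    print(re.sub(r'&(\w+);', decode_entity, sample))
    # -> the dash is decoded, &nbsp; stays as an entity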
@@ -330,7 +382,8 @@
         return text

     def validXhtml(self, text):
-        text = pywikibot.replaceExcept(text, r'(?i)<br[ /]*>', r'<br />', ['comment', 'math', 'nowiki', 'pre'])
+        text = pywikibot.replaceExcept(text, r'(?i)<br[ /]*>', r'<br />',
+                                       ['comment', 'math', 'nowiki', 'pre'])
         return text
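The same <br> normalization with plain re, for reference (replaceExcept additionally skips the listed tag regions):

    import re
    print(re.sub(r'(?i)<br[ /]*>', r'<br />', u'a<BR>b<br/>c<br >d'))
    # -> a<br />b<br />c<br />d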
     def removeUselessSpaces(self, text):

@@ -405,19 +458,35 @@
     #from fixes.py
     def fixSyntaxSave(self, text):
-        exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace']
+        exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
+                      'startspace']
+        # link to the wiki working on
+        ## TODO: disable this for difflinks and titled links
+        ## http://de.wikipedia.org/w/index.php?title=Wikipedia%3aVandalismusmeldung&...
+##        text = pywikibot.replaceExcept(text,
+##                                       r'\[https?://%s\.%s\.org/wiki/(?P<link>\S+)\s+(?P<title>.+?)\s?\]'
+##                                       % (self.site.lang, self.site.family.name),
+##                                       r'[[\g<link>|\g<title>]]', exceptions)
         # external link in double brackets
-        text = pywikibot.replaceExcept(text, r'\[\[(?P<url>https?://[^\]]+?)\]\]', r'[\g<url>]', exceptions)
+        text = pywikibot.replaceExcept(text,
+                                       r'\[\[(?P<url>https?://[^\]]+?)\]\]',
+                                       r'[\g<url>]', exceptions)
         # external link starting with double bracket
-        text = pywikibot.replaceExcept(text, r'\[\[(?P<url>https?://.+?)\]', r'[\g<url>]', exceptions)
+        text = pywikibot.replaceExcept(text,
+                                       r'\[\[(?P<url>https?://.+?)\]',
+                                       r'[\g<url>]', exceptions)
         # external link and description separated by a dash, with
         # whitespace in front of the dash, so that it is clear that
         # the dash is not a legitimate part of the URL.
-        text = pywikibot.replaceExcept(text, r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]', r'[\g<url> \g<label>]', exceptions)
+        text = pywikibot.replaceExcept(text,
+                                       r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]',
+                                       r'[\g<url> \g<label>]', exceptions)
         # dash in external link, where the correct end of the URL can
         # be detected from the file extension. It is very unlikely that
         # this will cause mistakes.
-        text = pywikibot.replaceExcept(text, r'\[(?P<url>https?://[^\|\] ]+?(\.pdf|\.html|\.htm|\.php|\.asp|\.aspx|\.jsp)) *\| *(?P<label>[^\|\]]+?)\]', r'[\g<url> \g<label>]', exceptions)
+        text = pywikibot.replaceExcept(text,
+                                       r'\[(?P<url>https?://[^\|\] ]+?(\.pdf|\.html|\.htm|\.php|\.asp|\.aspx|\.jsp)) *\| *(?P<label>[^\|\]]+?)\]',
+                                       r'[\g<url> \g<label>]', exceptions)
         return text
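The first two bracket fixes, demonstrated stand-alone on an invented sample:

    import re

    text = u'[[http://example.org]] and [[http://example.org broken]'
    # external link in double brackets
    text = re.sub(r'\[\[(?P<url>https?://[^\]]+?)\]\]', r'[\g<url>]', text)
    # external link starting with double bracket
    text = re.sub(r'\[\[(?P<url>https?://.+?)\]', r'[\g<url>]', text)
    print(text)  # [http://example.org] and [http://example.org broken]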
     def fixHtml(self, text):

@@ -439,11 +508,20 @@
             text = pywikibot.replaceExcept(text, r'(?i)([\r\n]) *<h%d> *([^<]+?) *</h%d> *([\r\n])'%(level, level), r'%s'%equals, exceptions)
-        #remove empty <ref/>-tag
-        text = pywikibot.replaceExcept(text, r'(?i)<ref\s*/>', r'', exceptions)
         # TODO: maybe we can make the bot replace <p> tags with \r\n's.
         return text
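What the heading loop does, sketched for level 2 with an assumed replacement string (the commit's equals value is built outside this excerpt):

    import re
    sample = u'\n<h2> History </h2>\n'
    print(re.sub(r'(?i)([\r\n]) *<h2> *([^<]+?) *</h2> *([\r\n])',
                 r'\1== \2 ==\3', sample))
    # -> '\n== History ==\n'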
+    def fixReferences(self, text):
+        #http://en.wikipedia.org/wiki/User:AnomieBOT/source/tasks/OrphanReferenceFixe...
+        exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace']
+
+        # it should be name = " or name=" NOT name ="
+        text = re.sub(r'(?i)<ref +name(= *| *=)"', r'<ref name="', text)
+        #remove empty <ref/>-tag
+        text = pywikibot.replaceExcept(text, r'(?i)(<ref\s*/>|<ref *>\s*</ref>)', r'', exceptions)
+        text = pywikibot.replaceExcept(text, r'(?i)<ref\s+([^>]+?)\s*>\s*</ref>', r'<ref \1/>', exceptions)
+        return text
+
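The three reference clean-ups, demonstrated with plain re on an invented sample (replaceExcept would additionally protect nowiki/pre/source regions):

    import re

    text = u'a<ref name ="x">y</ref> b<ref/> c<ref name="z"></ref>'
    # normalize spacing in <ref name=...>
    text = re.sub(r'(?i)<ref +name(= *| *=)"', r'<ref name="', text)
    # remove empty <ref/> and <ref></ref> tags
    text = re.sub(r'(?i)(<ref\s*/>|<ref *>\s*</ref>)', r'', text)
    # collapse an empty named ref into a self-closing tag
    text = re.sub(r'(?i)<ref\s+([^>]+?)\s*>\s*</ref>', r'<ref \1/>', text)
    print(text)  # a<ref name="x">y</ref> b c<ref name="z"/>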
     def fixStyle(self, text):
         exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace']
         # convert prettytable to wikitable class

@@ -466,60 +544,55 @@
         return text

     def fixArabicLetters(self, text):
-        if self.site.lang=='ckb' or self.site.lang=='fa':
-            exceptions = [
-                'gallery',
-                'hyperlink',
-                'interwiki',
-                # but changes letters inside wikilinks
-                #'link',
-                'math',
-                'pre',
-                'template',
-                'timeline',
-                'ref',
-                'source',
-                'startspace',
-                'inputbox',
-            ]
-            # do not change inside file links
-            namespaces = list(self.site.namespace(6, all = True))
-            pattern = re.compile(u'\[\[(' + '|'.join(namespaces) + '):.+?\..+?\]\]',
-                                 re.UNICODE)
-            exceptions.append(pattern)
-            text = pywikibot.replaceExcept(text, u',', u'،', exceptions)
-            if self.site.lang=='ckb':
-                text = pywikibot.replaceExcept(text,
-                                               ur'ه([.،_<\]\s])',
-                                               ur'ە\1', exceptions)
-                text = pywikibot.replaceExcept(text, u'ه', u'ە', exceptions)
-                text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions)
-            text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions)
-            text = pywikibot.replaceExcept(text, ur'[ىي]', u'ی', exceptions)
-            # replace persian digits
-            for i in range(0,10):
-                if self.site.lang=='ckb':
-                    text = pywikibot.replaceExcept(text,
-                                                   u'۰۱۲۳۴۵۶۷۸۹'[i],
-                                                   u'٠١٢٣٤٥٦٧٨٩'[i], exceptions)
-                else:
-                    text = pywikibot.replaceExcept(text,
-                                                   u'٠١٢٣٤٥٦٧٨٩'[i],
-                                                   u'۰۱۲۳۴۵۶۷۸۹'[i], exceptions)
-            # do not change digits in class, style and table params
-            pattern = re.compile(u'=".*?"', re.UNICODE)
-            exceptions.append(pattern)
-            # do not change digits inside html-tags
-            pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE)
-            exceptions.append(pattern)
-            exceptions.append('table') #exclude tables for now
-            for i in range(0,10):
-                if self.site.lang=='ckb':
-                    text = pywikibot.replaceExcept(text, str(i),
-                                                   u'٠١٢٣٤٥٦٧٨٩'[i], exceptions)
-                else:
-                    text = pywikibot.replaceExcept(text, str(i),
-                                                   u'۰۱۲۳۴۵۶۷۸۹'[i], exceptions)
+        exceptions = [
+            'gallery',
+            'hyperlink',
+            'interwiki',
+            # but changes letters inside wikilinks
+            #'link',
+            'math',
+            'pre',
+            'template',
+            'timeline',
+            'ref',
+            'source',
+            'startspace',
+            'inputbox',
+        ]
+        # valid digits
+        digits = {
+            'ckb' : u'٠١٢٣٤٥٦٧٨٩',
+            'fa'  : u'۰۱۲۳۴۵۶۷۸۹'
+        }
+        new = digits.pop(self.site.lang)
+        # This only works if there are only two items in digits dict
+        old = digits[digits.keys()[0]]
+        # do not change inside file links
+        namespaces = list(self.site.namespace(6, all = True))
+        pattern = re.compile(u'\[\[(' + '|'.join(namespaces) + '):.+?\..+?\]\]',
+                             re.UNICODE)
+        exceptions.append(pattern)
+        text = pywikibot.replaceExcept(text, u',', u'،', exceptions)
+        if self.site.lang=='ckb':
+            text = pywikibot.replaceExcept(text,
+                                           ur'ه([.،_<\]\s])',
+                                           ur'ە\1', exceptions)
+            text = pywikibot.replaceExcept(text, u'ه', u'ە', exceptions)
+            text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions)
+        text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions)
+        text = pywikibot.replaceExcept(text, ur'[ىي]', u'ی', exceptions)
+        # replace persian digits
+        for i in range(0,10):
+            text = pywikibot.replaceExcept(text, old[i], new[i], exceptions)
+        # do not change digits in class, style and table params
+        pattern = re.compile(u'\w+=(".+?"|\d+)', re.UNICODE)
+        exceptions.append(pattern)
+        # do not change digits inside html-tags
+        pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE)
+        exceptions.append(pattern)
+        exceptions.append('table') #exclude tables for now
+        for i in range(0,10):
+            text = pywikibot.replaceExcept(text, str(i), new[i], exceptions)
         return text
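The digit-table trick above, isolated: pop() removes the current language's own digits, and the single entry left in the dict supplies the digits to convert. A sketch for an assumed 'fa' site (list() around keys() keeps it runnable on Python 3 as well):

    digits = {
        'ckb': u'٠١٢٣٤٥٦٧٨٩',    # Arabic-Indic digits
        'fa':  u'۰۱۲۳۴۵۶۷۸۹',    # Extended Arabic-Indic (Persian) digits
    }
    lang = 'fa'                          # assumed site language
    new = digits.pop(lang)               # digits wanted on this wiki
    old = digits[list(digits.keys())[0]] # the only entry left
    sample = u'١٢٣'                      # Arabic-Indic "123"
    for i in range(10):
        sample = sample.replace(old[i], new[i])
    print(sample)                        # -> ۱۲۳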
# Retrieved from "http://commons.wikimedia.org/wiki/Commons:Tools/pywiki_file_description_clea..."