http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10282
Revision: 10282 Author: xqt Date: 2012-06-03 11:13:46 +0000 (Sun, 03 Jun 2012) Log Message: ----------- Fix some wiki syntax before cleaning up links. translateMagicWords implemented but not yet enabled; minor changes due to PEP 8.
Modified Paths: -------------- trunk/pywikipedia/cosmetic_changes.py
Modified: trunk/pywikipedia/cosmetic_changes.py =================================================================== --- trunk/pywikipedia/cosmetic_changes.py 2012-06-03 10:47:51 UTC (rev 10281) +++ trunk/pywikipedia/cosmetic_changes.py 2012-06-03 11:13:46 UTC (rev 10282) @@ -135,6 +135,7 @@ (u'Belege', u'Belege fehlen\g<parameters>'), (u'Quelle', u'Belege fehlen\g<parameters>'), (u'Quellen', u'Belege fehlen\g<parameters>'), + (u'Quellen fehlen', u'Belege fehlen\g<parameters>'), ], } } @@ -159,16 +160,18 @@ text = self.commonsfiledesc(text) text = self.fixSelfInterwiki(text) text = self.standardizePageFooter(text) + text = self.fixSyntaxSave(text) text = self.cleanUpLinks(text) text = self.cleanUpSectionHeaders(text) text = self.putSpacesInLists(text) text = self.translateAndCapitalizeNamespaces(text) +## text = self.translateMagicWords(text) text = self.replaceDeprecatedTemplates(text) text = self.resolveHtmlEntities(text) text = self.validXhtml(text) text = self.removeUselessSpaces(text) text = self.removeNonBreakingSpaceBeforePercent(text) - text = self.fixSyntaxSave(text) + text = self.fixHtml(text) text = self.fixReferences(text) text = self.fixStyle(text) @@ -245,7 +248,9 @@ # German Wikipedia. See # http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/1#Positio... # ignoring nn-wiki of cause of the comment line above iw section - if not self.template and not '{{Personendaten' in text: + if not self.template and not '{{Personendaten' in text and \ + not '{{SORTIERUNG' in text and not '{{DEFAULTSORT' in text and \ + not self.site.lang in ('et', 'it', 'bg', 'ru'): categories = pywikibot.getCategoryLinks(text, site = self.site)
if not self.talkpage:# and pywikibot.calledModuleName() <> 'interwiki': @@ -366,6 +371,23 @@ ':\g<nameAndLabel>]]', exceptions) return text
+ def translateMagicWords(self, text): + """ + Makes sure that localized namespace names are used. + """ + # not wanted at ru + # arz uses english stylish codes + if self.site.lang not in ['arz', 'ru']: + exceptions = ['nowiki', 'comment', 'math', 'pre'] + for magicWord in ['img_thumbnail', 'img_left', 'img_center', 'img_right', 'img_none', + 'img_framed', 'img_frameless', 'img_border', 'img_upright',]: + aliases = self.site.siteinfo('magicwords').get(magicWord) + if not aliases: continue + text = pywikibot.replaceExcept(text, r'[[(?P<left>.+?:.+?..+?|) *(' + '|'.join(aliases) +') *(?P<right>(|.*?)?]])', + r'[[\g<left>' + aliases[0] + '\g<right>', + exceptions) + return text + def cleanUpLinks(self, text): # helper function which works on one link and either returns it # unmodified, or returns a replacement. @@ -590,19 +612,35 @@
#from fixes.py def fixSyntaxSave(self, text): - exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace'] + exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', + 'startspace'] + # link to the wiki working on + ## TODO: disable this for difflinks and titled links + ## http://de.wikipedia.org/w/index.php?title=Wikipedia%3aVandalismusmeldung&... +## text = pywikibot.replaceExcept(text, +## r'[https?://%s.%s.org/wiki/(?P<link>\S+)\s+(?P<title>.+?)\s?]' +## % (self.site.lang, self.site.family.name), +## r'[[\g<link>|\g<title>]]', exceptions) # external link in double brackets - text = pywikibot.replaceExcept(text, r'[[(?P<url>https?://[^]]+?)]]', r'[\g<url>]', exceptions) + text = pywikibot.replaceExcept(text, + r'[[(?P<url>https?://[^]]+?)]]', + r'[\g<url>]', exceptions) # external link starting with double bracket - text = pywikibot.replaceExcept(text, r'[[(?P<url>https?://.+?)]', r'[\g<url>]', exceptions) + text = pywikibot.replaceExcept(text, + r'[[(?P<url>https?://.+?)]', + r'[\g<url>]', exceptions) # external link and description separated by a dash, with # whitespace in front of the dash, so that it is clear that # the dash is not a legitimate part of the URL. - text = pywikibot.replaceExcept(text, r'[(?P<url>https?://[^|] \r\n]+?) +| *(?P<label>[^|]]+?)]', r'[\g<url> \g<label>]', exceptions) + text = pywikibot.replaceExcept(text, + r'[(?P<url>https?://[^|] \r\n]+?) +| *(?P<label>[^|]]+?)]', + r'[\g<url> \g<label>]', exceptions) # dash in external link, where the correct end of the URL can # be detected from the file extension. It is very unlikely that # this will cause mistakes. - text = pywikibot.replaceExcept(text, r'[(?P<url>https?://[^|] ]+?(.pdf|.html|.htm|.php|.asp|.aspx|.jsp)) *| *(?P<label>[^|]]+?)]', r'[\g<url> \g<label>]', exceptions) + text = pywikibot.replaceExcept(text, + r'[(?P<url>https?://[^|] ]+?(.pdf|.html|.htm|.php|.asp|.aspx|.jsp)) *| *(?P<label>[^|]]+?)]', + r'[\g<url> \g<label>]', exceptions) return text
def fixHtml(self, text):