http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10292
Revision: 10292
Author: xqt
Date: 2012-06-04 13:41:38 +0000 (Mon, 04 Jun 2012)
Log Message:
-----------
interwiki_graph and titletranslate are libraries. Moved them to the pywikibot folder;
some parts of cosmetic_changes updated from trunk
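For callers the move only changes the import path; a minimal before/after sketch, mirroring the interwiki.py hunk further below:

    # before: top-level imports from the scripts folder
    import titletranslate, interwiki_graph

    # after: both modules live in the pywikibot package
    from pywikibot import interwiki_graph
    from pywikibot import titletranslate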
Modified Paths:
--------------
branches/rewrite/scripts/cosmetic_changes.py
branches/rewrite/scripts/interwiki.py
Added Paths:
-----------
branches/rewrite/pywikibot/interwiki_graph.py
branches/rewrite/pywikibot/titletranslate.py
Removed Paths:
-------------
branches/rewrite/scripts/interwiki_graph.py
branches/rewrite/scripts/titletranslate.py
Copied: branches/rewrite/pywikibot/interwiki_graph.py (from rev 10289, branches/rewrite/scripts/interwiki_graph.py)
===================================================================
--- branches/rewrite/pywikibot/interwiki_graph.py (rev 0)
+++ branches/rewrite/pywikibot/interwiki_graph.py 2012-06-04 13:41:38 UTC (rev 10292)
@@ -0,0 +1,150 @@
+""" Module with the graphviz drawing calls """
+#
+# (C) Pywikipedia bot team, 2006-2010
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+import threading
+pydotfound = True
+try:
+ import pydot
+except ImportError:
+ pydotfound = False
+import pywikibot
+from pywikibot import config2 as config
+
+class GraphImpossible(Exception):
+ "Drawing a graph is not possible on your system."
+
+class GraphSavingThread(threading.Thread):
+ """
+ Rendering a graph can take extremely long. We use
+ multithreading because of that.
+
+ TODO: Find out if several threads running in parallel
+ can slow down the system too much. Consider adding a
+ mechanism to kill a thread if it takes too long.
+ """
+
+ def __init__(self, graph, originPage):
+ threading.Thread.__init__(self)
+ self.graph = graph
+ self.originPage = originPage
+
+ def run(self):
+ for format in config.interwiki_graph_formats:
+ filename = 'interwiki-graphs/' + getFilename(self.originPage,
+ format)
+ if self.graph.write(filename, prog = 'dot', format = format):
+ pywikibot.output(u'Graph saved as %s' % filename)
+ else:
+ pywikibot.output(u'Graph could not be saved as %s' % filename)
+
+class GraphDrawer:
+ def __init__(self, subject):
+ if not pydotfound:
+ raise GraphImpossible, 'pydot is not installed.'
+ self.graph = None
+ self.subject = subject
+
+ def getLabel(self, page):
+ return (u'"\"%s:%s\""' % (page.site.language(),
+ page.title())).encode('utf-8')
+
+ def addNode(self, page):
+ node = pydot.Node(self.getLabel(page), shape = 'rectangle')
+ node.set_URL("\"http://%s%s\""
+ % (page.site.hostname(),
+ page.site.get_address(page.urlname())))
+ node.set_style('filled')
+ node.set_fillcolor('white')
+ node.set_fontsize('11')
+ if not page.exists():
+ node.set_fillcolor('red')
+ elif page.isRedirectPage():
+ node.set_fillcolor('blue')
+ elif page.isDisambig():
+ node.set_fillcolor('orange')
+ if page.namespace() != self.subject.originPage.namespace():
+ node.set_color('green')
+ node.set_style('filled,bold')
+ # if we found more than one valid page for this language:
+ if len(filter(lambda p: p.site == page.site and p.exists() \
+ and not p.isRedirectPage(),
+ self.subject.foundIn.keys())) > 1:
+ # mark conflict by octagonal node
+ node.set_shape('octagon')
+ self.graph.add_node(node)
+
+ def addDirectedEdge(self, page, refPage):
+ # if page was given as a hint, referrers would be [None]
+ if refPage is not None:
+ sourceLabel = self.getLabel(refPage)
+ targetLabel = self.getLabel(page)
+ edge = pydot.Edge(sourceLabel, targetLabel)
+
+ oppositeEdge = self.graph.get_edge(targetLabel, sourceLabel)
+ if oppositeEdge:
+ if isinstance(oppositeEdge, list):
+ # bugfix for pydot >= 1.0.3
+ oppositeEdge = oppositeEdge[0]
+ #oppositeEdge.set_arrowtail('normal')
+ oppositeEdge.set_dir('both')
+ # workaround for bug [ 1722739 ]: prevent duplicate edges
+ # (it is unclear why duplicate edges occur)
+ elif self.graph.get_edge(sourceLabel, targetLabel):
+ pywikibot.output(
+ u'BUG: Tried to create duplicate edge from %s to %s'
+ % (refPage.title(asLink=True), page.title(asLink=True)))
+ # duplicate edges would be bad because then get_edge() would
+ # give a list of edges, not a single edge when we handle the
+ # opposite edge.
+ else:
+ # add edge
+ if refPage.site == page.site:
+ edge.set_color('blue')
+ elif not page.exists():
+ # mark dead links
+ edge.set_color('red')
+ elif refPage.isDisambig() != page.isDisambig():
+ # mark links between disambiguation and non-disambiguation
+ # pages
+ edge.set_color('orange')
+ if refPage.namespace() != page.namespace():
+ edge.set_color('green')
+ self.graph.add_edge(edge)
+
+ def saveGraphFile(self):
+ thread = GraphSavingThread(self.graph, self.subject.originPage)
+ thread.start()
+
+ def createGraph(self):
+ """
+ See http://meta.wikimedia.org/wiki/Interwiki_graphs
+ """
+ pywikibot.output(u'Preparing graph for %s'
+ % self.subject.originPage.title())
+ # create empty graph
+ self.graph = pydot.Dot()
+ # self.graph.set('concentrate', 'true')
+ for page in self.subject.foundIn.iterkeys():
+ # a node for each found page
+ self.addNode(page)
+ # mark start node by pointing there from a black dot.
+ firstLabel = self.getLabel(self.subject.originPage)
+ self.graph.add_node(pydot.Node('start', shape = 'point'))
+ self.graph.add_edge(pydot.Edge('start', firstLabel))
+ for page, referrers in self.subject.foundIn.iteritems():
+ for refPage in referrers:
+ self.addDirectedEdge(page, refPage)
+ self.saveGraphFile()
+
+def getFilename(page, extension = None):
+ filename = '%s-%s-%s' % (page.site.family.name,
+ page.site.language(),
+ page.titleForFilename())
+ if extension:
+ filename += '.%s' % extension
+ return filename
+
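A minimal, hypothetical driver for the module copied above. The subject argument normally comes from interwiki.py; here a stand-in provides only the two attributes GraphDrawer actually reads (originPage, plus the foundIn dict mapping each found page to its referrers, where [None] marks a hinted page). The stand-in class and page title are invented for illustration:

    import pywikibot
    from pywikibot import interwiki_graph

    class StubSubject(object):
        """Stand-in for interwiki.py's Subject (assumption, not the real API)."""
        def __init__(self, originPage, foundIn):
            self.originPage = originPage
            self.foundIn = foundIn

    site = pywikibot.getSite()              # assumes a configured default site
    origin = pywikibot.Page(site, u'Example')
    subject = StubSubject(origin, {origin: [None]})

    try:
        interwiki_graph.GraphDrawer(subject).createGraph()
        # saving happens asynchronously in a GraphSavingThread, one file per
        # format listed in config.interwiki_graph_formats
    except interwiki_graph.GraphImpossible:
        pywikibot.output(u'pydot is not installed; no graph drawn')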
Copied: branches/rewrite/pywikibot/titletranslate.py (from rev 10289, branches/rewrite/scripts/titletranslate.py)
===================================================================
--- branches/rewrite/pywikibot/titletranslate.py (rev 0)
+++ branches/rewrite/pywikibot/titletranslate.py 2012-06-04 13:41:38 UTC (rev 10292)
@@ -0,0 +1,135 @@
+# -*- coding: utf-8 -*-
+#
+# (C) Rob W.W. Hooft, 2003
+# (C) Yuri Astrakhan, 2005
+# (C) Pywikipedia bot team, 2003-2010
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+#
+import re
+
+import pywikibot
+import pywikibot.date as date
+
+def translate(page, hints = None, auto = True, removebrackets = False):
+ """
+ Please comment your source code! --Daniel
+
+ Does some magic stuff. Returns a list of Links.
+ """
+ result = []
+ site = page.site
+ if hints:
+ for h in hints:
+ if ':' not in h:
+ # argument given as -hint:xy where xy is a language code
+ codes = h
+ newname = ''
+ else:
+ codes, newname = h.split(':', 1)
+ if newname == '':
+ # if given as -hint:xy or -hint:xy:, assume that there should
+ # be a page in language xy with the same title as the page
+ # we're currently working on ...
+ ns = page.namespace()
+ if ns:
+ newname = u'%s:%s' % (site.family.namespace('_default', ns),
+ page.title(withNamespace=False))
+ else:
+ # article in the main namespace
+ newname = page.title()
+ # ... unless we do want brackets
+ if removebrackets:
+ newname = re.sub(re.compile(ur"\W*?\(.*?\)\W*?", re.UNICODE), u" ", newname)
+ try:
+ number = int(codes)
+ codes = site.family.languages_by_size[:number]
+ except ValueError:
+ if codes == 'all':
+ codes = site.family.languages_by_size
+ elif codes in site.family.language_groups:
+ codes = site.family.language_groups[codes]
+ else:
+ codes = codes.split(',')
+ for newcode in codes:
+ if newcode in site.languages():
+ if newcode != site.code:
+ x = pywikibot.Link(site.getSite(code=newcode), newname)
+ if x not in result:
+ result.append(x)
+ else:
+ if pywikibot.verbose:
+ pywikibot.output(u"Ignoring unknown language code %s"
+ % newcode)
+
+ # Autotranslate dates into all other languages, the rest will come from
+ # existing interwiki links.
+ if auto:
+ # search inside all dictionaries for this link
+ dictName, value = date.getAutoFormat(page.site.code,
+ page.title())
+ if dictName:
+ if not (dictName == 'yearsBC' and
+ page.site.code in date.maxyearBC and
+ value > date.maxyearBC[page.site.code]) or \
+ (dictName == 'yearsAD' and
+ page.site.code in date.maxyearAD and
+ value > date.maxyearAD[page.site.code]):
+ pywikibot.output(
+ u'TitleTranslate: %s was recognized as %s with value %d'
+ % (page.title(), dictName, value))
+ for entryLang, entry in date.formats[dictName].iteritems():
+ if entryLang != page.site.code:
+ if dictName == 'yearsBC' and \
+ entryLang in date.maxyearBC and \
+ value > date.maxyearBC[entryLang]:
+ pass
+ elif dictName == 'yearsAD' and \
+ entryLang in date.maxyearAD and \
+ value > date.maxyearAD[entryLang]:
+ pass
+ else:
+ newname = entry(value)
+ x = pywikibot.Link(
+ newname,
+ pywikibot.getSite(code=entryLang,
+ fam=site.family))
+ if x not in result:
+ result.append(x) # add new page
+ return result
+
+bcDateErrors = [u'[[ko:%d년]]']
+
+def appendFormatedDates( result, dictName, value ):
+ for code, func in date.formats[dictName].iteritems():
+ result.append( u'[[%s:%s]]' % (code,func(value)) )
+
+def getPoisonedLinks(pl):
+ """Returns a list of known corrupted links that should be removed if seen
+
+ """
+ result = []
+ pywikibot.output(u'getting poisoned links for %s' % pl.title())
+ dictName, value = date.getAutoFormat(pl.site.code, pl.title())
+ if dictName is not None:
+ pywikibot.output( u'date found in %s' % dictName )
+ # errors in year BC
+ if dictName in date.bcFormats:
+ for fmt in bcDateErrors:
+ result.append( fmt % value )
+ # i guess this is like friday the 13th for the years
+ if value == 398 and dictName == 'yearsBC':
+ appendFormatedDates(result, dictName, 399)
+ if dictName == 'yearsBC':
+ appendFormatedDates(result, 'decadesBC', value)
+ appendFormatedDates(result, 'yearsAD', value)
+ if dictName == 'yearsAD':
+ appendFormatedDates(result, 'decadesAD', value)
+ appendFormatedDates(result, 'yearsBC', value)
+ if dictName == 'centuriesBC':
+ appendFormatedDates(result, 'decadesBC', value * 100 + 1)
+ if dictName == 'centuriesAD':
+ appendFormatedDates(result, 'decadesAD', value * 100 + 1)
+ return result
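A hypothetical call into the copied translate() function, exercising the three hint forms the code above parses: a plain number (the N largest wikis by size), a bare language code (same title there), and code:title for an explicit target. The page title and hints are invented:

    import pywikibot
    from pywikibot import titletranslate

    site = pywikibot.getSite()              # assumes a configured default site
    page = pywikibot.Page(site, u'December 30')
    links = titletranslate.translate(
        page,
        hints=[u'10',                       # top 10 of languages_by_size
               u'de',                       # same title on de
               u'fr:30 décembre'],          # explicit target title
        auto=True)                          # also try the date autotranslation
    for link in links:                      # a list of pywikibot.Link objects
        pywikibot.output(repr(link))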
Modified: branches/rewrite/scripts/cosmetic_changes.py
===================================================================
--- branches/rewrite/scripts/cosmetic_changes.py 2012-06-03 22:01:00 UTC (rev 10291)
+++ branches/rewrite/scripts/cosmetic_changes.py 2012-06-04 13:41:38 UTC (rev 10292)
@@ -39,12 +39,12 @@
#
__version__ = '$Id$'
#
+import sys, re
import pywikibot
import isbn
from pywikibot import pagegenerators
from pywikibot import i18n
-import sys
-import re
+from pywikibot import config2 as config
warning = """
ATTENTION: You can run this script as a stand-alone for testing purposes.
@@ -57,7 +57,13 @@
'&warning;': warning,
}
-nn_iw_msg = u'<!--interwiki (no, sv, da first; then other languages alphabetically by name)-->'
+# Interwiki message on top of iw links
+# 2nd line is a regex if needed
+msg_interwiki = {
+ 'fr' : u'<!-- Autres langues -->',
+ 'nn' : (u'<!--interwiki (no, sv, da first; then other languages alphabetically by name)-->',
+ u'(<!-- ?interwiki \(no(?:/nb)?, ?sv, ?da first; then other languages alphabetically by name\) ?-->)')
+}
# This is from interwiki.py;
# move it to family file and implement global instances
@@ -157,6 +163,8 @@
try:
text = isbn.hyphenateIsbnNumbers(text)
except isbn.InvalidIsbnException, error:
+ if config.verbose_output:
+ pywikibot.output(u"ISBN error: %s" % error)
pass
if self.debug:
pywikibot.showDiff(oldText, text)
@@ -168,7 +176,8 @@
Remove their language code prefix.
"""
if not self.talkpage and pywikibot.calledModuleName() <> 'interwiki':
- interwikiR = re.compile(r'\[\[%s\s?:([^\[\]\n]*)\]\]' % self.site.lang)
+ interwikiR = re.compile(r'\[\[%s\s?:([^\[\]\n]*)\]\]'
+ % self.site.lang)
text = interwikiR.sub(r'[[\1]]', text)
return text
Modified: branches/rewrite/scripts/interwiki.py
===================================================================
--- branches/rewrite/scripts/interwiki.py 2012-06-03 22:01:00 UTC (rev 10291)
+++ branches/rewrite/scripts/interwiki.py 2012-06-04 13:41:38 UTC (rev 10292)
@@ -346,13 +346,14 @@
import codecs
import pickle
import socket
+import webbrowser
import pywikibot
from pywikibot import config
from pywikibot import catlib
from pywikibot import pagegenerators
from pywikibot import i18n
-import titletranslate, interwiki_graph
-import webbrowser
+from pywikibot import interwiki_graph
+from pywikibot import titletranslate
docuReplacements = {
'&pagegenerators_help;': pagegenerators.parameterHelp
Deleted: branches/rewrite/scripts/interwiki_graph.py
===================================================================
--- branches/rewrite/scripts/interwiki_graph.py 2012-06-03 22:01:00 UTC (rev 10291)
+++ branches/rewrite/scripts/interwiki_graph.py 2012-06-04 13:41:38 UTC (rev 10292)
@@ -1,150 +0,0 @@
-""" Module with the graphviz drawing calls """
-#
-# (C) Pywikipedia bot team, 2006-2010
-#
-# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id$'
-import threading
-pydotfound = True
-try:
- import pydot
-except ImportError:
- pydotfound = False
-import pywikibot
-from pywikibot import config2 as config
-
-class GraphImpossible(Exception):
- "Drawing a graph is not possible on your system."
-
-class GraphSavingThread(threading.Thread):
- """
- Rendering a graph can take extremely long. We use
- multithreading because of that.
-
- TODO: Find out if several threads running in parallel
- can slow down the system too much. Consider adding a
- mechanism to kill a thread if it takes too long.
- """
-
- def __init__(self, graph, originPage):
- threading.Thread.__init__(self)
- self.graph = graph
- self.originPage = originPage
-
- def run(self):
- for format in config.interwiki_graph_formats:
- filename = 'interwiki-graphs/' + getFilename(self.originPage,
- format)
- if self.graph.write(filename, prog = 'dot', format = format):
- pywikibot.output(u'Graph saved as %s' % filename)
- else:
- pywikibot.output(u'Graph could not be saved as %s' % filename)
-
-class GraphDrawer:
- def __init__(self, subject):
- if not pydotfound:
- raise GraphImpossible, 'pydot is not installed.'
- self.graph = None
- self.subject = subject
-
- def getLabel(self, page):
- return (u'"\"%s:%s\""' % (page.site.language(),
- page.title())).encode('utf-8')
-
- def addNode(self, page):
- node = pydot.Node(self.getLabel(page), shape = 'rectangle')
- node.set_URL("\"http://%s%s\""
- % (page.site.hostname(),
- page.site.get_address(page.urlname())))
- node.set_style('filled')
- node.set_fillcolor('white')
- node.set_fontsize('11')
- if not page.exists():
- node.set_fillcolor('red')
- elif page.isRedirectPage():
- node.set_fillcolor('blue')
- elif page.isDisambig():
- node.set_fillcolor('orange')
- if page.namespace() != self.subject.originPage.namespace():
- node.set_color('green')
- node.set_style('filled,bold')
- # if we found more than one valid page for this language:
- if len(filter(lambda p: p.site == page.site and p.exists() \
- and not p.isRedirectPage(),
- self.subject.foundIn.keys())) > 1:
- # mark conflict by octagonal node
- node.set_shape('octagon')
- self.graph.add_node(node)
-
- def addDirectedEdge(self, page, refPage):
- # if page was given as a hint, referrers would be [None]
- if refPage is not None:
- sourceLabel = self.getLabel(refPage)
- targetLabel = self.getLabel(page)
- edge = pydot.Edge(sourceLabel, targetLabel)
-
- oppositeEdge = self.graph.get_edge(targetLabel, sourceLabel)
- if oppositeEdge:
- if isinstance(oppositeEdge, list):
- # bugfix for pydot >= 1.0.3
- oppositeEdge = oppositeEdge[0]
- #oppositeEdge.set_arrowtail('normal')
- oppositeEdge.set_dir('both')
- # workaround for bug [ 1722739 ]: prevent duplicate edges
- # (it is unclear why duplicate edges occur)
- elif self.graph.get_edge(sourceLabel, targetLabel):
- pywikibot.output(
- u'BUG: Tried to create duplicate edge from %s to %s'
- % (refPage.title(asLink=True), page.title(asLink=True)))
- # duplicate edges would be bad because then get_edge() would
- # give a list of edges, not a single edge when we handle the
- # opposite edge.
- else:
- # add edge
- if refPage.site == page.site:
- edge.set_color('blue')
- elif not page.exists():
- # mark dead links
- edge.set_color('red')
- elif refPage.isDisambig() != page.isDisambig():
- # mark links between disambiguation and non-disambiguation
- # pages
- edge.set_color('orange')
- if refPage.namespace() != page.namespace():
- edge.set_color('green')
- self.graph.add_edge(edge)
-
- def saveGraphFile(self):
- thread = GraphSavingThread(self.graph, self.subject.originPage)
- thread.start()
-
- def createGraph(self):
- """
- See http://meta.wikimedia.org/wiki/Interwiki_graphs
- """
- pywikibot.output(u'Preparing graph for %s'
- % self.subject.originPage.title())
- # create empty graph
- self.graph = pydot.Dot()
- # self.graph.set('concentrate', 'true')
- for page in self.subject.foundIn.iterkeys():
- # a node for each found page
- self.addNode(page)
- # mark start node by pointing there from a black dot.
- firstLabel = self.getLabel(self.subject.originPage)
- self.graph.add_node(pydot.Node('start', shape = 'point'))
- self.graph.add_edge(pydot.Edge('start', firstLabel))
- for page, referrers in self.subject.foundIn.iteritems():
- for refPage in referrers:
- self.addDirectedEdge(page, refPage)
- self.saveGraphFile()
-
-def getFilename(page, extension = None):
- filename = '%s-%s-%s' % (page.site.family.name,
- page.site.language(),
- page.titleForFilename())
- if extension:
- filename += '.%s' % extension
- return filename
-
Deleted: branches/rewrite/scripts/titletranslate.py
===================================================================
--- branches/rewrite/scripts/titletranslate.py 2012-06-03 22:01:00 UTC (rev 10291)
+++ branches/rewrite/scripts/titletranslate.py 2012-06-04 13:41:38 UTC (rev 10292)
@@ -1,135 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# (C) Rob W.W. Hooft, 2003
-# (C) Yuri Astrakhan, 2005
-# (C) Pywikipedia bot team, 2003-2010
-#
-# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id$'
-#
-import re
-
-import pywikibot
-import pywikibot.date as date
-
-def translate(page, hints = None, auto = True, removebrackets = False):
- """
- Please comment your source code! --Daniel
-
- Does some magic stuff. Returns a list of Links.
- """
- result = []
- site = page.site
- if hints:
- for h in hints:
- if ':' not in h:
- # argument given as -hint:xy where xy is a language code
- codes = h
- newname = ''
- else:
- codes, newname = h.split(':', 1)
- if newname == '':
- # if given as -hint:xy or -hint:xy:, assume that there should
- # be a page in language xy with the same title as the page
- # we're currently working on ...
- ns = page.namespace()
- if ns:
- newname = u'%s:%s' % (site.family.namespace('_default', ns),
- page.title(withNamespace=False))
- else:
- # article in the main namespace
- newname = page.title()
- # ... unless we do want brackets
- if removebrackets:
- newname = re.sub(re.compile(ur"\W*?\(.*?\)\W*?", re.UNICODE), u" ", newname)
- try:
- number = int(codes)
- codes = site.family.languages_by_size[:number]
- except ValueError:
- if codes == 'all':
- codes = site.family.languages_by_size
- elif codes in site.family.language_groups:
- codes = site.family.language_groups[codes]
- else:
- codes = codes.split(',')
- for newcode in codes:
- if newcode in site.languages():
- if newcode != site.code:
- x = pywikibot.Link(site.getSite(code=newcode), newname)
- if x not in result:
- result.append(x)
- else:
- if pywikibot.verbose:
- pywikibot.output(u"Ignoring unknown language code %s"
- % newcode)
-
- # Autotranslate dates into all other languages, the rest will come from
- # existing interwiki links.
- if auto:
- # search inside all dictionaries for this link
- dictName, value = date.getAutoFormat(page.site.code,
- page.title())
- if dictName:
- if not (dictName == 'yearsBC' and
- page.site.code in date.maxyearBC and
- value > date.maxyearBC[page.site.code]) or \
- (dictName == 'yearsAD' and
- page.site.code in date.maxyearAD and
- value > date.maxyearAD[page.site.code]):
- pywikibot.output(
- u'TitleTranslate: %s was recognized as %s with value %d'
- % (page.title(), dictName, value))
- for entryLang, entry in date.formats[dictName].iteritems():
- if entryLang != page.site.code:
- if dictName == 'yearsBC' and \
- entryLang in date.maxyearBC and \
- value > date.maxyearBC[entryLang]:
- pass
- elif dictName == 'yearsAD' and \
- entryLang in date.maxyearAD and \
- value > date.maxyearAD[entryLang]:
- pass
- else:
- newname = entry(value)
- x = pywikibot.Link(
- newname,
- pywikibot.getSite(code=entryLang,
- fam=site.family))
- if x not in result:
- result.append(x) # add new page
- return result
-
-bcDateErrors = [u'[[ko:%d년]]']
-
-def appendFormatedDates( result, dictName, value ):
- for code, func in date.formats[dictName].iteritems():
- result.append( u'[[%s:%s]]' % (code,func(value)) )
-
-def getPoisonedLinks(pl):
- """Returns a list of known corrupted links that should be removed if seen
-
- """
- result = []
- pywikibot.output(u'getting poisoned links for %s' % pl.title())
- dictName, value = date.getAutoFormat(pl.site.code, pl.title())
- if dictName is not None:
- pywikibot.output( u'date found in %s' % dictName )
- # errors in year BC
- if dictName in date.bcFormats:
- for fmt in bcDateErrors:
- result.append( fmt % value )
- # i guess this is like friday the 13th for the years
- if value == 398 and dictName == 'yearsBC':
- appendFormatedDates(result, dictName, 399)
- if dictName == 'yearsBC':
- appendFormatedDates(result, 'decadesBC', value)
- appendFormatedDates(result, 'yearsAD', value)
- if dictName == 'yearsAD':
- appendFormatedDates(result, 'decadesAD', value)
- appendFormatedDates(result, 'yearsBC', value)
- if dictName == 'centuriesBC':
- appendFormatedDates(result, 'decadesBC', value * 100 + 1)
- if dictName == 'centuriesAD':
- appendFormatedDates(result, 'decadesAD', value * 100 + 1)
- return result
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10291
Revision: 10291
Author: xqt
Date: 2012-06-03 22:01:00 +0000 (Sun, 03 Jun 2012)
Log Message:
-----------
disable some replacements due to changed behaviour in the rewrite branch
Modified Paths:
--------------
branches/rewrite/scripts/cosmetic_changes.py
Modified: branches/rewrite/scripts/cosmetic_changes.py
===================================================================
--- branches/rewrite/scripts/cosmetic_changes.py 2012-06-03 21:47:06 UTC (rev 10290)
+++ branches/rewrite/scripts/cosmetic_changes.py 2012-06-03 22:01:00 UTC (rev 10291)
@@ -135,15 +135,15 @@
if self.site.sitename()== u'commons:commons' and self.namespace == 6:
text = self.commonsfiledesc(text)
text = self.fixSelfInterwiki(text)
- text = self.standardizePageFooter(text)
+## text = self.standardizePageFooter(text) # removes pipe links on categories
text = self.fixSyntaxSave(text)
- text = self.cleanUpLinks(text)
+## text = self.cleanUpLinks(text) #module object has no attribute url2unicode
text = self.cleanUpSectionHeaders(text)
text = self.putSpacesInLists(text)
## text = self.translateAndCapitalizeNamespaces(text) ##excluded since family.namespaces does not exist anymore
## text = self.translateMagicWords(text)
text = self.replaceDeprecatedTemplates(text)
- text = self.resolveHtmlEntities(text)
+## text = self.resolveHtmlEntities(text)
text = self.validXhtml(text)
text = self.removeUselessSpaces(text)
text = self.removeNonBreakingSpaceBeforePercent(text)
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10288
Revision: 10288
Author: xqt
Date: 2012-06-03 14:35:52 +0000 (Sun, 03 Jun 2012)
Log Message:
-----------
update from trunk; some functions are disabled because of the framework changes
Modified Paths:
--------------
branches/rewrite/scripts/cosmetic_changes.py
Modified: branches/rewrite/scripts/cosmetic_changes.py
===================================================================
--- branches/rewrite/scripts/cosmetic_changes.py 2012-06-03 14:05:32 UTC (rev 10287)
+++ branches/rewrite/scripts/cosmetic_changes.py 2012-06-03 14:35:52 UTC (rev 10288)
@@ -33,7 +33,7 @@
"""
#
# (C) xqt, 2009-2011
-# (C) Pywikipedia bot team, 2006-2010
+# (C) Pywikipedia bot team, 2006-2012
#
# Distributed under the terms of the MIT license.
#
@@ -46,10 +46,11 @@
import sys
import re
-warning = """ATTENTION: You can run this script as a stand-alone for testing purposes.
-However, the changes are that are made are only minor, and other users
+warning = """
+ATTENTION: You can run this script as a stand-alone for testing purposes.
+However, the changes that are made are only minor, and other users
might get angry if you fill the version histories and watchlists with such
-irrelevant changes."""
+irrelevant changes. Some wikis prohibit stand-alone running."""
docuReplacements = {
'¶ms;': pagegenerators.parameterHelp,
@@ -110,12 +111,14 @@
(u'Belege', u'Belege fehlen\g<parameters>'),
(u'Quelle', u'Belege fehlen\g<parameters>'),
(u'Quellen', u'Belege fehlen\g<parameters>'),
+ (u'Quellen fehlen', u'Belege fehlen\g<parameters>'),
],
}
}
class CosmeticChangesToolkit:
- def __init__(self, site, debug=False, redirect=False, namespace=None, pageTitle=None):
+ def __init__(self, site, debug=False, redirect=False, namespace=None,
+ pageTitle=None):
self.site = site
self.debug = debug
self.redirect = redirect
@@ -133,20 +136,24 @@
text = self.commonsfiledesc(text)
text = self.fixSelfInterwiki(text)
text = self.standardizePageFooter(text)
+ text = self.fixSyntaxSave(text)
text = self.cleanUpLinks(text)
text = self.cleanUpSectionHeaders(text)
text = self.putSpacesInLists(text)
- text = self.translateAndCapitalizeNamespaces(text)
+## text = self.translateAndCapitalizeNamespaces(text) ##excluded since family.namespaces does not exist anymore
+## text = self.translateMagicWords(text)
text = self.replaceDeprecatedTemplates(text)
text = self.resolveHtmlEntities(text)
text = self.validXhtml(text)
text = self.removeUselessSpaces(text)
text = self.removeNonBreakingSpaceBeforePercent(text)
- text = self.fixSyntaxSave(text)
+
text = self.fixHtml(text)
+ text = self.fixReferences(text)
text = self.fixStyle(text)
text = self.fixTypo(text)
- text = self.fixArabicLetters(text)
+ if self.site.lang in ['ckb', 'fa']:
+ text = self.fixArabicLetters(text)
try:
text = isbn.hyphenateIsbnNumbers(text)
except isbn.InvalidIsbnException, error:
@@ -191,19 +198,46 @@
if not family.isDefinedNSLanguage(nsNumber, self.site.lang):
# Skip undefined namespaces
continue
- namespaces = list(family.namespace(self.site.lang, nsNumber, all=True))
+ if nsNumber in (2, 3):
+ # Skip user namespace, maybe gender is used
+ continue
+ namespaces = list(self.site.namespace(nsNumber, all=True))
thisNs = namespaces.pop(0)
- if nsNumber == 6 and family.name == 'wikipedia' and \
- self.site.lang in ('en', 'fr'):
- # do not change "Image" on en-wiki and fr-wiki
- for image in [u'Image', u'image']:
- if image in namespaces:
- namespaces.remove(image)
+ if nsNumber == 6 and family.name == 'wikipedia':
+ if self.site.lang in ('en', 'fr'):
+ # do not change "Image" on en-wiki and fr-wiki
+ for image in [u'Image', u'image']:
+ if image in namespaces:
+ namespaces.remove(image)
+ elif self.site.lang == 'pt':
+ # bug #3346901 should be implemented
+ continue
# skip main (article) namespace
if thisNs and namespaces:
- text = pywikibot.replaceExcept(text, r'\[\[\s*(' + '|'.join(namespaces) + ') *:(?P<nameAndLabel>.*?)\]\]', r'[[' + thisNs + ':\g<nameAndLabel>]]', exceptions)
+ text = pywikibot.replaceExcept(
+ text,
+ r'\[\[\s*(' + '|'.join(namespaces) + \
+ ') *:(?P<nameAndLabel>.*?)\]\]', r'[[' + thisNs + \
+ ':\g<nameAndLabel>]]', exceptions)
return text
+ def translateMagicWords(self, text):
+ """
+ Makes sure that localized namespace names are used.
+ """
+ # not wanted at ru
+ # arz uses english stylish codes
+ if self.site.lang not in ['arz', 'ru']:
+ exceptions = ['nowiki', 'comment', 'math', 'pre']
+ for magicWord in ['img_thumbnail', 'img_left', 'img_center', 'img_right', 'img_none',
+ 'img_framed', 'img_frameless', 'img_border', 'img_upright',]:
+ aliases = self.site.siteinfo('magicwords').get(magicWord)
+ if not aliases: continue
+ text = pywikibot.replaceExcept(text, r'\[\[(?P<left>.+?:.+?\..+?\|) *(' + '|'.join(aliases) +') *(?P<right>(\|.*?)?\]\])',
+ r'[[\g<left>' + aliases[0] + '\g<right>',
+ exceptions)
+ return text
+
def cleanUpLinks(self, text):
# helper function which works on one link and either returns it
# unmodified, or returns a replacement.
@@ -238,17 +272,20 @@
if not trailingChars:
titleLength = len(titleWithSection)
titleWithSection = titleWithSection.rstrip()
- hadTrailingSpaces = (len(titleWithSection) != titleLength)
+ hadTrailingSpaces = (len(titleWithSection) !=
+ titleLength)
# Convert URL-encoded characters to unicode
- titleWithSection = pywikibot.url2unicode(titleWithSection, site = self.site)
+ titleWithSection = pywikibot.url2unicode(titleWithSection,
+ site=self.site)
if titleWithSection == '':
# just skip empty links.
return match.group()
# Remove unnecessary initial and final spaces from label.
- # Please note that some editors prefer spaces around pipes. (See [[en:Wikipedia:Semi-bots]]). We remove them anyway.
+ # Please note that some editors prefer spaces around pipes.
+ # (See [[en:Wikipedia:Semi-bots]]). We remove them anyway.
if label is not None:
# Remove unnecessary leading spaces from label,
# but remember if we did this because we want
@@ -268,18 +305,26 @@
if trailingChars:
label += trailingChars
- if titleWithSection == label or titleWithSection[0].lower() + titleWithSection[1:] == label:
+ if titleWithSection == label or \
+ titleWithSection[0].lower() + \
+ titleWithSection[1:] == label:
newLink = "[[%s]]" % label
- # Check if we can create a link with trailing characters instead of a pipelink
- elif len(titleWithSection) <= len(label) and label[:len(titleWithSection)] == titleWithSection and re.sub(trailR, '', label[len(titleWithSection):]) == '':
- newLink = "[[%s]]%s" % (label[:len(titleWithSection)], label[len(titleWithSection):])
+ # Check if we can create a link with trailing characters
+ # instead of a pipelink
+ elif len(titleWithSection) <= len(label) and \
+ label[:len(titleWithSection)] == titleWithSection and \
+ re.sub(trailR, '',
+ label[len(titleWithSection):]) == '':
+ newLink = "[[%s]]%s" % (label[:len(titleWithSection)],
+ label[len(titleWithSection):])
else:
# Try to capitalize the first letter of the title.
# Maybe this feature is not useful for languages that
# don't capitalize nouns...
#if not self.site.nocapitalize:
if self.site.sitename() == 'wikipedia:de':
- titleWithSection = titleWithSection[0].upper() + titleWithSection[1:]
+ titleWithSection = titleWithSection[0].upper() + \
+ titleWithSection[1:]
newLink = "[[%s|%s]]" % (titleWithSection, label)
# re-add spaces that were pulled out of the link.
# Examples:
@@ -298,15 +343,20 @@
return match.group()
trailR = re.compile(self.site.linktrail())
- # The regular expression which finds links. Results consist of four groups:
- # group title is the target page title, that is, everything before | or ].
- # group section is the page section. It'll include the # to make life easier for us.
- # group label is the alternative link title, that's everything between | and ].
- # group linktrail is the link trail, that's letters after ]] which are part of the word.
- # note that the definition of 'letter' varies from language to language.
- linkR = re.compile(r'(?P<newline>[\n]*)\[\[(?P<titleWithSection>[^\]\|]+)(\|(?P<label>[^\]\|]*))?\]\](?P<linktrail>' + self.site.linktrail() + ')')
+ # The regular expression which finds links. Results consist of four groups:
+ # group <newline> depends whether the links starts with a new line.
+ # group <titleWithSection> is the page title and section, that is,
+ # everything before | or ]. It'll include the # to make life easier for us.
+ # group <label> is the alternative link title between | and ].
+ # group <linktrail> is the link trail after ]] which are part of the word.
+ # note that the definition of 'letter' varies from language to language.
+ linkR = re.compile(
+ r'(?P<newline>[\n]*)\[\[(?P<titleWithSection>[^\]\|]+)(\|(?P<label>[^\]\|]*))?\]\](?P<linktrail>' + \
+ self.site.linktrail() + ')')
- text = pywikibot.replaceExcept(text, linkR, handleOneLink, ['comment', 'math', 'nowiki', 'pre', 'startspace'])
+ text = pywikibot.replaceExcept(text, linkR, handleOneLink,
+ ['comment', 'math', 'nowiki', 'pre',
+ 'startspace'])
return text
def resolveHtmlEntities(self, text):
@@ -320,6 +370,8 @@
124, # Vertical bar (??) - used intentionally in navigation bar templates on de:
160, # Non-breaking space (&nbsp;) - not supported by Firefox textareas
173, # Soft-hypen (&shy;) - enable editing
+ 8206, # left-to-right mark (&lrm;)
+ 8207, # right-to-left mark (&rlm;)
]
# ignore &#39; see http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=2320801
#if self.site.lang == 'eo':
@@ -330,7 +382,8 @@
return text
def validXhtml(self, text):
- text = pywikibot.replaceExcept(text, r'(?i)<br[ /]*>', r'<br />', ['comment', 'math', 'nowiki', 'pre'])
+ text = pywikibot.replaceExcept(text, r'(?i)<br[ /]*>', r'<br />',
+ ['comment', 'math', 'nowiki', 'pre'])
return text
def removeUselessSpaces(self, text):
@@ -405,19 +458,35 @@
#from fixes.py
def fixSyntaxSave(self, text):
- exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace']
+ exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
+ 'startspace']
+ # link to the wiki working on
+ ## TODO: disable this for difflinks and titled links
+ ## http://de.wikipedia.org/w/index.php?title=Wikipedia%3aVandalismusmeldung&di…
+## text = pywikibot.replaceExcept(text,
+## r'\[https?://%s\.%s\.org/wiki/(?P<link>\S+)\s+(?P<title>.+?)\s?\]'
+## % (self.site.lang, self.site.family.name),
+## r'[[\g<link>|\g<title>]]', exceptions)
# external link in double brackets
- text = pywikibot.replaceExcept(text, r'\[\[(?P<url>https?://[^\]]+?)\]\]', r'[\g<url>]', exceptions)
+ text = pywikibot.replaceExcept(text,
+ r'\[\[(?P<url>https?://[^\]]+?)\]\]',
+ r'[\g<url>]', exceptions)
# external link starting with double bracket
- text = pywikibot.replaceExcept(text, r'\[\[(?P<url>https?://.+?)\]', r'[\g<url>]', exceptions)
+ text = pywikibot.replaceExcept(text,
+ r'\[\[(?P<url>https?://.+?)\]',
+ r'[\g<url>]', exceptions)
# external link and description separated by a dash, with
# whitespace in front of the dash, so that it is clear that
# the dash is not a legitimate part of the URL.
- text = pywikibot.replaceExcept(text, r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]', r'[\g<url> \g<label>]', exceptions)
+ text = pywikibot.replaceExcept(text,
+ r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]',
+ r'[\g<url> \g<label>]', exceptions)
# dash in external link, where the correct end of the URL can
# be detected from the file extension. It is very unlikely that
# this will cause mistakes.
- text = pywikibot.replaceExcept(text, r'\[(?P<url>https?://[^\|\] ]+?(\.pdf|\.html|\.htm|\.php|\.asp|\.aspx|\.jsp)) *\| *(?P<label>[^\|\]]+?)\]', r'[\g<url> \g<label>]', exceptions)
+ text = pywikibot.replaceExcept(text,
+ r'\[(?P<url>https?://[^\|\] ]+?(\.pdf|\.html|\.htm|\.php|\.asp|\.aspx|\.jsp)) *\| *(?P<label>[^\|\]]+?)\]',
+ r'[\g<url> \g<label>]', exceptions)
return text
def fixHtml(self, text):
@@ -439,11 +508,20 @@
text = pywikibot.replaceExcept(text,
r'(?i)([\r\n]) *<h%d> *([^<]+?) *</h%d> *([\r\n])'%(level, level),
r'%s'%equals, exceptions)
- #remove empty <ref/>-tag
- text = pywikibot.replaceExcept(text, r'(?i)<ref\s*/>', r'', exceptions)
# TODO: maybe we can make the bot replace <p> tags with \r\n's.
return text
+ def fixReferences(self, text):
+ #http://en.wikipedia.org/wiki/User:AnomieBOT/source/tasks/OrphanReferenceFixer.pm
+ exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace']
+
+ # it should be name = " or name=" NOT name ="
+ text = re.sub(r'(?i)<ref +name(= *| *=)"', r'<ref name="', text)
+ #remove empty <ref/>-tag
+ text = pywikibot.replaceExcept(text, r'(?i)(<ref\s*/>|<ref *>\s*</ref>)', r'', exceptions)
+ text = pywikibot.replaceExcept(text, r'(?i)<ref\s+([^>]+?)\s*>\s*</ref>', r'<ref \1/>', exceptions)
+ return text
+
def fixStyle(self, text):
exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace']
# convert prettytable to wikitable class
@@ -466,60 +544,55 @@
return text
def fixArabicLetters(self, text):
- if self.site.lang=='ckb' or self.site.lang=='fa':
- exceptions = [
- 'gallery',
- 'hyperlink',
- 'interwiki',
- # but changes letters inside wikilinks
- #'link',
- 'math',
- 'pre',
- 'template',
- 'timeline',
- 'ref',
- 'source',
- 'startspace',
- 'inputbox',
- ]
- # do not change inside file links
- namespaces = list(self.site.namespace(6, all = True))
- pattern = re.compile(u'\[\[(' + '|'.join(namespaces) + '):.+?\..+?\]\]',
- re.UNICODE)
- exceptions.append(pattern)
- text = pywikibot.replaceExcept(text, u',', u'،', exceptions)
- if self.site.lang=='ckb':
- text = pywikibot.replaceExcept(text,
- ur'ه([.،_<\]\s])',
- ur'ە\1', exceptions)
- text = pywikibot.replaceExcept(text, u'ه', u'ە', exceptions)
- text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions)
- text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions)
- text = pywikibot.replaceExcept(text, ur'[ىي]', u'ی', exceptions)
- # replace persian digits
- for i in range(0,10):
- if self.site.lang=='ckb':
- text = pywikibot.replaceExcept(text,
- u'۰۱۲۳۴۵۶۷۸۹'[i],
- u'٠١٢٣٤٥٦٧٨٩'[i], exceptions)
- else:
- text = pywikibot.replaceExcept(text,
- u'٠١٢٣٤٥٦٧٨٩'[i],
- u'۰۱۲۳۴۵۶۷۸۹'[i], exceptions)
- # do not change digits in class, style and table params
- pattern = re.compile(u'=".*?"', re.UNICODE)
- exceptions.append(pattern)
- # do not change digits inside html-tags
- pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE)
- exceptions.append(pattern)
- exceptions.append('table') #exclude tables for now
- for i in range(0,10):
- if self.site.lang=='ckb':
- text = pywikibot.replaceExcept(text, str(i),
- u'٠١٢٣٤٥٦٧٨٩'[i], exceptions)
- else:
- text = pywikibot.replaceExcept(text, str(i),
- u'۰۱۲۳۴۵۶۷۸۹'[i], exceptions)
+ exceptions = [
+ 'gallery',
+ 'hyperlink',
+ 'interwiki',
+ # but changes letters inside wikilinks
+ #'link',
+ 'math',
+ 'pre',
+ 'template',
+ 'timeline',
+ 'ref',
+ 'source',
+ 'startspace',
+ 'inputbox',
+ ]
+ # valid digits
+ digits = {
+ 'ckb' : u'٠١٢٣٤٥٦٧٨٩',
+ 'fa' : u'۰۱۲۳۴۵۶۷۸۹'
+ }
+ new = digits.pop(self.site.lang)
+ # This only works if there are only two items in digits dict
+ old = digits[digits.keys()[0]]
+ # do not change inside file links
+ namespaces = list(self.site.namespace(6, all = True))
+ pattern = re.compile(u'\[\[(' + '|'.join(namespaces) + '):.+?\..+?\]\]',
+ re.UNICODE)
+ exceptions.append(pattern)
+ text = pywikibot.replaceExcept(text, u',', u'،', exceptions)
+ if self.site.lang=='ckb':
+ text = pywikibot.replaceExcept(text,
+ ur'ه([.،_<\]\s])',
+ ur'ە\1', exceptions)
+ text = pywikibot.replaceExcept(text, u'ه', u'ە', exceptions)
+ text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions)
+ text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions)
+ text = pywikibot.replaceExcept(text, ur'[ىي]', u'ی', exceptions)
+ # replace persian digits
+ for i in range(0,10):
+ text = pywikibot.replaceExcept(text, old[i], new[i], exceptions)
+ # do not change digits in class, style and table params
+ pattern = re.compile(u'\w+=(".+?"|\d+)', re.UNICODE)
+ exceptions.append(pattern)
+ # do not change digits inside html-tags
+ pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE)
+ exceptions.append(pattern)
+ exceptions.append('table') #exclude tables for now
+ for i in range(0,10):
+ text = pywikibot.replaceExcept(text, str(i), new[i], exceptions)
return text
# Retrieved from "http://commons.wikimedia.org/wiki/Commons:Tools/pywiki_file_description_cle…"
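The new fixReferences() step in the rev 10288 hunk above chains three rewrites. A plain-re approximation on invented input (the real method routes the last two substitutions through pywikibot.replaceExcept so that nowiki, comment, math, pre and source spans stay untouched):

    import re

    text = u'a<ref name ="n">b</ref> c<ref/> d<ref name="n" ></ref>'
    # normalize the spacing around = to <ref name="...
    text = re.sub(r'(?i)<ref +name(= *| *=)"', r'<ref name="', text)
    # drop empty references
    text = re.sub(r'(?i)(<ref\s*/>|<ref *>\s*</ref>)', r'', text)
    # collapse <ref attrs></ref> into the self-closing form
    text = re.sub(r'(?i)<ref\s+([^>]+?)\s*>\s*</ref>', r'<ref \1/>', text)
    print text  # a<ref name="n">b</ref> c d<ref name="n"/>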
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10287
Revision: 10287
Author: xqt
Date: 2012-06-03 14:05:32 +0000 (Sun, 03 Jun 2012)
Log Message:
-----------
fix page generator from the rewrite branch
Modified Paths:
--------------
trunk/pywikipedia/cosmetic_changes.py
Modified: trunk/pywikipedia/cosmetic_changes.py
===================================================================
--- trunk/pywikipedia/cosmetic_changes.py 2012-06-03 13:14:45 UTC (rev 10286)
+++ trunk/pywikipedia/cosmetic_changes.py 2012-06-03 14:05:32 UTC (rev 10287)
@@ -439,7 +439,7 @@
# Remove unnecessary initial and final spaces from label.
# Please note that some editors prefer spaces around pipes.
- #(See [[en:Wikipedia:Semi-bots]]). We remove them anyway.
+ # (See [[en:Wikipedia:Semi-bots]]). We remove them anyway.
if label is not None:
# Remove unnecessary leading spaces from label,
# but remember if we did this because we want
@@ -818,7 +818,7 @@
# Highlight the title in purple.
pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
% page.title())
- ccToolkit = CosmeticChangesToolkit(page.site(), debug=True,
+ ccToolkit = CosmeticChangesToolkit(page.site, debug=True,
namespace=page.namespace(),
pageTitle=page.title())
changedText = ccToolkit.change(page.get())
@@ -883,8 +883,7 @@
'cosmetic_changes-standalone')
if pageTitle:
site = pywikibot.getSite()
- page = pywikibot.Page(site, ' '.join(pageTitle))
- gen = iter([page])
+ gen = iter([pywikibot.Page(site, t) for t in pageTitle])
if not gen:
gen = genFactory.getCombinedGenerator()
if not gen:
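For illustration, the effect of this fix with invented -page arguments: the old code joined every collected title into one page, the fixed code yields one page per title.

    import pywikibot

    site = pywikibot.getSite()
    pageTitle = [u'Foo', u'Bar']            # invented command-line titles
    # old behaviour: the single page [[Foo Bar]]
    old_gen = iter([pywikibot.Page(site, ' '.join(pageTitle))])
    # fixed behaviour: two pages, [[Foo]] and [[Bar]]
    new_gen = iter([pywikibot.Page(site, t) for t in pageTitle])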