http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11312
Revision: 11312
Author: xqt
Date: 2013-04-01 10:37:24 +0000 (Mon, 01 Apr 2013)
Log Message:
-----------
PLURAL support for pywikibot.translate() method.
For additional help please refer to pywikibot.twntranslate()
Modified Paths:
--------------
trunk/pywikipedia/pywikibot/i18n.py
Modified: trunk/pywikipedia/pywikibot/i18n.py
===================================================================
--- trunk/pywikipedia/pywikibot/i18n.py 2013-03-31 17:05:13 UTC (rev 11311)
+++ trunk/pywikipedia/pywikibot/i18n.py 2013-04-01 10:37:24 UTC (rev 11312)
@@ -3,34 +3,38 @@
and for TranslateWiki-based translations
"""
#
-# (C) Pywikipedia bot team, 2004-2012
+# (C) Pywikipedia bot team, 2004-2013
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id$'
-import re, sys
+import re
+import sys
import locale
from pywikibot.exceptions import Error
import wikipedia as pywikibot
import config
-# Languages to use for comment text after the actual language but before
-# en:. For example, if for language 'xx', you want the preference of
-# languages to be:
-# xx:, then fr:, then ru:, then en:
-# you let altlang return ['fr','ru'].
-# This code is used by translate() and twtranslate() below.
+PLURAL_PATTERN = '{{PLURAL:(?:%\()?([^\)]*?)(?:\)d)?\|(.*?)}}'
+
def _altlang(code):
"""Define fallback languages for particular languages.
+ @param code The language code
+ @type code string
+ @return a list of strings as language codes
+
If no translation is available to a specified language, translate() will
try each of the specified fallback languages, in order, until it finds
one with a translation, with 'en' and '_default' as a last resort.
For example, if for language 'xx', you want the preference of languages
to be: xx > fr > ru > en, you let altlang return ['fr', 'ru'].
+
+ This code is used by other translating methods below.
+
"""
#Akan
if code in ['ak', 'tw']:
@@ -64,7 +68,7 @@
return ['de', 'pl']
if code == 'rm':
return ['de', 'it']
- if code =='stq':
+ if code == 'stq':
return ['nds', 'de']
#Greek
if code in ['grc', 'pnt']:
@@ -73,7 +77,8 @@
if code in ['io', 'nov']:
return ['eo']
#Spanish
- if code in ['an', 'arn', 'ast', 'ay', 'ca', 'ext', 'lad', 'nah', 'nv', 'qu', 'yua']:
+ if code in ['an', 'arn', 'ast', 'ay', 'ca', 'ext', 'lad', 'nah', 'nv', 'qu',
+ 'yua']:
return ['es']
if code in ['gl', 'gn']:
return ['es', 'pt']
@@ -144,9 +149,9 @@
if code in ['mo', 'roa-rup']:
return ['ro']
#Russian and Belarusian
- if code in ['ab', 'av', 'ba', 'bxr', 'ce', 'cv', 'inh', 'kk', 'koi', 'krc', 'kv',
- 'ky', 'lbe', 'lez', 'mdf', 'mhr', 'mn', 'mrj', 'myv', 'os', 'sah',
- 'tg', 'udm', 'uk', 'xal']:
+ if code in ['ab', 'av', 'ba', 'bxr', 'ce', 'cv', 'inh', 'kk', 'koi', 'krc',
+ 'kv', 'ky', 'lbe', 'lez', 'mdf', 'mhr', 'mn', 'mrj', 'myv',
+ 'os', 'sah', 'tg', 'udm', 'uk', 'xal']:
return ['ru']
if code in ['kbd', 'ady']:
return ['kbd', 'ady', 'ru']
@@ -157,7 +162,7 @@
if code == 'kaa':
return ['uz', 'ru']
#Serbocroatian
- if code in ['bs', 'hr', 'sh',]:
+ if code in ['bs', 'hr', 'sh']:
return ['sh', 'hr', 'bs', 'sr', 'sr-el']
if code == 'sr':
return ['sr-el', 'sh', 'hr', 'bs']
@@ -212,9 +217,27 @@
#Default value
return []
-def translate(code, xdict, fallback=True):
+
+class TranslationError(Error):
+ """ Raised when no correct translation could be found """
+ pass
+
+
+def translate(code, xdict, parameters=None, fallback=True):
"""Return the most appropriate translation from a translation dict.
+ @param code The language code
+ @type code string or Site object
+ @param xdict dictionary with language codes as keys or extended dictionary
+ with family names as keys containing language dictionaries or
+ a single (unicode) string. May contain PLURAL tags as described
+ in twntranslate
+ @type xdict dict, string, unicode
+ @param parameters For passing (plural) parameters
+ @type parameters dict, string, unicode, int
+ @param fallback Try an alternate language code
+ @type fallback boolean
+
Given a language code and a dictionary, returns the dictionary's value for
key 'code' if this key exists; otherwise tries to return a value for an
alternative language that is most applicable to use on the Wikipedia in
@@ -225,7 +248,13 @@
the options gives result, we just take the first language in the
list.
+ For PLURAL support have a look at the twntranslate method
+
"""
+ param = None
+ if type(parameters) == dict:
+ param = parameters
+
family = pywikibot.default_family
# If a site is given instead of a code, use its language
if hasattr(code, 'lang'):
@@ -237,27 +266,62 @@
xdict = xdict[family]
elif 'wikipedia' in xdict:
xdict = xdict['wikipedia']
+
+ # Get the translated string
+ trans = None
if type(xdict) != dict:
- return xdict
+ trans = xdict
+ elif code in xdict:
+ trans = xdict[code]
+ elif fallback:
+ for alt in _altlang(code) + ['_default', 'en']:
+ if alt in xdict:
+ trans = xdict[alt]
+ break
+ else:
+ trans = xdict.values()[0]
+ if not trans:
+ return # return None if we have no translation found
+ if parameters is None:
+ return trans
- if code in xdict:
- return xdict[code]
- if not fallback:
- return None
- for alt in _altlang(code):
- if alt in xdict:
- return xdict[alt]
- if '_default' in xdict:
- return xdict['_default']
- if 'en' in xdict:
- return xdict['en']
- return xdict.values()[0]
+ # else we check for PLURAL variants
+ try:
+ selector, variants = re.search(PLURAL_PATTERN, trans).groups()
+ except AttributeError:
+ pass
+ else: # we found PLURAL patterns, process it
+ # no python 2.5 support anymore but we won't break old code
+ # therefore we import plural_rules here
+ from plural import plural_rules
+ if type(parameters) == dict:
+ num = param[selector]
+ elif isinstance(parameters, basestring):
+ num = int(parameters)
+ else:
+ num = parameters
+ # we only need the lang or _default, not a _altlang code
+ # TODO: check against plural_rules[lang]['nplurals']
+ try:
+ index = plural_rules[code]['plural'](num)
+ print 1, num, index
+ except KeyError:
+ index = plural_rules['_default']['plural'](num)
+ print 2, num, index
+ except TypeError:
+ # we got an int, not a function
+ index = plural_rules[code]['plural']
+ print 3, index
+ trans = re.sub(PLURAL_PATTERN, variants.split('|')[index], trans)
+ if param:
+ try:
+ return trans % param
+ except KeyError:
+ # parameter is for PLURAL variants only, don't change the string
+ pass
+ return trans
-class TranslationError(Error):
- """ Raised when no correct translation could be found """
- pass
-
def twtranslate(code, twtitle, parameters=None):
""" Uses TranslateWiki files to provide translations based on the TW title
twtitle, which corresponds to a page on TW.
@@ -301,7 +365,8 @@
except KeyError:
continue
if not trans:
- raise TranslationError("No English translation has been defined for TranslateWiki key %r" % twtitle)
+ raise TranslationError("No English translation has been defined "
+ "for TranslateWiki key %r" % twtitle)
# send the language code back via the given list
if code_needed:
code.append(lang)
@@ -310,6 +375,7 @@
else:
return trans
+
# Maybe this function should be merged with twtranslate
def twntranslate(code, twtitle, parameters=None):
""" First implementation of plural support for translations based on the
@@ -362,8 +428,8 @@
The translations are retrieved from i18n.<package>, based on the callers
import table.
+
"""
- PATTERN = '{{PLURAL:(?:%\()?([^\)]*?)(?:\)d)?\|(.*?)}}'
param = None
if type(parameters) == dict:
param = parameters
@@ -374,14 +440,14 @@
code = [code]
trans = twtranslate(code, twtitle, None)
try:
- selector, variants = re.search(PATTERN, trans).groups()
+ selector, variants = re.search(PLURAL_PATTERN, trans).groups()
# No PLURAL tag found: nothing to replace
except AttributeError:
pass
else:
if type(parameters) == dict:
num = param[selector]
- elif type(parameters) == basestring:
+ elif isinstance(parameters, basestring):
num = int(parameters)
else:
num = parameters
@@ -391,7 +457,7 @@
# to use plural.py - use _default rules for all
if sys.version_info < (2, 5):
plural_func = lambda n: (n != 1)
- else:
+ else:
from plural import plural_rules
# we only need the lang or _default, not a _altlang code
# maybe we should implement this to i18n.translate()
@@ -404,7 +470,7 @@
# we got an int
index = plural_rules[lang]['plural']
repl = variants.split('|')[index]
- trans = re.sub(PATTERN, repl, trans)
+ trans = re.sub(PLURAL_PATTERN, repl, trans)
if param:
try:
return trans % param
@@ -412,6 +478,7 @@
pass
return trans
+
def twhas_key(code, twtitle):
""" Uses TranslateWiki files to to check whether specified translation
based on the TW title is provided. No code fallback is made.
@@ -429,6 +496,7 @@
code = code.lang
return code in transdict and twtitle in transdict[code]
+
def input(twtitle, parameters=None, password=False):
""" Ask the user a question, return the user's answer.
@param twtitle The TranslateWiki string title, in <package>-<key> format
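A minimal usage sketch for the PLURAL-aware translate() added in r11312. The message dictionary, the parameter name 'num' and the expected results are illustrative assumptions; only the call signature translate(code, xdict, parameters=None, fallback=True) comes from the diff above, and the results assume the default plural rule (n != 1).
# -*- coding: utf-8 -*-
# Sketch only; assumes a configured trunk pywikipedia installation.
from pywikibot import i18n

msg = {
    'en': u'Bot: changed %(num)d {{PLURAL:%(num)d|page|pages}}',
    'de': u'Bot: %(num)d {{PLURAL:%(num)d|Seite|Seiten}} geändert',
}

# With a dict parameter the PLURAL selector ('num') is looked up by name and
# the same dict is afterwards applied to the message with the % operator.
print i18n.translate('de', msg, parameters={'num': 1})  # -> u'Bot: 1 Seite geändert'
print i18n.translate('de', msg, parameters={'num': 5})  # -> u'Bot: 5 Seiten geändert'

# A code without its own entry falls back via _altlang(), then '_default'
# and 'en' (fallback=True is the default).
print i18n.translate('nds', msg, parameters={'num': 2})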
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11309
Revision: 11309
Author: legoktm
Date: 2013-03-31 16:44:16 +0000 (Sun, 31 Mar 2013)
Log Message:
-----------
Split getting a property's datatype into its own request so it can be cached.
Since a property's datatype will never change, we can safely cache it "forever".
I also fixed it so that we don't guess the datatype based on what was returned in
a "datavalue" object, since that is unreliable.
Modified Paths:
--------------
branches/rewrite/pywikibot/page.py
branches/rewrite/pywikibot/site.py
Modified: branches/rewrite/pywikibot/page.py
===================================================================
--- branches/rewrite/pywikibot/page.py 2013-03-30 16:56:02 UTC (rev 11308)
+++ branches/rewrite/pywikibot/page.py 2013-03-31 16:44:16 UTC (rev 11309)
@@ -2515,9 +2515,7 @@
Examples: item, commons media file, StringValue, NumericalValue
"""
if not hasattr(self, 'type'):
- self.get()
- if self.type == 'wikibase-entityid':
- self.type = 'wikibase-item'
+ self.type = self.repo.getPropertyType(self)
return self.type
@@ -2561,8 +2559,7 @@
claim.isReference = True
claim.snaktype = data['mainsnak']['snaktype']
if claim.getSnakType() == 'value':
- claim.type = data['mainsnak']['datavalue']['type']
- if claim.type == 'wikibase-entityid':
+ if claim.getType() == 'wikibase-item':
claim.target = ItemPage(site, 'Q' +
str(data['mainsnak']['datavalue']['value']['numeric-id']))
else:
Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py 2013-03-30 16:56:02 UTC (rev 11308)
+++ branches/rewrite/pywikibot/site.py 2013-03-31 16:44:16 UTC (rev 11309)
@@ -14,6 +14,7 @@
from hashlib import md5
except ImportError:
from md5 import md5
+import datetime
import itertools
import os
import re
@@ -3331,6 +3332,21 @@
raise pywikibot.data.api.APIError, data['errors']
return data['entities']
+ def getPropertyType(self, prop):
+ """
+ This is used specifically because we can cache
+ the value for a much longer time (near infinite).
+ """
+ params = dict(action='wbgetentities',
+ ids=prop.getID(),
+ props='datatype',
+ )
+ expiry = datetime.timedelta(days=365*100)
+ #Store it for 100 years
+ req = api.CachedRequest(expiry, site=self, **params)
+ data = req.submit()
+ return data['entities'][prop.getID()]['datatype']
+
def editEntity(self, identification, data, **kwargs):
params = dict(**identification)
params['action'] = 'wbeditentity'
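A short sketch of how the cached datatype lookup from r11309 is meant to be used on the rewrite branch. The way the repository site is obtained and the property id 'P18' are assumptions for illustration; getPropertyType() and getType() are the methods added or changed in the diff above.
# Sketch only; assumes a configured rewrite installation with the Wikidata
# repository reachable as pywikibot.Site('wikidata', 'wikidata').
import pywikibot

repo = pywikibot.Site('wikidata', 'wikidata')
prop = pywikibot.PropertyPage(repo, 'P18')   # 'P18' is just an example id

# The first call sends action=wbgetentities&props=datatype through a
# CachedRequest with a ~100 year expiry; later calls for the same property
# are answered from the cache, since a datatype never changes.
print repo.getPropertyType(prop)

# getType() now delegates to that cached lookup instead of calling get()
# and guessing the type from a returned 'datavalue' object.
print prop.getType()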
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11307
Revision: 11307
Author: xqt
Date: 2013-03-30 16:42:04 +0000 (Sat, 30 Mar 2013)
Log Message:
-----------
some PEP8 changes
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2013-03-30 16:21:10 UTC (rev 11306)
+++ trunk/pywikipedia/wikipedia.py 2013-03-30 16:42:04 UTC (rev 11307)
@@ -4717,6 +4717,7 @@
Caches the HTML code, so that if you run this method twice on the
same ImagePage object, the page will only be downloaded once.
+
"""
if not self._imagePageHtml:
path = self.site().get_address(self.urlname())
@@ -4758,7 +4759,8 @@
for info in pageInfo['imageinfo']:
count += 1
if count == 1 and 'iistart' not in params:
- # count 1 and no iicontinue mean first image revision is latest.
+ # count 1 and no iicontinue mean first image revision
+ # is latest.
self._latestInfo = info
infos.append(info)
if limit == 1:
@@ -4769,7 +4771,7 @@
else:
break
except KeyError:
- output("Not image in imagepage")
+ output("No image in imagepage")
self._infoLoaded = True
if limit > 1:
return infos
@@ -4814,8 +4816,8 @@
return self.fileUrl().startswith(u'http://wikitravel.org/upload/shared/')
return self.fileIsOnCommons()
- # FIXME: MD5 might be performed on incomplete file due to server disconnection
- # (see bug #1795683).
+ # FIXME: MD5 might be performed on incomplete file due to server
+ # disconnection (see bug #1795683).
def getFileMd5Sum(self):
"""Return image file's MD5 checksum."""
f = MyURLopener.open(self.fileUrl())
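The FIXME kept above concerns hashing a download that may have been cut off by a server disconnection. Below is a generic illustration of the problem using only hashlib and urllib2, not pywikipedia code; the URL is a placeholder and the length check is one possible guard, not the fix applied in this commit.
import hashlib
import urllib2

def md5_of_url(url, chunk_size=65536):
    """Return the MD5 hex digest of url, refusing a truncated download."""
    response = urllib2.urlopen(url)
    expected = response.info().get('Content-Length')
    md5 = hashlib.md5()
    received = 0
    while True:
        chunk = response.read(chunk_size)
        if not chunk:
            break
        received += len(chunk)
        md5.update(chunk)
    if expected is not None and received != int(expected):
        raise IOError('truncated download: got %d of %s bytes'
                      % (received, expected))
    return md5.hexdigest()

print md5_of_url('http://upload.wikimedia.org/wikipedia/commons/a/a9/Example.jpg')  # placeholder URL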
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11306
Revision: 11306
Author: xqt
Date: 2013-03-30 16:21:10 +0000 (Sat, 30 Mar 2013)
Log Message:
-----------
PEP8 changes, update from trunk r11300, r11301, r11305, r10028
Modified Paths:
--------------
branches/rewrite/pywikibot/textlib.py
Modified: branches/rewrite/pywikibot/textlib.py
===================================================================
--- branches/rewrite/pywikibot/textlib.py 2013-03-30 16:17:48 UTC (rev 11305)
+++ branches/rewrite/pywikibot/textlib.py 2013-03-30 16:21:10 UTC (rev 11306)
@@ -7,7 +7,7 @@
"""
#
-# (C) Pywikipedia bot team, 2008-2011
+# (C) Pywikipedia bot team, 2008-2013
#
# Distributed under the terms of the MIT license.
#
@@ -16,11 +16,10 @@
import pywikibot
import re
-
-from pywikibot.i18n import translate
from HTMLParser import HTMLParser
import config2 as config
+
def unescape(s):
"""Replace escaped HTML-special characters by their originals"""
if '&' not in s:
@@ -29,12 +28,12 @@
s = s.replace(">", ">")
s = s.replace("'", "'")
s = s.replace(""", '"')
- s = s.replace("&", "&") # Must be last
+ s = s.replace("&", "&") # Must be last
return s
def replaceExcept(text, old, new, exceptions, caseInsensitive=False,
- allowoverlap=False, marker = '', site = None):
+ allowoverlap=False, marker='', site=None):
"""
Return text with 'old' replaced by 'new', ignoring specified types of text.
@@ -61,21 +60,21 @@
site = pywikibot.getSite()
exceptionRegexes = {
- 'comment': re.compile(r'(?s)<!--.*?-->'),
+ 'comment': re.compile(r'(?s)<!--.*?-->'),
# section headers
- 'header': re.compile(r'\r?\n=+.+=+ *\r?\n'),
+ 'header': re.compile(r'\r?\n=+.+=+ *\r?\n'),
# preformatted text
- 'pre': re.compile(r'(?ism)<pre>.*?</pre>'),
- 'source': re.compile(r'(?is)<source .*?</source>'),
+ 'pre': re.compile(r'(?ism)<pre>.*?</pre>'),
+ 'source': re.compile(r'(?is)<source .*?</source>'),
# inline references
- 'ref': re.compile(r'(?ism)<ref[ >].*?</ref>'),
+ 'ref': re.compile(r'(?ism)<ref[ >].*?</ref>'),
# lines that start with a space are shown in a monospace font and
# have whitespace preserved.
- 'startspace': re.compile(r'(?m)^ (.*?)$'),
+ 'startspace': re.compile(r'(?m)^ (.*?)$'),
# tables often have whitespace that is used to improve wiki
# source code readability.
# TODO: handle nested tables.
- 'table': re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
+ 'table': re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
# templates with parameters often have whitespace that is used to
# improve wiki source code readability.
# 'template': re.compile(r'(?s){{.*?}}'),
@@ -83,22 +82,25 @@
# templates cascaded up to level 2, but no deeper. For arbitrary
# depth, we'd need recursion which can't be done in Python's re.
# After all, the language of correct parenthesis words is not regular.
- 'template': re.compile(r'(?s){{(({{.*?}})?.*?)*}}'),
- 'hyperlink': compileLinkR(),
- 'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
+ 'template': re.compile(r'(?s){{(({{.*?}})?.*?)*}}'),
+ 'hyperlink': compileLinkR(),
+ 'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
# this matches internal wikilinks, but also interwiki, categories, and
# images.
- 'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
+ 'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
# also finds links to foreign sites with preleading ":"
- 'interwiki': re.compile(r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*'
- % '|'.join(site.validLanguageLinks()
- + site.family.obsolete.keys())
- ),
+ 'interwiki': re.compile(r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*'
+ % '|'.join(site.validLanguageLinks() +
+ site.family.obsolete.keys())),
+ # Wikidata property inclusions
+ 'property': re.compile(r'(?i)\{\{\s*#property:\s*p\d+\s*\}\}'),
+ # Module invocations (currently only Lua)
+ 'invoke': re.compile(r'(?i)\{\{\s*#invoke:.*?}\}'),
}
# if we got a string, compile it as a regular expression
- if type(old) in [str, unicode]:
+ if isinstance(old, basestring):
if caseInsensitive:
old = re.compile(old, re.IGNORECASE | re.UNICODE)
else:
@@ -175,9 +177,14 @@
break
groupID = groupMatch.group('name') or \
int(groupMatch.group('number'))
- replacement = replacement[:groupMatch.start()] + \
- match.group(groupID) + \
- replacement[groupMatch.end():]
+ try:
+ replacement = replacement[:groupMatch.start()] + \
+ match.group(groupID) + \
+ replacement[groupMatch.end():]
+ except IndexError:
+ print '\nInvalid group reference:', groupID
+ print 'Groups found:\n', match.groups()
+ raise IndexError
text = text[:match.start()] + replacement + text[match.end():]
# continue the search on the remaining text
@@ -190,7 +197,7 @@
return text
-def removeDisabledParts(text, tags = ['*']):
+def removeDisabledParts(text, tags=['*']):
"""
Return text without portions where wiki markup is disabled
@@ -205,12 +212,12 @@
"""
regexes = {
- 'comments' : r'<!--.*?-->',
- 'includeonly': r'<includeonly>.*?</includeonly>',
- 'nowiki': r'<nowiki>.*?</nowiki>',
- 'pre': r'<pre>.*?</pre>',
- 'source': r'<source .*?</source>',
- 'syntaxhighlight': r'<syntaxhighlight .*?</syntaxhighlight>',
+ 'comments': r'<!--.*?-->',
+ 'includeonly': r'<includeonly>.*?</includeonly>',
+ 'nowiki': r'<nowiki>.*?</nowiki>',
+ 'pre': r'<pre>.*?</pre>',
+ 'source': r'<source .*?</source>',
+ 'syntaxhighlight': r'<syntaxhighlight .*?</syntaxhighlight>',
}
if '*' in tags:
tags = regexes.keys()
@@ -223,7 +230,7 @@
return toRemoveR.sub('', text)
-def removeHTMLParts(text, keeptags = ['tt', 'nowiki', 'small', 'sup']):
+def removeHTMLParts(text, keeptags=['tt', 'nowiki', 'small', 'sup']):
"""
Return text without portions where HTML markup is disabled
@@ -232,9 +239,9 @@
The exact set of parts which should NOT be removed can be passed as the
'keeptags' parameter, which defaults to ['tt', 'nowiki', 'small', 'sup'].
+
"""
# try to merge with 'removeDisabledParts()' above into one generic function
-
# thanks to http://www.hellboundhackers.org/articles/841-using-python-39;s-htmlparser-c…
parser = _GetDataHTML()
parser.keeptags = keeptags
@@ -242,6 +249,7 @@
parser.close()
return parser.textdata
+
# thanks to http://docs.python.org/library/htmlparser.html
class _GetDataHTML(HTMLParser):
textdata = u''
@@ -251,17 +259,19 @@
self.textdata += data
def handle_starttag(self, tag, attrs):
- if tag in self.keeptags: self.textdata += u"<%s>" % tag
+ if tag in self.keeptags:
+ self.textdata += u"<%s>" % tag
def handle_endtag(self, tag):
- if tag in self.keeptags: self.textdata += u"</%s>" % tag
+ if tag in self.keeptags:
+ self.textdata += u"</%s>" % tag
-def isDisabled(text, index, tags = ['*']):
+def isDisabled(text, index, tags=['*']):
"""
Return True if text[index] is disabled, e.g. by a comment or by nowiki tags.
+ For the tags parameter, see removeDisabledParts() above.
- For the tags parameter, see removeDisabledParts() above.
"""
# Find a marker that is not already in the text.
marker = findmarker(text, '@@', '@')
@@ -270,9 +280,9 @@
return (marker not in text)
-def findmarker(text, startwith = u'@', append = u'@'):
+def findmarker(text, startwith=u'@', append=None):
# find a string which is not part of text
- if len(append) <= 0:
+ if not append:
append = u'@'
mymarker = startwith
while mymarker in text:
@@ -280,7 +290,7 @@
return mymarker
-def expandmarker(text, marker = '', separator = ''):
+def expandmarker(text, marker='', separator=''):
# set to remove any number of separator occurrences plus arbitrary
# whitespace before, after, and between them,
# by allowing to include them into marker.
@@ -292,8 +302,8 @@
while firstinseparator > 0 and striploopcontinue:
striploopcontinue = False
if (firstinseparator >= lenseparator) and \
- (separator == text[firstinseparator - \
- lenseparator : firstinseparator]):
+ (separator == text[firstinseparator -
+ lenseparator:firstinseparator]):
firstinseparator -= lenseparator
striploopcontinue = True
elif text[firstinseparator-1] < ' ':
@@ -302,6 +312,7 @@
marker = text[firstinseparator:firstinmarker] + marker
return marker
+
#-------------------------------------------------
# Functions dealing with interwiki language links
#-------------------------------------------------
@@ -324,7 +335,8 @@
# do not find or change links of other kinds, nor any that are formatted
# as in-line interwiki links (e.g., "[[:es:Articulo]]".
-def getLanguageLinks(text, insite=None, pageLink="[[]]", template_subpage=False):
+def getLanguageLinks(text, insite=None, pageLink="[[]]",
+ template_subpage=False):
"""
Return a dict of interlanguage links found in text.
@@ -336,7 +348,8 @@
if insite is None:
insite = pywikibot.getSite()
fam = insite.family
- # when interwiki links forward to another family, retrieve pages & other infos there
+ # when interwiki links forward to another family, retrieve pages & other
+ # infos there
if fam.interwiki_forward:
fam = pywikibot.Family(fam.interwiki_forward)
result = {}
@@ -351,8 +364,10 @@
# interwiki link.
# NOTE: language codes are case-insensitive and only consist of basic latin
# letters and hyphens.
- #TODO: currently, we do not have any, but BCP 47 allows digits, and underscores.
- #TODO: There is no semantic difference between hyphens and underscores -> fold them.
+ # TODO: currently, we do not have any, but BCP 47 allows digits, and
+ # underscores.
+ # TODO: There is no semantic difference between hyphens and
+ # underscores -> fold them.
interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
for lang, pagetitle in interwikiR.findall(text):
lang = lang.lower()
@@ -369,14 +384,14 @@
try:
result[site] = pywikibot.Page(site, pagetitle, insite=insite)
except pywikibot.InvalidTitle:
- pywikibot.output(
- u"[getLanguageLinks] Text contains invalid interwiki link [[%s:%s]]."
- % (lang, pagetitle))
+ pywikibot.output(u'[getLanguageLinks] Text contains invalid '
+ u'interwiki link [[%s:%s]].'
+ % (lang, pagetitle))
continue
return result
-def removeLanguageLinks(text, site = None, marker = ''):
+def removeLanguageLinks(text, site=None, marker=''):
"""Return text with all interlanguage links removed.
If a link to an unknown language is encountered, a warning is printed.
@@ -391,7 +406,8 @@
return text
# This regular expression will find every interwiki link, plus trailing
# whitespace.
- languages = '|'.join(site.validLanguageLinks() + site.family.obsolete.keys())
+ languages = '|'.join(site.validLanguageLinks() +
+ site.family.obsolete.keys())
interwikiR = re.compile(r'\[\[(%s)\s?:[^\[\]\n]*\]\][\s]*'
% languages, re.IGNORECASE)
text = replaceExcept(text, interwikiR, '',
@@ -421,7 +437,7 @@
def replaceLanguageLinks(oldtext, new, site=None, addOnly=False,
- template=False, template_subpage=False):
+ template=False, template_subpage=False):
"""Replace interlanguage links in the text with a new set of links.
'new' should be a dict with the Site objects as keys, and Page or Link
@@ -442,7 +458,7 @@
else:
s2 = removeLanguageLinksAndSeparator(oldtext, site=site, marker=marker,
separator=separatorstripped)
- s = interwikiFormat(new, insite = site)
+ s = interwikiFormat(new, insite=site)
if s:
if site.language() in site.family.interwiki_attop or \
u'<!-- interwiki at top -->' in oldtext:
@@ -462,28 +478,28 @@
if "</noinclude>" in s2[firstafter:]:
if separatorstripped:
s = separator + s
- newtext = s2[:firstafter].replace(marker,'') + s \
- + s2[firstafter:]
+ newtext = s2[:firstafter].replace(marker, '') + s + \
+ s2[firstafter:]
elif site.language() in site.family.categories_last:
- cats = getCategoryLinks(s2, site = site)
+ cats = getCategoryLinks(s2, site=site)
s2 = removeCategoryLinksAndSeparator(
- s2.replace(marker, cseparatorstripped).strip(),
- site) + separator + s
+ s2.replace(marker, cseparatorstripped).strip(), site) + \
+ separator + s
newtext = replaceCategoryLinks(s2, cats, site=site,
addOnly=True)
# for Wikitravel's language links position.
# (not supported by rewrite - no API)
elif site.family.name == 'wikitravel':
s = separator + s + separator
- newtext = s2[:firstafter].replace(marker,'') + s + \
+ newtext = s2[:firstafter].replace(marker, '') + s + \
s2[firstafter:]
else:
if template or template_subpage:
if template_subpage:
- includeOn = '<includeonly>'
+ includeOn = '<includeonly>'
includeOff = '</includeonly>'
else:
- includeOn = '<noinclude>'
+ includeOn = '<noinclude>'
includeOff = '</noinclude>'
separator = ''
# Do we have a noinclude at the end of the template?
@@ -495,16 +511,16 @@
newtext = regexp.sub(s + includeOff, s2)
else:
# Put the langlinks at the end, inside noinclude's
- newtext = s2.replace(marker,'').strip() + separator + \
+ newtext = s2.replace(marker, '').strip() + separator + \
u'%s\n%s%s\n' % (includeOn, s, includeOff)
else:
- newtext = s2.replace(marker,'').strip() + separator + s
+ newtext = s2.replace(marker, '').strip() + separator + s
else:
- newtext = s2.replace(marker,'')
+ newtext = s2.replace(marker, '')
return newtext
-def interwikiFormat(links, insite = None):
+def interwikiFormat(links, insite=None):
"""Convert interwiki link dict into a wikitext string.
'links' should be a dict with the Site objects as keys, and Page
@@ -512,6 +528,7 @@
Return a unicode string that is formatted for inclusion in insite
(defaulting to the current site).
+
"""
if insite is None:
insite = pywikibot.getSite()
@@ -530,16 +547,16 @@
sep = u' '
else:
sep = config.line_separator
- s=sep.join(s) + config.line_separator
+ s = sep.join(s) + config.line_separator
return s
# Sort sites according to local interwiki sort logic
-def interwikiSort(sites, insite = None):
+def interwikiSort(sites, insite=None):
+ if not sites:
+ return []
if insite is None:
- insite = pywikibot.getSite()
- if not sites:
- return []
+ insite = pywikibot.getSite()
sites.sort()
putfirst = insite.interwiki_putfirst()
@@ -547,11 +564,8 @@
#In this case I might have to change the order
firstsites = []
for code in putfirst:
- # The code may not exist in this family?
-## if code in insite.family.obsolete:
-## code = insite.family.obsolete[code]
if code in insite.validLanguageLinks():
- site = insite.getSite(code = code)
+ site = insite.getSite(code=code)
if site in sites:
del sites[sites.index(site)]
firstsites = firstsites + [site]
@@ -561,6 +575,7 @@
sites = insite.interwiki_putfirst_doubled(sites) + sites
return sites
+
#---------------------------------------
# Functions dealing with category links
#---------------------------------------
@@ -588,7 +603,7 @@
'%s:%s' % (match.group('namespace'),
match.group('catName')),
site),
- sortKey = match.group('sortKey'))
+ sortKey=match.group('sortKey'))
result.append(cat)
return result
@@ -658,9 +673,10 @@
# spaces and underscores in page titles are interchangeable and collapsible
title = title.replace(r"\ ", "[ _]+").replace(r"\_", "[ _]+")
categoryR = re.compile(r'\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^]]+)?\]\])'
- % (catNamespace, title), re.I)
- categoryRN = re.compile(r'^[^\S\n]*\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^]]+)?\]\])[^\S\n]*\n'
- % (catNamespace, title), re.I | re.M)
+ % (catNamespace, title), re.I)
+ categoryRN = re.compile(
+ r'^[^\S\n]*\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^]]+)?\]\])[^\S\n]*\n'
+ % (catNamespace, title), re.I | re.M)
if newcat is None:
""" First go through and try the more restrictive regex that removes
an entire line, if the category is the only thing on that line (this
@@ -679,7 +695,7 @@
return text
-def replaceCategoryLinks(oldtext, new, site = None, addOnly = False):
+def replaceCategoryLinks(oldtext, new, site=None, addOnly=False):
"""
Replace the category links given in the wikitext given
in oldtext by the new links given in new.
@@ -692,7 +708,7 @@
"""
# Find a marker that is not already in the text.
- marker = findmarker( oldtext, u'@@')
+ marker = findmarker(oldtext, u'@@')
if site is None:
site = pywikibot.getSite()
if site.sitename() == 'wikipedia:de' and "{{Personendaten" in oldtext:
@@ -711,7 +727,7 @@
else:
s2 = removeCategoryLinksAndSeparator(oldtext, site=site, marker=marker,
separator=separatorstripped)
- s = categoryFormat(new, insite = site)
+ s = categoryFormat(new, insite=site)
if s:
if site.language() in site.family.category_attop:
newtext = s + separator + s2
@@ -730,7 +746,7 @@
newtext = s2[:firstafter].replace(marker, '') + s + \
s2[firstafter:]
elif site.language() in site.family.categories_last:
- newtext = s2.replace(marker,'').strip() + separator + s
+ newtext = s2.replace(marker, '').strip() + separator + s
else:
interwiki = getLanguageLinks(s2)
s2 = removeLanguageLinksAndSeparator(s2.replace(marker, ''),
@@ -740,11 +756,11 @@
newtext = replaceLanguageLinks(s2, interwiki, site=site,
addOnly=True)
else:
- newtext = s2.replace(marker,'')
+ newtext = s2.replace(marker, '')
return newtext.strip()
-def categoryFormat(categories, insite = None):
+def categoryFormat(categories, insite=None):
"""Return a string containing links to all categories in a list.
'categories' should be a list of Category objects or strings
@@ -758,13 +774,14 @@
if insite is None:
insite = pywikibot.getSite()
- if isinstance(categories[0],basestring):
+ if isinstance(categories[0], basestring):
if categories[0][0] == '[':
catLinks = categories
else:
catLinks = ['[[Category:'+category+']]' for category in categories]
else:
- catLinks = [category.aslink(noInterwiki=True) for category in categories]
+ catLinks = [category.aslink(noInterwiki=True)
+ for category in categories]
if insite.category_on_one_line():
sep = ' '
@@ -774,6 +791,7 @@
#catLinks.sort()
return sep.join(catLinks) + config.line_separator
+
#---------------------------------------
# Functions dealing with external links
#---------------------------------------
@@ -796,9 +814,9 @@
# not allowed inside links. For example, in this wiki text:
# ''Please see http://www.example.org.''
# .'' shouldn't be considered as part of the link.
- regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd \
- + '](?=[' + notAtEnd+ ']*\'\')|http[s]?://[^' + notInside \
- + ']*[^' + notAtEnd + '])'
+ regex = r'(?P<url>http[s]?://[^%(notInside)s]*?[^%(notAtEnd)s]' \
+ r'(?=[%(notAtEnd)s]*\'\')|http[s]?://[^%(notInside)s]*' \
+ r'[^%(notAtEnd)s])' % {'notInside': notInside, 'notAtEnd': notAtEnd}
if withoutBracketed:
regex = r'(?<!\[)' + regex
@@ -807,6 +825,7 @@
linkR = re.compile(regex)
return linkR
+
#----------------------------------
# Functions dealing with templates
#----------------------------------
@@ -845,7 +864,7 @@
inside = {}
count = 0
Rtemplate = re.compile(
- ur'{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
+ ur'{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
Rmath = re.compile(ur'<math>[^<]+</math>')
Rmarker = re.compile(ur'%s(\d+)%s' % (marker, marker))
Rmarker2 = re.compile(ur'%s(\d+)%s' % (marker2, marker2))
@@ -892,8 +911,8 @@
for m2 in pywikibot.link_regex.finditer(paramString):
count2 += 1
text = m2.group(0)
- paramString = paramString.replace(text,
- '%s%d%s' % (marker2, count2, marker2))
+ paramString = paramString.replace(
+ text, '%s%d%s' % (marker2, count2, marker2))
links[count2] = text
# Parse string
markedParams = paramString.split('|')
@@ -927,22 +946,21 @@
You can use items from extract_templates_and_params here to get
an equivalent template wiki text (it may happen that the order
of the params changes).
+
"""
(template, params) = template_and_params
-
text = u''
for item in params:
- text += u'|%s=%s\n' % (item, params[item])
+ text += u'|%s=%s\n' % (item, params[item])
return u'{{%s\n%s}}' % (template, text)
+
#----------------------------------
# Page parsing functionality
#----------------------------------
def does_text_contain_section(pagetext, section):
- """ Determines whether the page text contains the given
- section title.
- """
+ """Determines whether the page text contains the given section title."""
m = re.search("=+[ ']*%s[ ']*=+" % re.escape(section), pagetext)
return bool(m)
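A small usage sketch for replaceExcept() with the exception keys shown in the diff, including the new 'property' and 'invoke' entries. The sample wikitext is invented, and a configured rewrite installation (user-config.py) is assumed because building the exception table needs a Site object for the 'interwiki' pattern.
# Sketch only; the exception names are keys of the exceptionRegexes dict above.
from pywikibot import textlib

text = (u"foo bar <!-- foo inside a comment -->\n"
        u"<nowiki>foo kept verbatim</nowiki>\n"
        u"{{#property:p18}} {{#invoke:Foo|bar}}\n")

# Replace 'foo'/'Foo' everywhere except inside comments, nowiki tags,
# Wikidata property inclusions and Lua module invocations.
new = textlib.replaceExcept(text, r'[Ff]oo', u'baz',
                            ['comment', 'nowiki', 'property', 'invoke'])
print new   # only the first 'foo' becomes 'baz'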
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11305
Revision: 11305
Author: xqt
Date: 2013-03-30 16:17:48 +0000 (Sat, 30 Mar 2013)
Log Message:
-----------
PEP8 changes
Modified Paths:
--------------
trunk/pywikipedia/pywikibot/textlib.py
Modified: trunk/pywikipedia/pywikibot/textlib.py
===================================================================
--- trunk/pywikipedia/pywikibot/textlib.py 2013-03-30 12:42:36 UTC (rev 11304)
+++ trunk/pywikipedia/pywikibot/textlib.py 2013-03-30 16:17:48 UTC (rev 11305)
@@ -19,6 +19,7 @@
from HTMLParser import HTMLParser
import config
+
def unescape(s):
"""Replace escaped HTML-special characters by their originals"""
if '&' not in s:
@@ -27,11 +28,12 @@
s = s.replace(">", ">")
s = s.replace("'", "'")
s = s.replace(""", '"')
- s = s.replace("&", "&") # Must be last
+ s = s.replace("&", "&") # Must be last
return s
+
def replaceExcept(text, old, new, exceptions, caseInsensitive=False,
- allowoverlap=False, marker = '', site = None):
+ allowoverlap=False, marker='', site=None):
"""
Return text with 'old' replaced by 'new', ignoring specified types of text.
@@ -88,9 +90,8 @@
'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
# also finds links to foreign sites with preleading ":"
'interwiki': re.compile(r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*'
- % '|'.join(site.validLanguageLinks()
- + site.family.obsolete.keys())
- ),
+ % '|'.join(site.validLanguageLinks() +
+ site.family.obsolete.keys())),
# Wikidata property inclusions
'property': re.compile(r'(?i)\{\{\s*#property:\s*p\d+\s*\}\}'),
# Module invocations (currently only Lua)
@@ -99,7 +100,7 @@
}
# if we got a string, compile it as a regular expression
- if type(old) in [str, unicode]:
+ if isinstance(old, basestring):
if caseInsensitive:
old = re.compile(old, re.IGNORECASE | re.UNICODE)
else:
@@ -196,7 +197,7 @@
return text
-def removeDisabledParts(text, tags = ['*']):
+def removeDisabledParts(text, tags=['*']):
"""
Return text without portions where wiki markup is disabled
@@ -211,12 +212,12 @@
"""
regexes = {
- 'comments' : r'<!--.*?-->',
- 'includeonly': r'<includeonly>.*?</includeonly>',
- 'nowiki': r'<nowiki>.*?</nowiki>',
- 'pre': r'<pre>.*?</pre>',
- 'source': r'<source .*?</source>',
- 'syntaxhighlight': r'<syntaxhighlight .*?</syntaxhighlight>',
+ 'comments': r'<!--.*?-->',
+ 'includeonly': r'<includeonly>.*?</includeonly>',
+ 'nowiki': r'<nowiki>.*?</nowiki>',
+ 'pre': r'<pre>.*?</pre>',
+ 'source': r'<source .*?</source>',
+ 'syntaxhighlight': r'<syntaxhighlight .*?</syntaxhighlight>',
}
if '*' in tags:
tags = regexes.keys()
@@ -229,7 +230,7 @@
return toRemoveR.sub('', text)
-def removeHTMLParts(text, keeptags = ['tt', 'nowiki', 'small', 'sup']):
+def removeHTMLParts(text, keeptags=['tt', 'nowiki', 'small', 'sup']):
"""
Return text without portions where HTML markup is disabled
@@ -238,9 +239,9 @@
The exact set of parts which should NOT be removed can be passed as the
'keeptags' parameter, which defaults to ['tt', 'nowiki', 'small', 'sup'].
+
"""
# try to merge with 'removeDisabledParts()' above into one generic function
-
# thanks to http://www.hellboundhackers.org/articles/841-using-python-39;s-htmlparser-c…
parser = _GetDataHTML()
parser.keeptags = keeptags
@@ -248,6 +249,7 @@
parser.close()
return parser.textdata
+
# thanks to http://docs.python.org/library/htmlparser.html
class _GetDataHTML(HTMLParser):
textdata = u''
@@ -257,17 +259,19 @@
self.textdata += data
def handle_starttag(self, tag, attrs):
- if tag in self.keeptags: self.textdata += u"<%s>" % tag
+ if tag in self.keeptags:
+ self.textdata += u"<%s>" % tag
def handle_endtag(self, tag):
- if tag in self.keeptags: self.textdata += u"</%s>" % tag
+ if tag in self.keeptags:
+ self.textdata += u"</%s>" % tag
-def isDisabled(text, index, tags = ['*']):
+def isDisabled(text, index, tags=['*']):
"""
Return True if text[index] is disabled, e.g. by a comment or by nowiki tags.
+ For the tags parameter, see removeDisabledParts() above.
- For the tags parameter, see removeDisabledParts() above.
"""
# Find a marker that is not already in the text.
marker = findmarker(text, '@@', '@')
@@ -276,9 +280,9 @@
return (marker not in text)
-def findmarker(text, startwith = u'@', append = u'@'):
+def findmarker(text, startwith=u'@', append=None):
# find a string which is not part of text
- if len(append) <= 0:
+ if not append:
append = u'@'
mymarker = startwith
while mymarker in text:
@@ -286,7 +290,7 @@
return mymarker
-def expandmarker(text, marker = '', separator = ''):
+def expandmarker(text, marker='', separator=''):
# set to remove any number of separator occurrences plus arbitrary
# whitespace before, after, and between them,
# by allowing to include them into marker.
@@ -298,8 +302,8 @@
while firstinseparator > 0 and striploopcontinue:
striploopcontinue = False
if (firstinseparator >= lenseparator) and \
- (separator == text[firstinseparator - \
- lenseparator : firstinseparator]):
+ (separator == text[firstinseparator -
+ lenseparator:firstinseparator]):
firstinseparator -= lenseparator
striploopcontinue = True
elif text[firstinseparator-1] < ' ':
@@ -308,6 +312,7 @@
marker = text[firstinseparator:firstinmarker] + marker
return marker
+
#-------------------------------------------------
# Functions dealing with interwiki language links
#-------------------------------------------------
@@ -330,7 +335,8 @@
# do not find or change links of other kinds, nor any that are formatted
# as in-line interwiki links (e.g., "[[:es:Articulo]]".
-def getLanguageLinks(text, insite=None, pageLink="[[]]", template_subpage=False):
+def getLanguageLinks(text, insite=None, pageLink="[[]]",
+ template_subpage=False):
"""
Return a dict of interlanguage links found in text.
@@ -342,7 +348,8 @@
if insite is None:
insite = pywikibot.getSite()
fam = insite.family
- # when interwiki links forward to another family, retrieve pages & other infos there
+ # when interwiki links forward to another family, retrieve pages & other
+ # infos there
if fam.interwiki_forward:
fam = pywikibot.Family(fam.interwiki_forward)
result = {}
@@ -357,8 +364,10 @@
# interwiki link.
# NOTE: language codes are case-insensitive and only consist of basic latin
# letters and hyphens.
- #TODO: currently, we do not have any, but BCP 47 allows digits, and underscores.
- #TODO: There is no semantic difference between hyphens and underscores -> fold them.
+ # TODO: currently, we do not have any, but BCP 47 allows digits, and
+ # underscores.
+ # TODO: There is no semantic difference between hyphens and
+ # underscores -> fold them.
interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
for lang, pagetitle in interwikiR.findall(text):
lang = lang.lower()
@@ -375,14 +384,14 @@
try:
result[site] = pywikibot.Page(site, pagetitle, insite=insite)
except pywikibot.InvalidTitle:
- pywikibot.output(
- u"[getLanguageLinks] Text contains invalid interwiki link [[%s:%s]]."
- % (lang, pagetitle))
+ pywikibot.output(u'[getLanguageLinks] Text contains invalid '
+ u'interwiki link [[%s:%s]].'
+ % (lang, pagetitle))
continue
return result
-def removeLanguageLinks(text, site = None, marker = ''):
+def removeLanguageLinks(text, site=None, marker=''):
"""Return text with all interlanguage links removed.
If a link to an unknown language is encountered, a warning is printed.
@@ -397,7 +406,8 @@
return text
# This regular expression will find every interwiki link, plus trailing
# whitespace.
- languages = '|'.join(site.validLanguageLinks() + site.family.obsolete.keys())
+ languages = '|'.join(site.validLanguageLinks() +
+ site.family.obsolete.keys())
interwikiR = re.compile(r'\[\[(%s)\s?:[^\[\]\n]*\]\][\s]*'
% languages, re.IGNORECASE)
text = replaceExcept(text, interwikiR, '',
@@ -427,7 +437,7 @@
def replaceLanguageLinks(oldtext, new, site=None, addOnly=False,
- template=False, template_subpage=False):
+ template=False, template_subpage=False):
"""Replace interlanguage links in the text with a new set of links.
'new' should be a dict with the Site objects as keys, and Page or Link
@@ -448,7 +458,7 @@
else:
s2 = removeLanguageLinksAndSeparator(oldtext, site=site, marker=marker,
separator=separatorstripped)
- s = interwikiFormat(new, insite = site)
+ s = interwikiFormat(new, insite=site)
if s:
if site.language() in site.family.interwiki_attop or \
u'<!-- interwiki at top -->' in oldtext:
@@ -468,28 +478,28 @@
if "</noinclude>" in s2[firstafter:]:
if separatorstripped:
s = separator + s
- newtext = s2[:firstafter].replace(marker,'') + s \
- + s2[firstafter:]
+ newtext = s2[:firstafter].replace(marker, '') + s + \
+ s2[firstafter:]
elif site.language() in site.family.categories_last:
- cats = getCategoryLinks(s2, site = site)
+ cats = getCategoryLinks(s2, site=site)
s2 = removeCategoryLinksAndSeparator(
- s2.replace(marker, cseparatorstripped).strip(),
- site) + separator + s
+ s2.replace(marker, cseparatorstripped).strip(), site) + \
+ separator + s
newtext = replaceCategoryLinks(s2, cats, site=site,
addOnly=True)
# for Wikitravel's language links position.
# (not supported by rewrite - no API)
elif site.family.name == 'wikitravel':
s = separator + s + separator
- newtext = s2[:firstafter].replace(marker,'') + s + \
+ newtext = s2[:firstafter].replace(marker, '') + s + \
s2[firstafter:]
else:
if template or template_subpage:
if template_subpage:
- includeOn = '<includeonly>'
+ includeOn = '<includeonly>'
includeOff = '</includeonly>'
else:
- includeOn = '<noinclude>'
+ includeOn = '<noinclude>'
includeOff = '</noinclude>'
separator = ''
# Do we have a noinclude at the end of the template?
@@ -501,16 +511,16 @@
newtext = regexp.sub(s + includeOff, s2)
else:
# Put the langlinks at the end, inside noinclude's
- newtext = s2.replace(marker,'').strip() + separator + \
+ newtext = s2.replace(marker, '').strip() + separator + \
u'%s\n%s%s\n' % (includeOn, s, includeOff)
else:
- newtext = s2.replace(marker,'').strip() + separator + s
+ newtext = s2.replace(marker, '').strip() + separator + s
else:
- newtext = s2.replace(marker,'')
+ newtext = s2.replace(marker, '')
return newtext
-def interwikiFormat(links, insite = None):
+def interwikiFormat(links, insite=None):
"""Convert interwiki link dict into a wikitext string.
'links' should be a dict with the Site objects as keys, and Page
@@ -518,6 +528,7 @@
Return a unicode string that is formatted for inclusion in insite
(defaulting to the current site).
+
"""
if insite is None:
insite = pywikibot.getSite()
@@ -536,16 +547,16 @@
sep = u' '
else:
sep = config.line_separator
- s=sep.join(s) + config.line_separator
+ s = sep.join(s) + config.line_separator
return s
# Sort sites according to local interwiki sort logic
-def interwikiSort(sites, insite = None):
+def interwikiSort(sites, insite=None):
+ if not sites:
+ return []
if insite is None:
- insite = pywikibot.getSite()
- if not sites:
- return []
+ insite = pywikibot.getSite()
sites.sort()
putfirst = insite.interwiki_putfirst()
@@ -553,11 +564,8 @@
#In this case I might have to change the order
firstsites = []
for code in putfirst:
- # The code may not exist in this family?
-## if code in insite.family.obsolete:
-## code = insite.family.obsolete[code]
if code in insite.validLanguageLinks():
- site = insite.getSite(code = code)
+ site = insite.getSite(code=code)
if site in sites:
del sites[sites.index(site)]
firstsites = firstsites + [site]
@@ -567,6 +575,7 @@
sites = insite.interwiki_putfirst_doubled(sites) + sites
return sites
+
#---------------------------------------
# Functions dealing with category links
#---------------------------------------
@@ -590,10 +599,9 @@
r'(?:\|(?P<sortKey>.+?))?\s*\]\]'
% catNamespace, re.I)
for match in R.finditer(text):
- cat = catlib.Category(site,
- '%s:%s' % (match.group('namespace'),
- match.group('catName')),
- sortKey = match.group('sortKey'))
+ cat = catlib.Category(site, '%s:%s' % (match.group('namespace'),
+ match.group('catName')),
+ sortKey=match.group('sortKey'))
result.append(cat)
return result
@@ -663,9 +671,10 @@
# spaces and underscores in page titles are interchangeable and collapsible
title = title.replace(r"\ ", "[ _]+").replace(r"\_", "[ _]+")
categoryR = re.compile(r'\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^]]+)?\]\])'
- % (catNamespace, title), re.I)
- categoryRN = re.compile(r'^[^\S\n]*\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^]]+)?\]\])[^\S\n]*\n'
- % (catNamespace, title), re.I | re.M)
+ % (catNamespace, title), re.I)
+ categoryRN = re.compile(
+ r'^[^\S\n]*\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^]]+)?\]\])[^\S\n]*\n'
+ % (catNamespace, title), re.I | re.M)
if newcat is None:
""" First go through and try the more restrictive regex that removes
an entire line, if the category is the only thing on that line (this
@@ -684,7 +693,7 @@
return text
-def replaceCategoryLinks(oldtext, new, site = None, addOnly = False):
+def replaceCategoryLinks(oldtext, new, site=None, addOnly=False):
"""
Replace the category links given in the wikitext given
in oldtext by the new links given in new.
@@ -697,7 +706,7 @@
"""
# Find a marker that is not already in the text.
- marker = findmarker( oldtext, u'@@')
+ marker = findmarker(oldtext, u'@@')
if site is None:
site = pywikibot.getSite()
if site.sitename() == 'wikipedia:de' and "{{Personendaten" in oldtext:
@@ -716,7 +725,7 @@
else:
s2 = removeCategoryLinksAndSeparator(oldtext, site=site, marker=marker,
separator=separatorstripped)
- s = categoryFormat(new, insite = site)
+ s = categoryFormat(new, insite=site)
if s:
if site.language() in site.family.category_attop:
newtext = s + separator + s2
@@ -735,7 +744,7 @@
newtext = s2[:firstafter].replace(marker, '') + s + \
s2[firstafter:]
elif site.language() in site.family.categories_last:
- newtext = s2.replace(marker,'').strip() + separator + s
+ newtext = s2.replace(marker, '').strip() + separator + s
else:
interwiki = getLanguageLinks(s2)
s2 = removeLanguageLinksAndSeparator(s2.replace(marker, ''),
@@ -745,11 +754,11 @@
newtext = replaceLanguageLinks(s2, interwiki, site=site,
addOnly=True)
else:
- newtext = s2.replace(marker,'')
+ newtext = s2.replace(marker, '')
return newtext.strip()
-def categoryFormat(categories, insite = None):
+def categoryFormat(categories, insite=None):
"""Return a string containing links to all categories in a list.
'categories' should be a list of Category objects or strings
@@ -763,13 +772,14 @@
if insite is None:
insite = pywikibot.getSite()
- if isinstance(categories[0],basestring):
+ if isinstance(categories[0], basestring):
if categories[0][0] == '[':
catLinks = categories
else:
catLinks = ['[[Category:'+category+']]' for category in categories]
else:
- catLinks = [category.aslink(noInterwiki=True) for category in categories]
+ catLinks = [category.aslink(noInterwiki=True)
+ for category in categories]
if insite.category_on_one_line():
sep = ' '
@@ -779,6 +789,7 @@
#catLinks.sort()
return sep.join(catLinks) + config.line_separator
+
#---------------------------------------
# Functions dealing with external links
#---------------------------------------
@@ -801,9 +812,9 @@
# not allowed inside links. For example, in this wiki text:
# ''Please see http://www.example.org.''
# .'' shouldn't be considered as part of the link.
- regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd \
- + '](?=[' + notAtEnd+ ']*\'\')|http[s]?://[^' + notInside \
- + ']*[^' + notAtEnd + '])'
+ regex = r'(?P<url>http[s]?://[^%(notInside)s]*?[^%(notAtEnd)s]' \
+ r'(?=[%(notAtEnd)s]*\'\')|http[s]?://[^%(notInside)s]*' \
+ r'[^%(notAtEnd)s])' % {'notInside': notInside, 'notAtEnd': notAtEnd}
if withoutBracketed:
regex = r'(?<!\[)' + regex
@@ -812,6 +823,7 @@
linkR = re.compile(regex)
return linkR
+
#----------------------------------
# Functions dealing with templates
#----------------------------------
@@ -850,7 +862,7 @@
inside = {}
count = 0
Rtemplate = re.compile(
- ur'{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
+ ur'{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
Rmath = re.compile(ur'<math>[^<]+</math>')
Rmarker = re.compile(ur'%s(\d+)%s' % (marker, marker))
Rmarker2 = re.compile(ur'%s(\d+)%s' % (marker2, marker2))
@@ -897,8 +909,8 @@
for m2 in pywikibot.link_regex.finditer(paramString):
count2 += 1
text = m2.group(0)
- paramString = paramString.replace(text,
- '%s%d%s' % (marker2, count2, marker2))
+ paramString = paramString.replace(
+ text, '%s%d%s' % (marker2, count2, marker2))
links[count2] = text
# Parse string
markedParams = paramString.split('|')
@@ -932,22 +944,21 @@
You can use items from extract_templates_and_params here to get
an equivalent template wiki text (it may happen that the order
of the params changes).
+
"""
(template, params) = template_and_params
-
text = u''
for item in params:
- text += u'|%s=%s\n' % (item, params[item])
+ text += u'|%s=%s\n' % (item, params[item])
return u'{{%s\n%s}}' % (template, text)
+
#----------------------------------
# Page parsing functionality
#----------------------------------
def does_text_contain_section(pagetext, section):
- """ Determines whether the page text contains the given
- section title.
- """
+ """Determines whether the page text contains the given section title."""
m = re.search("=+[ ']*%s[ ']*=+" % re.escape(section), pagetext)
return bool(m)
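Finally, a tiny illustration of removeDisabledParts() from the trunk module reformatted above. The sample string is invented; the tag names are the keys of the regexes dict shown in the diff, and a configured trunk installation is assumed.
# Sketch only.
from pywikibot import textlib

sample = (u"kept text <!-- dropped comment --> "
          u"<nowiki>dropped nowiki</nowiki> <pre>dropped pre</pre>")

print textlib.removeDisabledParts(sample)                # all default tags
print textlib.removeDisabledParts(sample, ['comments'])  # only HTML comments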