http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10025
Revision: 10025
Author: valhallasw
Date: 2012-03-18 17:23:48 +0000 (Sun, 18 Mar 2012)
Log Message:
-----------
(somewhat blunt) merge of trunk/pywikibot/textlib.py
Modified Paths:
--------------
branches/rewrite/pywikibot/textlib.py
Modified: branches/rewrite/pywikibot/textlib.py
===================================================================
--- branches/rewrite/pywikibot/textlib.py 2012-03-18 17:08:41 UTC (rev 10024)
+++ branches/rewrite/pywikibot/textlib.py 2012-03-18 17:23:48 UTC (rev 10025)
@@ -18,6 +18,7 @@
import re
from pywikibot.i18n import translate
+from HTMLParser import HTMLParser
def unescape(s):
"""Replace escaped HTML-special characters by their
originals"""
@@ -221,6 +222,40 @@
return toRemoveR.sub('', text)
+def removeHTMLParts(text, keeptags = ['tt', 'nowiki', 'small',
'sup']):
+ """
+ Return text without portions where HTML markup is disabled
+
+ Parts that can/will be removed are --
+ * HTML and all wiki tags
+
+ The exact set of parts which should NOT be removed can be passed as the
+ 'keeptags' parameter, which defaults to ['tt', 'nowiki',
'small', 'sup'].
+ """
+ # try to merge with 'removeDisabledParts()' above into one generic function
+
+ # thanks to
http://www.hellboundhackers.org/articles/841-using-python-39;s-htmlparser-c…
+ parser = _GetDataHTML()
+ parser.keeptags = keeptags
+ parser.feed(text)
+ parser.close()
+ return parser.textdata
+
+# thanks to
http://docs.python.org/library/htmlparser.html
+class _GetDataHTML(HTMLParser):
+ textdata = u''
+ keeptags = []
+
+ def handle_data(self, data):
+ self.textdata += data
+
+ def handle_starttag(self, tag, attrs):
+ if tag in self.keeptags: self.textdata += u"<%s>" % tag
+
+ def handle_endtag(self, tag):
+ if tag in self.keeptags: self.textdata += u"</%s>" % tag
+
+
def isDisabled(text, index, tags = ['*']):
"""
Return True if text[index] is disabled, e.g. by a comment or by nowiki tags.
@@ -269,12 +304,24 @@
#-------------------------------------------------
# Functions dealing with interwiki language links
#-------------------------------------------------
-# Note - MediaWiki supports two kinds of interwiki links; interlanguage and
-# interproject. These functions only deal with links to a
-# corresponding page in another language on the same project (e.g.,
-# Wikipedia, Wiktionary, etc.) in another language. They do not find
-# or change links to a different project, or any that are formatted
-# as in-line interwiki links (e.g., "[[:es:Articulo]]". (CONFIRM)
+# Note - MediaWiki supports several kinds of interwiki links; two kinds are
+# interlanguage links. We deal here with those kinds only.
+# A family has by definition only one kind of interlanguage links:
+# 1 - interlanguage links inside the own family.
+# They go to a corresponding page in another language in the same
+# family, such as from 'en.wikipedia' to 'pt.wikipedia', or
from
+# 'es.wiktionary' to 'arz.wiktionary'.
+# Families with this kind have several language-specific sites.
+# They have their interwiki_forward attribute set to None
+# 2 - language links forwarding to another family.
+# They go to a corresponding page in another family, such as from
+#            'commons' to 'zh.wikipedia', or from 'incubator' to
'en.wikipedia'.
+# Families having those have one member only, and do not have
+# language-specific sites. The name of the target family of their
+# interlanguage links is kept in their interwiki_forward attribute.
+# These functions deal with links of these two kinds only. They
+# do not find or change links of other kinds, nor any that are formatted
+# as in-line interwiki links (e.g., "[[:es:Articulo]]").
def getLanguageLinks(text, insite=None, pageLink="[[]]",
template_subpage=False):
"""
@@ -287,6 +334,10 @@
"""
if insite is None:
insite = pywikibot.getSite()
+ fam = insite.family
+ # when interwiki links forward to another family, retrieve pages & other info
there
+ if fam.interwiki_forward:
+ fam = pywikibot.Family(fam.interwiki_forward)
result = {}
# Ignore interwiki links within nowiki tags, includeonly tags, pre tags,
# and HTML comments
@@ -299,20 +350,28 @@
# interwiki link.
# NOTE: language codes are case-insensitive and only consist of basic latin
# letters and hyphens.
+ #TODO: currently, we do not have any, but BCP 47 allows digits and underscores.
+ #TODO: There is no semantic difference between hyphens and underscores -> fold
them.
interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
for lang, pagetitle in interwikiR.findall(text):
lang = lang.lower()
# Check if it really is in fact an interwiki link to a known
# language, or if it's e.g. a category tag or an internal link
- if lang in insite.family.obsolete:
- lang = insite.family.obsolete[lang]
- if lang in insite.validLanguageLinks():
+ if lang in fam.obsolete:
+ lang = fam.obsolete[lang]
+ if lang in fam.langs.keys():
if '|' in pagetitle:
# ignore text after the pipe
pagetitle = pagetitle[:pagetitle.index('|')]
# we want the actual page objects rather than the titles
- site = insite.getSite(code = lang)
- result[site] = pywikibot.Page(pywikibot.Link(pagetitle, site))
+ site = pywikibot.getSite(code=lang, fam=fam)
+ try:
+ result[site] = pywikibot.Page(site, pagetitle, insite=insite)
+ except pywikibot.InvalidTitle:
+ pywikibot.output(
+ u"[getLanguageLinks] Text contains invalid interwiki link [[%s:%s]]."
+ % (lang, pagetitle))
+ continue
return result
@@ -386,7 +445,11 @@
if s:
if site.language() in site.family.interwiki_attop or \
u'<!-- interwiki at top -->' in oldtext:
- newtext = s + separator + s2.replace(marker,'').strip()
+ #do not add separator if interwiki links are on one line
+ newtext = s + \
+ [separator, u''][site.language() in
+ site.family.interwiki_on_one_line] + \
+ s2.replace(marker, '').strip()
else:
# calculate what was after the language links on the page
firstafter = s2.find(marker)
@@ -407,6 +470,12 @@
site) + separator + s
newtext = replaceCategoryLinks(s2, cats, site=site,
addOnly=True)
+ # for Wikitravel's language links position.
+ # (not supported by rewrite - no API)
+ elif site.family.name == 'wikitravel':
+ s = separator + s + separator
+ newtext = s2[:firstafter].replace(marker,'') + s + \
+ s2[firstafter:]
else:
if template or template_subpage:
if template_subpage:
@@ -451,13 +520,11 @@
ar = interwikiSort(links.keys(), insite)
s = []
for site in ar:
- obj = links[site]
- if isinstance(obj, pywikibot.Link):
- link = obj.astext(insite)
- else:
- # Page
- link = obj.title(asLink=True, forceInterwiki=True)
- s.append(link)
+ try:
+ link = unicode(links[site]).replace('[[:', '[[')
+ s.append(link)
+ except AttributeError:
+ s.append(getSite(site).linkto(links[site], othersite=insite))
if insite.lang in insite.family.interwiki_on_one_line:
sep = u' '
else:
@@ -497,7 +564,8 @@
# Functions dealing with category links
#---------------------------------------
-def getCategoryLinks(text, site):
+def getCategoryLinks(text, site=None):
+ import catlib
"""Return a list of category links found in text.
List contains Category objects.
@@ -505,6 +573,8 @@
"""
result = []
+ if site is None:
+ site = pywikibot.getSite()
# Ignore category links within nowiki tags, pre tags, includeonly tags,
# and HTML comments
text = removeDisabledParts(text)
@@ -522,7 +592,7 @@
return result
-def removeCategoryLinks(text, site, marker=''):
+def removeCategoryLinks(text, site=None, marker=''):
"""Return text with all category links removed.
Put the string marker after the last replacement (at the end of the text
@@ -533,6 +603,8 @@
# interwiki link, plus trailing whitespace. The language code is grouped.
# NOTE: This assumes that language codes only consist of non-capital
# ASCII letters and hyphens.
+ if site is None:
+ site = pywikibot.getSite()
catNamespace = '|'.join(site.category_namespaces())
categoryR = re.compile(r'\[\[\s*(%s)\s*:.*?\]\]\s*' % catNamespace, re.I)
text = replaceExcept(text, categoryR, '',
@@ -554,6 +626,8 @@
if there is no replacement).
"""
+ if site is None:
+ site = pywikibot.getSite()
if separator:
mymarker = findmarker(text, u'@C@')
newtext = removeCategoryLinks(text, site, mymarker)
@@ -609,7 +683,8 @@
Replace the category links given in the wikitext given
in oldtext by the new links given in new.
- 'new' should be a list of Category objects.
+ 'new' should be a list of Category objects or strings
+ which can be either the raw name or [[Category:..]].
If addOnly is True, the old category won't be deleted and the
category(s) given will be added (and so they won't replace anything).
@@ -671,7 +746,8 @@
def categoryFormat(categories, insite = None):
"""Return a string containing links to all categories in a list.
- 'categories' should be a list of Category objects.
+ 'categories' should be a list of Category objects or strings
+ which can be either the raw name or [[Category:..]].
The string is formatted for inclusion in insite.
@@ -680,7 +756,15 @@
return ''
if insite is None:
insite = pywikibot.getSite()
- catLinks = [category.aslink() for category in categories]
+
+ if isinstance(categories[0],basestring):
+ if categories[0][0] == '[':
+ catLinks = categories
+ else:
+ catLinks = ['[[Category:'+category+']]' for category in
categories]
+ else:
+ catLinks = [category.aslink(noInterwiki=True) for category in categories]
+
if insite.category_on_one_line():
sep = ' '
else:
@@ -834,3 +918,19 @@
# Add it to the result
result.append((name, params))
return result
+
+
+def glue_template_and_params(template_and_params):
+ """Return wiki text of template glued from params.
+
+ You can use items from extract_templates_and_params here to get
+ an equivalent template wiki text (it may happen that the order
+ of the params changes).
+ """
+ (template, params) = template_and_params
+
+ text = u''
+ for item in params:
+ text += u'|%s=%s\n' % (item, params[item])
+
+ return u'{{%s\n%s}}' % (template, text)