http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9911
Revision: 9911 Author: drtrigon Date: 2012-02-19 14:24:41 +0000 (Sun, 19 Feb 2012) Log Message: ----------- Adding capabilities of DrTrigonBot 'textlib' script; 'removeHTMLParts' (this is a follow-up or bug fix for r9902 also)
Modified Paths: -------------- trunk/pywikipedia/pywikibot/textlib.py
Modified: trunk/pywikipedia/pywikibot/textlib.py =================================================================== --- trunk/pywikipedia/pywikibot/textlib.py 2012-02-19 12:49:35 UTC (rev 9910) +++ trunk/pywikipedia/pywikibot/textlib.py 2012-02-19 14:24:41 UTC (rev 9911) @@ -16,6 +16,7 @@
import wikipedia as pywikibot import re +from HTMLParser import HTMLParser
def unescape(s): """Replace escaped HTML-special characters by their originals""" @@ -219,6 +220,40 @@ return toRemoveR.sub('', text)
+def removeHTMLParts(text, keeptags = ['tt', 'nowiki', 'small', 'sup']): + """ + Return text without portions where HTML markup is disabled + + Parts that can/will be removed are -- + * HTML and all wiki tags + + The exact set of parts which should NOT be removed can be passed as the + 'keeptags' parameter, which defaults to ['tt', 'nowiki', 'small', 'sup']. + """ + # try to merge with 'removeDisabledParts()' above into one generic function + + # thanks to http://www.hellboundhackers.org/articles/841-using-python-39;s-htmlparser-cl... + parser = _GetDataHTML() + parser.keeptags = keeptags + parser.feed(text) + parser.close() + return parser.textdata + +# thanks to http://docs.python.org/library/htmlparser.html +class _GetDataHTML(HTMLParser): + textdata = u'' + keeptags = [] + + def handle_data(self, data): + self.textdata += data + + def handle_starttag(self, tag, attrs): + if tag in self.keeptags: self.textdata += u"<%s>" % tag + + def handle_endtag(self, tag): + if tag in self.keeptags: self.textdata += u"</%s>" % tag + + def isDisabled(text, index, tags = ['*']): """ Return True if text[index] is disabled, e.g. by a comment or by nowiki tags.
pywikipedia-svn@lists.wikimedia.org