http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11737
Revision: 11737 Author: legoktm Date: 2013-07-10 06:24:43 +0000 (Wed, 10 Jul 2013) Log Message: ----------- Implement an opt-in version of using mwparserfromhell rather than regex.
Users can set "use_mwparserfromhell" as True in their user-config.py to enable this.
Modified Paths: -------------- branches/rewrite/pywikibot/config2.py branches/rewrite/pywikibot/textlib.py branches/rewrite/tests/textlib_tests.py
Modified: branches/rewrite/pywikibot/config2.py =================================================================== --- branches/rewrite/pywikibot/config2.py 2013-07-10 05:46:56 UTC (rev 11736) +++ branches/rewrite/pywikibot/config2.py 2013-07-10 06:24:43 UTC (rev 11737) @@ -583,6 +583,11 @@ # LS is a shortcut alias. line_separator = LS = u'\n'
+# Settings to enable mwparserfromhell http://mwparserfromhell.readthedocs.org/en/latest/ +# Currently used in textlib.extract_templates_and_params +# This should be more accurate than our current regex, but is currently opt-in. +use_mwparserfromhell = False + # End of configuration section # ============================
Modified: branches/rewrite/pywikibot/textlib.py =================================================================== --- branches/rewrite/pywikibot/textlib.py 2013-07-10 05:46:56 UTC (rev 11736) +++ branches/rewrite/pywikibot/textlib.py 2013-07-10 06:24:43 UTC (rev 11737) @@ -13,7 +13,10 @@ # __version__ = '$Id$'
- +try: + import mwparserfromhell +except ImportError: + mwparserfromhell = False import pywikibot import re from HTMLParser import HTMLParser @@ -886,10 +889,36 @@ with an integer value corresponding to its position among the unnnamed parameters, and if this results multiple parameters with the same name only the last value provided will be returned. + + This uses a third party library (mwparserfromhell) if it is installed + and enabled in the user-config.py. Otherwise it falls back on a + regex based function defined below. + @param text: The wikitext from which templates are extracted @type text: unicode or string
""" + + if not (config.use_mwparserfromhell and mwparserfromhell): + return extract_templates_and_params_regex(text) + code = mwparserfromhell.parse(text) + result = [] + for template in code.filter_templates(): + params = {} + for param in template.params: + params[unicode(param.name)] = unicode(param.value) + result.append((unicode(template.name.strip()), params)) + return result + + +def extract_templates_and_params_regex(text): + """ + See the documentation for extract_templates_and_params + This does basically the same thing, but uses regex. + @param text: + @return: + """ + # remove commented-out stuff etc. thistxt = removeDisabledParts(text)
Modified: branches/rewrite/tests/textlib_tests.py =================================================================== --- branches/rewrite/tests/textlib_tests.py 2013-07-10 05:46:56 UTC (rev 11736) +++ branches/rewrite/tests/textlib_tests.py 2013-07-10 06:24:43 UTC (rev 11737) @@ -6,6 +6,10 @@ # __version__ = '$Id: api_tests.py 8238 2010-06-02 13:50:48Z xqt $'
+try: + import mwparserfromhell +except ImportError: + mwparserfromhell = False import unittest import codecs import os @@ -34,11 +38,18 @@ self.assertContains("enwiki_help_editing", u"Editing")
def testExtractTemplates(self): + if not (pywikibot.config.use_mwparserfromhell and mwparserfromhell): + return # We'll test the regex function in the test below func = textlib.extract_templates_and_params # It's really long. self.assertEqual(func('{{a}}'), [('a', {})]) self.assertEqual(func('{{a|b=c}}'), [('a', {'b': 'c'})]) self.assertEqual(func('{{a|b|c=d}}'), [('a', {u'1': 'b', 'c': 'd'})])
+ def testExtractTemplatesRegex(self): + func = textlib.extract_templates_and_params_regex # It's really long. + self.assertEqual(func('{{a}}'), [('a', {})]) + self.assertEqual(func('{{a|b=c}}'), [('a', {'b': 'c'})]) + self.assertEqual(func('{{a|b|c=d}}'), [('a', {u'1': 'b', 'c': 'd'})])
@unittest.expectedFailure def testSpacesInSection(self):
pywikipedia-svn@lists.wikimedia.org