jenkins-bot has submitted this change and it was merged.
Change subject: Add textlib._MultiTemplateMatchBuilder ......................................................................
Add textlib._MultiTemplateMatchBuilder
Various template-matching regexes exist throughout pywikibot and its scripts, each with different issues.
Create a _MultiTemplateMatchBuilder capable of providing regex objects for templates, using the regex from template.TemplateRobot, and use it to fix bugs in cosmetic_changes replaceDeprecatedTemplates.
template.XmlDumpTemplatePageGenerator duplicates template matching logic found in TemplateRobot. Replace and deprecate template.XmlDumpTemplatePageGenerator.
Start a test module for the template script.
Add TODOs for other template-matching code that should be converted to use _MultiTemplateMatchBuilder.
Change-Id: I0deb795b6634b030c9e655e8e1dbbb925480de5b --- M pywikibot/cosmetic_changes.py M pywikibot/textlib.py M scripts/add_text.py M scripts/category_redirect.py M scripts/checkimages.py M scripts/template.py M tests/__init__.py M tests/cosmetic_changes_tests.py A tests/data/xml/dummy-template.xml A tests/template_bot_tests.py 10 files changed, 325 insertions(+), 20 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py index f466b89..0d77cee 100755 --- a/pywikibot/cosmetic_changes.py +++ b/pywikibot/cosmetic_changes.py @@ -71,6 +71,7 @@ import pywikibot
from pywikibot import config, textlib +from pywikibot.textlib import _MultiTemplateMatchBuilder from pywikibot.tools import deprecate_arg, first_lower, first_upper from pywikibot.tools import MediaWikiVersion
@@ -306,6 +307,7 @@ 5. interwiki links
""" + # TODO: T123150 starsList = [ u'bueno', u'bom interwiki', @@ -694,6 +696,8 @@
def replaceDeprecatedTemplates(self, text): exceptions = ['comment', 'math', 'nowiki', 'pre'] + builder = _MultiTemplateMatchBuilder(self.site) + if self.site.family.name in deprecatedTemplates and \ self.site.code in deprecatedTemplates[self.site.family.name]: for template in deprecatedTemplates[ @@ -704,12 +708,12 @@ new = '' else: new = '{{%s}}' % new - if self.site.namespaces[10].case == 'first-letter': - old = '[' + old[0].upper() + old[0].lower() + ']' + old[1:] + text = textlib.replaceExcept( text, - r'{{([mM][sS][gG]:)?%s(?P<parameters>|[^}]+|)}}' % old, + builder.pattern(old), new, exceptions) + return text
# from fixes.py diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index 8e73b56..79d30ee 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -42,6 +42,7 @@ deprecated, DeprecatedRegex, OrderedDict, + StringTypes, UnicodeType, issue_deprecation_warning ) @@ -144,6 +145,52 @@ return s
+class _MultiTemplateMatchBuilder(object): + + """Build template matcher.""" + + def __init__(self, site): + """Constructor.""" + self.site = site + + def pattern(self, template, flags=re.DOTALL): + """Return a compiled regex to match template.""" + # TODO: add ability to also match contents within the template + # TODO: add option for template to be None to match any template + # TODO: use NESTED_TEMPLATE_REGEX with <parameters> instead of <params> + namespace = self.site.namespaces[10] + if isinstance(template, pywikibot.Page): + if template.namespace() == 10: + old = template.title(withNamespace=False) + else: + raise ValueError( + '{0} is not a template Page object'.format(template)) + elif isinstance(template, StringTypes): + old = template + else: + raise ValueError( + '{0!r} is not a valid template'.format(template)) + + if namespace.case == 'first-letter': + pattern = '[' + \ + re.escape(old[0].upper()) + \ + re.escape(old[0].lower()) + \ + ']' + re.escape(old[1:]) + else: + pattern = re.escape(old) + pattern = re.sub(r'_|\ ', r'[_ ]', pattern) + templateRegex = re.compile(r'{{ *(' + ':|'.join(namespace) + + r':|[mM][sS][gG]:)?' 
+ pattern + + r'(?P<parameters>\s*|.+?|) *}}', + flags) + return templateRegex + + def search_any_predicate(self, templates): + """Return a predicate that matches any template.""" + predicates = [self.pattern(template).search for template in templates] + return lambda text: any(predicate(text) for predicate in predicates) + + def _create_default_regexes(): """Fill (and possibly overwrite) _regex_cache with default regexes.""" _regex_cache.update({ diff --git a/scripts/add_text.py b/scripts/add_text.py index bfe9365..f6049f5 100755 --- a/scripts/add_text.py +++ b/scripts/add_text.py @@ -196,6 +196,7 @@ categoriesInside, site, True) # Dealing the stars' issue + # TODO: T123150 allstars = [] starstext = textlib.removeDisabledParts(text) for star in starsList: diff --git a/scripts/category_redirect.py b/scripts/category_redirect.py index 7305b76..51e3159 100755 --- a/scripts/category_redirect.py +++ b/scripts/category_redirect.py @@ -293,6 +293,7 @@ with open(datafile + ".bak", "wb") as f: cPickle.dump(record, f, protocol=config.pickle_protocol) # regex to match soft category redirects + # TODO: enhance and use textlib._MultiTemplateMatchBuilder # note that any templates containing optional "category:" are # incorrect and will be fixed by the bot template_regex = re.compile( diff --git a/scripts/checkimages.py b/scripts/checkimages.py index f34a18f..516efda 100755 --- a/scripts/checkimages.py +++ b/scripts/checkimages.py @@ -1405,6 +1405,7 @@
def isTagged(self): """Understand if a file is already tagged or not.""" + # TODO: enhance and use textlib._MultiTemplateMatchBuilder # Is the image already tagged? If yes, no need to double-check, skip for i in i18n.translate(self.site, txt_find): # If there are {{ use regex, otherwise no (if there's not the diff --git a/scripts/template.py b/scripts/template.py index 6aa9563..fba031c 100755 --- a/scripts/template.py +++ b/scripts/template.py @@ -118,14 +118,16 @@
import pywikibot
-from pywikibot import i18n, pagegenerators, Bot +from pywikibot import i18n, pagegenerators, textlib, Bot
from pywikibot.exceptions import ArgumentDeprecationWarning from pywikibot.pagegenerators import XMLDumpPageGenerator +from pywikibot.tools import deprecated
from scripts.replace import ReplaceRobot as ReplaceBot
+@deprecated('XMLDumpPageGenerator') class XmlDumpTemplatePageGenerator(XMLDumpPageGenerator):
""" @@ -220,20 +222,9 @@
replacements = [] exceptions = {} - namespace = self.site.namespaces[10] + builder = textlib._MultiTemplateMatchBuilder(site) for old, new in self.templates.items(): - if namespace.case == 'first-letter': - pattern = '[' + \ - re.escape(old[0].upper()) + \ - re.escape(old[0].lower()) + \ - ']' + re.escape(old[1:]) - else: - pattern = re.escape(old) - pattern = re.sub(r'_|\ ', r'[_ ]', pattern) - templateRegex = re.compile(r'{{ *(' + ':|'.join(namespace) + - r':|[mM][sS][gG]:)?' + pattern + - r'(?P<parameters>\s*|.+?|) *}}', - re.DOTALL) + templateRegex = builder.pattern(old)
if self.getOption('subst') and self.getOption('remove'): replacements.append((templateRegex, @@ -344,9 +335,14 @@ oldTemplates.append(oldTemplate)
if xmlfilename: - gen = XmlDumpTemplatePageGenerator(oldTemplates, xmlfilename) + builder = textlib._MultiTemplateMatchBuilder(site) + predicate = builder.search_any_predicate(oldTemplates) + + gen = XmlDumpTemplatePageGenerator( + xmlfilename, site=site, text_predicate=predicate) else: gen = genFactory.getCombinedGenerator() + if not gen: gens = [ pagegenerators.ReferringPageGenerator(t, onlyTemplateInclusion=True) diff --git a/tests/__init__.py b/tests/__init__.py index 1596d05..c7a66bf 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -138,6 +138,7 @@ 'isbn', 'protectbot', 'reflinks', + 'template_bot', 'replacebot', 'uploadbot', 'weblinkchecker', diff --git a/tests/cosmetic_changes_tests.py b/tests/cosmetic_changes_tests.py index 4144a33..cf8b7ae 100644 --- a/tests/cosmetic_changes_tests.py +++ b/tests/cosmetic_changes_tests.py @@ -102,10 +102,10 @@ def test_replaceDeprecatedTemplates(self): """Test replaceDeprecatedTemplates method.""" self.assertEqual( - '{{Quellen fehlen }}' + '{{Belege fehlen}}' '{{Belege fehlen| }}' '{{Belege fehlen|foo}}' - '{{Quellen_fehlen|foo}}', + '{{Belege fehlen|foo}}', self.cct.replaceDeprecatedTemplates( '{{Quellen fehlen }}' '{{Quellen fehlen| }}' diff --git a/tests/data/xml/dummy-template.xml b/tests/data/xml/dummy-template.xml new file mode 100644 index 0000000..1673c75 --- /dev/null +++ b/tests/data/xml/dummy-template.xml @@ -0,0 +1,108 @@ +<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en"> + <siteinfo> + <sitename>Wikipedia</sitename> + <dbname>enwiki</dbname> + <base>http://en.wikipedia.org/wiki/Main_Page</base> + <generator>MediaWiki 1.25wmf12</generator> + <case>first-letter</case> + <namespaces> + <namespace key="-2" case="first-letter">Media</namespace> + <namespace key="-1" case="first-letter">Special</namespace> + 
<namespace key="0" case="first-letter" /> + <namespace key="1" case="first-letter">Talk</namespace> + <namespace key="2" case="first-letter">User</namespace> + <namespace key="3" case="first-letter">User talk</namespace> + <namespace key="4" case="first-letter">Wikipedia</namespace> + <namespace key="5" case="first-letter">Wikipedia talk</namespace> + <namespace key="6" case="first-letter">File</namespace> + <namespace key="7" case="first-letter">File talk</namespace> + <namespace key="8" case="first-letter">MediaWiki</namespace> + <namespace key="9" case="first-letter">MediaWiki talk</namespace> + <namespace key="10" case="first-letter">Template</namespace> + <namespace key="11" case="first-letter">Template talk</namespace> + <namespace key="12" case="first-letter">Help</namespace> + <namespace key="13" case="first-letter">Help talk</namespace> + <namespace key="14" case="first-letter">Category</namespace> + <namespace key="15" case="first-letter">Category talk</namespace> + <namespace key="100" case="first-letter">Portal</namespace> + <namespace key="101" case="first-letter">Portal talk</namespace> + <namespace key="108" case="first-letter">Book</namespace> + <namespace key="109" case="first-letter">Book talk</namespace> + <namespace key="118" case="first-letter">Draft</namespace> + <namespace key="119" case="first-letter">Draft talk</namespace> + <namespace key="446" case="first-letter">Education Program</namespace> + <namespace key="447" case="first-letter">Education Program talk</namespace> + <namespace key="710" case="first-letter">TimedText</namespace> + <namespace key="711" case="first-letter">TimedText talk</namespace> + <namespace key="828" case="first-letter">Module</namespace> + <namespace key="829" case="first-letter">Module talk</namespace> + <namespace key="2600" case="first-letter">Topic</namespace> + </namespaces> + </siteinfo> + <page> + <title>Fake page with msg</title> + <ns>0</ns> + <id>12345</id> + <revision> + <id>123456789</id> + 
<parentid>123456788</parentid> + <timestamp>2014-12-24T01:01:01Z</timestamp> + <contributor> + <username>John Vandenberg</username> + <id>31009137</id> + </contributor> + <minor/> + <comment>Foo</comment> + <model>wikitext</model> + <format>text/x-wiki</format> + <text xml:space="preserve" bytes="1"> + {{ MsG:foo }} + </text> + <sha1>1ywwm7o751gkr3fj9l7rqpl0s8o87b1</sha1> + </revision> + </page> + <page> + <title>Fake page with unnecessary template prefix</title> + <ns>1</ns> + <id>54321</id> + <revision> + <id>987654321</id> + <parentid>987654320</parentid> + <timestamp>2014-12-24T01:01:02Z</timestamp> + <contributor> + <username>John Vandenberg</username> + <id>31009137</id> + </contributor> + <minor/> + <comment>Lets discuss foo</comment> + <model>wikitext</model> + <format>text/x-wiki</format> + <text xml:space="preserve" bytes="1"> + {{TEMPLATE:bar}} + </text> + <sha1>1ywwm7o751gkr3fj9l7rqpl0s8o87b2</sha1> + </revision> + </page> + <page> + <title>Fake page with nested template</title> + <ns>1</ns> + <id>54322</id> + <revision> + <id>987654322</id> + <parentid>987654320</parentid> + <timestamp>2014-12-24T01:01:02Z</timestamp> + <contributor> + <username>John Vandenberg</username> + <id>31009137</id> + </contributor> + <minor/> + <comment>Lets discuss foo</comment> + <model>wikitext</model> + <format>text/x-wiki</format> + <text xml:space="preserve" bytes="1"> + {{baz|{{boo|}} }} + </text> + <sha1>1ywwm7o751gkr3fj9l7rqpl0s8o87b2</sha1> + </revision> + </page> +</mediawiki> diff --git a/tests/template_bot_tests.py b/tests/template_bot_tests.py new file mode 100644 index 0000000..c27dd67 --- /dev/null +++ b/tests/template_bot_tests.py @@ -0,0 +1,146 @@ +# -*- coding: utf-8 -*- +"""Test template bot module.""" +# +# (C) Pywikibot team, 2015 +# +# Distributed under the terms of the MIT license. 
+# +from __future__ import absolute_import, unicode_literals + +__version__ = '$Id$' + +import pywikibot + +from pywikibot.pagegenerators import XMLDumpPageGenerator +from pywikibot.textlib import _MultiTemplateMatchBuilder + +from tests import join_xml_data_path +from tests.aspects import unittest, TestCase + + +class TestXMLPageGenerator(TestCase): + + """Test XML Page generator.""" + + family = 'wikipedia' + code = 'en' + + dry = True + + def test_no_match(self): + """Test pages without any desired templates.""" + template = pywikibot.Page(self.site, 'Template:foobar') + builder = _MultiTemplateMatchBuilder(self.site) + predicate = builder.search_any_predicate([template]) + gen = XMLDumpPageGenerator( + filename=join_xml_data_path('article-pear-0.10.xml'), + site=self.site, + text_predicate=predicate) + pages = list(gen) + self.assertEqual(len(pages), 0) + + def test_match(self): + """Test pages with one match without parameters.""" + template = pywikibot.Page(self.site, 'Template:stack begin') + builder = _MultiTemplateMatchBuilder(self.site) + predicate = builder.search_any_predicate([template]) + gen = XMLDumpPageGenerator( + filename=join_xml_data_path('article-pear-0.10.xml'), + site=self.site, + text_predicate=predicate) + pages = list(gen) + self.assertEqual(len(pages), 1) + self.assertPagelistTitles(pages, ['Pear'], + site=self.site) + + def test_match_with_params(self): + """Test pages with one match with parameters.""" + template = pywikibot.Page(self.site, 'Template:Taxobox') + builder = _MultiTemplateMatchBuilder(self.site) + predicate = builder.search_any_predicate([template]) + gen = XMLDumpPageGenerator( + filename=join_xml_data_path('article-pear-0.10.xml'), + site=self.site, + text_predicate=predicate) + pages = list(gen) + self.assertEqual(len(pages), 1) + self.assertPagelistTitles(pages, ['Pear'], + site=self.site) + + def test_match_any(self): + """Test pages with one of many matches.""" + template1 = pywikibot.Page(self.site, 'Template:stack 
begin') + template2 = pywikibot.Page(self.site, 'Template:foobar') + builder = _MultiTemplateMatchBuilder(self.site) + + predicate = builder.search_any_predicate([template1, template2]) + gen = XMLDumpPageGenerator( + filename=join_xml_data_path('article-pear-0.10.xml'), + site=self.site, + text_predicate=predicate) + pages = list(gen) + self.assertEqual(len(pages), 1) + self.assertPagelistTitles(pages, ['Pear'], + site=self.site) + + # reorder templates + predicate = builder.search_any_predicate([template2, template1]) + gen = XMLDumpPageGenerator( + filename=join_xml_data_path('article-pear-0.10.xml'), + site=self.site, + text_predicate=predicate) + pages = list(gen) + self.assertEqual(len(pages), 1) + self.assertPagelistTitles(pages, ['Pear'], + site=self.site) + + def test_match_msg(self): + """Test pages with {{msg:..}}.""" + template = pywikibot.Page(self.site, 'Template:Foo') + builder = _MultiTemplateMatchBuilder(self.site) + + predicate = builder.search_any_predicate([template]) + gen = XMLDumpPageGenerator( + filename=join_xml_data_path('dummy-template.xml'), + site=self.site, + text_predicate=predicate) + pages = list(gen) + self.assertEqual(len(pages), 1) + self.assertPagelistTitles(pages, ['Fake page with msg'], + site=self.site) + + @unittest.expectedFailure + def test_match_unnecessary_template_prefix(self): + """Test pages with {{template:..}}.""" + template = pywikibot.Page(self.site, 'Template:Bar') + builder = _MultiTemplateMatchBuilder(self.site) + + predicate = builder.search_any_predicate([template]) + gen = XMLDumpPageGenerator( + filename=join_xml_data_path('dummy-template.xml'), + site=self.site, + text_predicate=predicate) + pages = list(gen) + self.assertEqual(len(pages), 1) + self.assertPagelistTitles( + pages, ['Fake page with unnecessary template prefix'], + site=self.site) + + def test_nested_match(self): + """Test pages with one match inside another template.""" + template = pywikibot.Page(self.site, 'Template:boo') + builder = 
_MultiTemplateMatchBuilder(self.site) + predicate = builder.search_any_predicate([template]) + gen = XMLDumpPageGenerator( + filename=join_xml_data_path('dummy-template.xml'), + site=self.site, + text_predicate=predicate) + pages = list(gen) + self.assertEqual(len(pages), 1) + self.assertPagelistTitles( + pages, ['Fake page with nested template'], + site=self.site) + + +if __name__ == "__main__": + unittest.main()
pywikibot-commits@lists.wikimedia.org