jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/782196 )
Change subject: [IMPR] handle ParserError within xmlreader.XmlDump.parse() ......................................................................
[IMPR] handle ParserError within xmlreader.XmlDump.parse()
- If a ParseError occurs within xmlreader.XmlDump.parse(), this exception can be handled by a callable passed to the on_error parameter. In that case the callable is invoked with the exception as its argument; otherwise the exception is raised. - Use this functionality with pagegenerators.XMLDump[Old]PageGenerator, replace.XmlDumpReplacePageGenerator and redirect.RedirectGenerator, and call pywikibot.error() - add an xmlreader usage example
Bug: T306134 Change-Id: Ie392e0cc604b4c576a272f7374fd25d2bdf8a029 --- M pywikibot/pagegenerators.py M pywikibot/xmlreader.py M scripts/redirect.py M scripts/replace.py M tox.ini 5 files changed, 51 insertions(+), 10 deletions(-)
Approvals: Xqt: Verified; Looks good to me, approved. jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py index a033cfa..6d55f85 100644 --- a/pywikibot/pagegenerators.py +++ b/pywikibot/pagegenerators.py @@ -2815,7 +2815,7 @@ self.namespaces = self.site.namespaces else: self.namespaces = self.site.namespaces.resolve(namespaces) - dump = xmlreader.XmlDump(filename) + dump = xmlreader.XmlDump(filename, on_error=pywikibot.error) self.parser = dump.parse()
def __next__(self) -> 'pywikibot.page.Page': diff --git a/pywikibot/xmlreader.py b/pywikibot/xmlreader.py index 02e1bcf..12f2287 100644 --- a/pywikibot/xmlreader.py +++ b/pywikibot/xmlreader.py @@ -13,8 +13,10 @@ # Distributed under the terms of the MIT license. # import re -from xml.etree.ElementTree import iterparse +from typing import Optional +from xml.etree.ElementTree import iterparse, ParseError
+from pywikibot.backports import Callable, Type from pywikibot.tools import open_archive
@@ -66,32 +68,71 @@
class XmlDump:
- """ - Represents an XML dump file. + """Represents an XML dump file.
Reads the local file at initialization, parses it, and offers access to the resulting XmlEntries via a generator.
+ .. versionadded:: 7.2 + the `on_error` parameter + .. versionchanged:: 7.2 + `allrevisions` parameter must be given as keyword parameter + + Usage example: + + >>> from pywikibot import xmlreader + >>> dump = xmlreader.XmlDump('tests/data/xml/article-pear.xml') + >>> for elem in dump.parse(): + ... print(elem.title, elem.revisionid) + ... + ... + Pear 185185 + Pear 185241 + Pear 185408 + Pear 188924 + >>> + :param allrevisions: boolean If True, parse all revisions instead of only the latest one. Default: False. + :param on_error: a callable which is invoked within :meth:`parse` + method when a ParseError occurs. The exception is passed to this + callable. Otherwise the exception is raised. """
- def __init__(self, filename, allrevisions: bool = False) -> None: + def __init__(self, filename, *, + allrevisions: bool = False, + on_error: Optional[ + Callable[[Type[BaseException]], None]] = None) -> None: """Initializer.""" self.filename = filename + self.on_error = on_error if allrevisions: self._parse = self._parse_all else: self._parse = self._parse_only_latest
def parse(self): - """Generator using ElementTree iterparse function.""" + """Generator using ElementTree iterparse function. + + .. versionchanged:: 7.2 + if a ParseError occurs it can be handled by the callable + given with `on_error` parameter of this instance. + """ with open_archive(self.filename) as source: context = iterparse(source, events=('start', 'end', 'start-ns')) self.root = None + while True: + try: + event, elem = next(context) + except StopIteration: + return + except ParseError as e: + if self.on_error: + self.on_error(e) + continue + raise
- for event, elem in context: if event == 'start-ns' and elem[0] == '': self.uri = elem[1] continue diff --git a/scripts/redirect.py b/scripts/redirect.py index 277acf7..e7eaa39 100755 --- a/scripts/redirect.py +++ b/scripts/redirect.py @@ -148,7 +148,7 @@ xmlFilename = self.opt.xml redict = {} # open xml dump and read page titles out of it - dump = xmlreader.XmlDump(xmlFilename) + dump = xmlreader.XmlDump(xmlFilename, on_error=pywikibot.error) redirR = self.site.redirect_regex readPagesCount = 0 pageTitles = set() diff --git a/scripts/replace.py b/scripts/replace.py index 0deb505..fe93bad 100755 --- a/scripts/replace.py +++ b/scripts/replace.py @@ -423,7 +423,7 @@ self.site = site else: self.site = pywikibot.Site() - dump = xmlreader.XmlDump(self.xmlFilename) + dump = xmlreader.XmlDump(self.xmlFilename, on_error=pywikibot.error) self.parser = dump.parse()
def __iter__(self): diff --git a/tox.ini b/tox.ini index 67c9cfe..9fc7bd6 100644 --- a/tox.ini +++ b/tox.ini @@ -9,7 +9,7 @@ hacking-py37
[params] -doctest_skip = --ignore-files=(eventstreams|gui|mysql).py +doctest_skip = --ignore-files=(eventstreams|gui|mysql|xmlreader).py generate_user_files = -W error::UserWarning -m pwb generate_user_files -family:wikipedia -lang:test -v
[testenv]
pywikibot-commits@lists.wikimedia.org