jenkins-bot submitted this change.

View Change

Approvals: Xqt: Verified; Looks good to me, approved | jenkins-bot: Verified
[IMPR] handle ParseError within xmlreader.XmlDump.parse()

- If a ParseError occurs within xmlreader.XmlDump.parse(), the
exception can be handled by a callable passed to the on_error parameter.
In that case the callable is invoked with the exception as its argument.
- Use this functionality in pagegenerators.XMLDump[Old]PageGenerator,
replace.XmlDumpReplacePageGenerator and redirect.RedirectGenerator
by passing pywikibot.error as the on_error callable
- add xmlreader example

Bug: T306134
Change-Id: Ie392e0cc604b4c576a272f7374fd25d2bdf8a029
---
M pywikibot/pagegenerators.py
M pywikibot/xmlreader.py
M scripts/redirect.py
M scripts/replace.py
M tox.ini
5 files changed, 51 insertions(+), 10 deletions(-)

diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index a033cfa..6d55f85 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -2815,7 +2815,7 @@
self.namespaces = self.site.namespaces
else:
self.namespaces = self.site.namespaces.resolve(namespaces)
- dump = xmlreader.XmlDump(filename)
+ dump = xmlreader.XmlDump(filename, on_error=pywikibot.error)
self.parser = dump.parse()

def __next__(self) -> 'pywikibot.page.Page':
diff --git a/pywikibot/xmlreader.py b/pywikibot/xmlreader.py
index 02e1bcf..12f2287 100644
--- a/pywikibot/xmlreader.py
+++ b/pywikibot/xmlreader.py
@@ -13,8 +13,10 @@
# Distributed under the terms of the MIT license.
#
import re
-from xml.etree.ElementTree import iterparse
+from typing import Optional
+from xml.etree.ElementTree import iterparse, ParseError

+from pywikibot.backports import Callable, Type
from pywikibot.tools import open_archive


@@ -66,32 +68,71 @@

class XmlDump:

- """
- Represents an XML dump file.
+ """Represents an XML dump file.

Reads the local file at initialization,
parses it, and offers access to the resulting XmlEntries via a generator.

+ .. versionadded:: 7.2
+ the `on_error` parameter
+ .. versionchanged:: 7.2
+ `allrevisions` parameter must be given as keyword parameter
+
+ Usage example:
+
+ >>> from pywikibot import xmlreader
+ >>> dump = xmlreader.XmlDump('tests/data/xml/article-pear.xml')
+ >>> for elem in dump.parse():
+ ... print(elem.title, elem.revisionid)
+ ...
+ ...
+ Pear 185185
+ Pear 185241
+ Pear 185408
+ Pear 188924
+ >>>
+
:param allrevisions: boolean
If True, parse all revisions instead of only the latest one.
Default: False.
+ :param on_error: a callable which is invoked within :meth:`parse`
+ method when a ParseError occurs. The exception is passed to this
+ callable. Otherwise the exception is raised.
"""

- def __init__(self, filename, allrevisions: bool = False) -> None:
+ def __init__(self, filename, *,
+ allrevisions: bool = False,
+ on_error: Optional[
+ Callable[[Type[BaseException]], None]] = None) -> None:
"""Initializer."""
self.filename = filename
+ self.on_error = on_error
if allrevisions:
self._parse = self._parse_all
else:
self._parse = self._parse_only_latest

def parse(self):
- """Generator using ElementTree iterparse function."""
+ """Generator using ElementTree iterparse function.
+
+ .. versionchanged:: 7.2
+ if a ParseError occurs it can be handled by the callable
+ given with `on_error` parameter of this instance.
+ """
with open_archive(self.filename) as source:
context = iterparse(source, events=('start', 'end', 'start-ns'))
self.root = None
+ while True:
+ try:
+ event, elem = next(context)
+ except StopIteration:
+ return
+ except ParseError as e:
+ if self.on_error:
+ self.on_error(e)
+ continue
+ raise

- for event, elem in context:
if event == 'start-ns' and elem[0] == '':
self.uri = elem[1]
continue
diff --git a/scripts/redirect.py b/scripts/redirect.py
index 277acf7..e7eaa39 100755
--- a/scripts/redirect.py
+++ b/scripts/redirect.py
@@ -148,7 +148,7 @@
xmlFilename = self.opt.xml
redict = {}
# open xml dump and read page titles out of it
- dump = xmlreader.XmlDump(xmlFilename)
+ dump = xmlreader.XmlDump(xmlFilename, on_error=pywikibot.error)
redirR = self.site.redirect_regex
readPagesCount = 0
pageTitles = set()
diff --git a/scripts/replace.py b/scripts/replace.py
index 0deb505..fe93bad 100755
--- a/scripts/replace.py
+++ b/scripts/replace.py
@@ -423,7 +423,7 @@
self.site = site
else:
self.site = pywikibot.Site()
- dump = xmlreader.XmlDump(self.xmlFilename)
+ dump = xmlreader.XmlDump(self.xmlFilename, on_error=pywikibot.error)
self.parser = dump.parse()

def __iter__(self):
diff --git a/tox.ini b/tox.ini
index 67c9cfe..9fc7bd6 100644
--- a/tox.ini
+++ b/tox.ini
@@ -9,7 +9,7 @@
hacking-py37

[params]
-doctest_skip = --ignore-files=(eventstreams|gui|mysql)\.py
+doctest_skip = --ignore-files=(eventstreams|gui|mysql|xmlreader)\.py
generate_user_files = -W error::UserWarning -m pwb generate_user_files -family:wikipedia -lang:test -v

[testenv]

To view, visit change 782196. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ie392e0cc604b4c576a272f7374fd25d2bdf8a029
Gerrit-Change-Number: 782196
Gerrit-PatchSet: 14
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: Mpaa <mpaa.wiki@gmail.com>
Gerrit-MessageType: merged