jenkins-bot submitted this change.

View Change


Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
update XmlDump and tests

1) 'revisions' parameter added; can be set to earliest, latest or all.
2) _parse_only_earliest() function added.
3) default behavior for XmlDump is now completely preserved as
"first_found"
option and _parse_only_first_found().
4) all tests updated to use revisions instead of allrevisions. added tests
for
_parse_only_earliest() and _parse_only_first_found().
5) update to f-strings
6) use dispatcher dict for parse method
7) update docstring example w/phab link, add revisions param description

Change-Id: Icfc5628a7efda744c53fc30cfc81e9156468b460
---
M tests/xmlreader_tests.py
M pywikibot/xmlreader.py
2 files changed, 169 insertions(+), 47 deletions(-)

diff --git a/pywikibot/xmlreader.py b/pywikibot/xmlreader.py
index a85dbab..5d3575e 100644
--- a/pywikibot/xmlreader.py
+++ b/pywikibot/xmlreader.py
@@ -18,6 +18,7 @@
from __future__ import annotations

import re
+from typing import Optional, Union


try:
@@ -25,8 +26,11 @@
except ImportError:
from xml.etree.ElementTree import iterparse, ParseError

-from pywikibot.backports import Callable
-from pywikibot.tools import open_archive
+from pywikibot.backports import Callable, Type
+from pywikibot.tools import (
+ issue_deprecation_warning,
+ open_archive,
+)


def parseRestrictions(restrictions):
@@ -86,12 +90,15 @@
the `on_error` parameter
.. versionchanged:: 7.2
`allrevisions` parameter must be given as keyword parameter
+ .. versionchanged:: 9.0
+ `allrevisions` parameter deprecated due to :phab:`T340804`
+ `revisions` parameter introduced as replacement

Usage example:

>>> from pywikibot import xmlreader
>>> name = 'tests/data/xml/article-pear.xml'
- >>> dump = xmlreader.XmlDump(name, allrevisions=True)
+ >>> dump = xmlreader.XmlDump(name, revisions='all')
>>> for elem in dump.parse():
... print(elem.title, elem.revisionid)
...
@@ -108,22 +115,50 @@
:param on_error: a callable which is invoked within :meth:`parse`
method when a ParseError occurs. The exception is passed to this
callable. Otherwise the exception is raised.
+ :param revisions: which of four methods to use to parse the dump:
+ * `first_found` (whichever revision is the first element)
+ * `latest` (most recent revision, by largest `revisionid`)
+ * `earliest` (first revision, by smallest `revisionid`)
+ * `all` (all revisions for each page)
+ Default: `first_found`
"""

- def __init__(
- self,
- filename,
- *,
- allrevisions: bool = False,
- on_error: Callable[[type[BaseException]], None] | None = None,
- ) -> None:
+ def __init__(self, filename, *,
+ allrevisions: Union[bool, str] = None,
+ # when allrevisions removed, revisions can default to 'latest'
+ revisions: str = 'first_found',
+ on_error: Optional[
+ Callable[[Type[BaseException]], None]] = None) -> None:
"""Initializer."""
self.filename = filename
self.on_error = on_error
+
+ self.rev_actions = {
+ 'first_found': self._parse_only_first_found,
+ 'latest': self._parse_only_latest,
+ 'earliest': self._parse_only_earliest,
+ 'all': self._parse_all,
+ }
+
if allrevisions:
- self._parse = self._parse_all
- else:
- self._parse = self._parse_only_latest
+ issue_deprecation_warning(
+ 'allrevisions=True',
+ "revisions='all'",
+ since='9.0.0')
+ revisions = 'all'
+ elif revisions == 'first_found':
+ issue_deprecation_warning(
+ 'allrevisions=False returns first revision found,'
+ " usually earliest. For most recent, use revisions='latest'. "
+ "For oldest, use revisions='earliest',"
+ "'allrevisions'",
+ since='9.0.0')
+
+ if revisions not in self.rev_actions:
+ actions = str(list(self.rev_actions.keys())).strip('[]')
+ raise ValueError(f"'revisions' must be one of {actions}.")
+
+ self._parse = self.rev_actions.get(revisions)

def parse(self):
"""Generator using ElementTree iterparse function.
@@ -154,44 +189,81 @@
continue
yield from self._parse(event, elem)

+ def _parse_only_first_found(self, event, elem):
+ """
+ Deprecated parser that yields the first revision found.
+
+ Documentation had wrongly indicated it returned the latest revision.
+ :phab: `T340804`
+ """
+ if event == 'end' and elem.tag == f'{{{self.uri}}}page':
+ self._headers(elem)
+ revision = elem.find(f'{{{self.uri}}}revision')
+ yield self._create_revision(revision)
+ elem.clear()
+ self.root.clear()
+
def _parse_only_latest(self, event, elem):
"""Parser that yields only the latest revision."""
- if event == 'end' and elem.tag == '{%s}page' % self.uri:
+ if event == 'end' and elem.tag == f'{{{self.uri}}}page':
self._headers(elem)
- revision = elem.find('{%s}revision' % self.uri)
- yield self._create_revision(revision)
+ latest_revision = None
+ latest_revisionid = 0
+ for revision in elem.findall(f'{{{self.uri}}}revision'):
+ revisionid = int(revision.findtext(f'{{{self.uri}}}id'))
+ if latest_revision is None or revisionid > latest_revisionid:
+ latest_revision = revision
+ latest_revisionid = revisionid
+ if latest_revision is not None:
+ yield self._create_revision(latest_revision)
+ elem.clear()
+ self.root.clear()
+
+ def _parse_only_earliest(self, event, elem):
+ """Parser that yields only the earliest revision."""
+ if event == 'end' and elem.tag == f'{{{self.uri}}}page':
+ self._headers(elem)
+ earliest_revision = None
+ earliest_revisionid = float('inf') # Initialize positive infinity
+ for revision in elem.findall(f'{{{self.uri}}}revision'):
+ revisionid = int(revision.findtext(f'{{{self.uri}}}id'))
+ if revisionid < earliest_revisionid:
+ earliest_revisionid = revisionid
+ earliest_revision = revision
+ if earliest_revision is not None:
+ yield self._create_revision(earliest_revision)
elem.clear()
self.root.clear()

def _parse_all(self, event, elem):
"""Parser that yields all revisions."""
- if event == 'start' and elem.tag == '{%s}page' % self.uri:
+ if event == 'start' and elem.tag == f'{{{self.uri}}}page':
self._headers(elem)
- if event == 'end' and elem.tag == '{%s}revision' % self.uri:
+ if event == 'end' and elem.tag == f'{{{self.uri}}}revision':
yield self._create_revision(elem)
elem.clear()
self.root.clear()

def _headers(self, elem) -> None:
"""Extract headers from XML chunk."""
- self.title = elem.findtext('{%s}title' % self.uri)
- self.ns = elem.findtext('{%s}ns' % self.uri)
- self.pageid = elem.findtext('{%s}id' % self.uri)
- self.restrictions = elem.findtext('{%s}restrictions' % self.uri)
- self.isredirect = elem.findtext('{%s}redirect' % self.uri) is not None
+ self.title = elem.findtext(f'{{{self.uri}}}title')
+ self.ns = elem.findtext(f'{{{self.uri}}}ns')
+ self.pageid = elem.findtext(f'{{{self.uri}}}id')
+ self.restrictions = elem.findtext(f'{{{self.uri}}}restrictions')
+ self.isredirect = elem.findtext(f'{{{self.uri}}}redirect') is not None
self.editRestriction, self.moveRestriction = parseRestrictions(
self.restrictions)

def _create_revision(self, revision):
- """Create a Single revision."""
- revisionid = revision.findtext('{%s}id' % self.uri)
- timestamp = revision.findtext('{%s}timestamp' % self.uri)
- comment = revision.findtext('{%s}comment' % self.uri)
- contributor = revision.find('{%s}contributor' % self.uri)
- ipeditor = contributor.findtext('{%s}ip' % self.uri)
- username = ipeditor or contributor.findtext('{%s}username' % self.uri)
+ """Create a single revision."""
+ revisionid = revision.findtext(f'{{{self.uri}}}id')
+ timestamp = revision.findtext(f'{{{self.uri}}}timestamp')
+ comment = revision.findtext(f'{{{self.uri}}}comment')
+ contributor = revision.find(f'{{{self.uri}}}contributor')
+ ipeditor = contributor.findtext(f'{{{self.uri}}}ip')
+ username = ipeditor or contributor.findtext(f'{{{self.uri}}}username')
# could get comment, minor as well
- text = revision.findtext('{%s}text' % self.uri)
+ text = revision.findtext(f'{{{self.uri}}}text')
return XmlEntry(title=self.title,
ns=self.ns,
id=self.pageid,
diff --git a/tests/xmlreader_tests.py b/tests/xmlreader_tests.py
index d5864db..0398ad1 100755
--- a/tests/xmlreader_tests.py
+++ b/tests/xmlreader_tests.py
@@ -29,56 +29,85 @@

def test_XmlDumpAllRevs(self):
"""Test loading all revisions."""
- pages = get_entries('article-pear.xml', allrevisions=True)
+ pages = get_entries('article-pear.xml', revisions='all')
self.assertLength(pages, 4)
self.assertEqual('Automated conversion', pages[0].comment)
self.assertEqual('Pear', pages[0].title)
self.assertEqual('24278', pages[0].id)
+ self.assertEqual('185185', pages[0].revisionid)
+ self.assertEqual('188924', pages[3].revisionid)
self.assertTrue(pages[0].text.startswith('Pears are [[tree]]s of'))
self.assertEqual('Quercusrobur', pages[1].username)
self.assertEqual('Pear', pages[0].title)

- def test_XmlDumpFirstRev(self):
- """Test loading the first revision."""
- pages = get_entries('article-pear.xml', allrevisions=False)
+ def test_XmlDumpFirstFoundRev(self):
+ """Test loading the first found revision.
+
+ To be deprecated.
+ :phab: `T340804`
+ """
+ pages = get_entries('article-pear.xml', revisions='first_found')
self.assertLength(pages, 1)
self.assertEqual('Automated conversion', pages[0].comment)
self.assertEqual('Pear', pages[0].title)
self.assertEqual('24278', pages[0].id)
+ self.assertEqual('185185', pages[0].revisionid)
+ self.assertTrue(pages[0].text.startswith('Pears are [[tree]]s of'))
+ self.assertTrue(not pages[0].isredirect)
+
+ def test_XmlDumpEarliestRev(self):
+ """Test loading the earliest revision."""
+ pages = get_entries('article-pear.xml', revisions='earliest')
+ self.assertLength(pages, 1)
+ self.assertEqual('Automated conversion', pages[0].comment)
+ self.assertEqual('Pear', pages[0].title)
+ self.assertEqual('24278', pages[0].id)
+ self.assertEqual('185185', pages[0].revisionid)
+ self.assertTrue(pages[0].text.startswith('Pears are [[tree]]s of'))
+ self.assertTrue(not pages[0].isredirect)
+
+ def test_XmlDumpLatestRev(self):
+ """Test loading the latest revision."""
+ pages = get_entries('article-pear.xml', revisions='latest')
+ self.assertLength(pages, 1)
+ self.assertEqual('sp', pages[0].comment)
+ self.assertEqual('Pear', pages[0].title)
+ self.assertEqual('24278', pages[0].id)
+ self.assertEqual('188924', pages[0].revisionid)
self.assertTrue(pages[0].text.startswith('Pears are [[tree]]s of'))
self.assertTrue(not pages[0].isredirect)

def test_XmlDumpRedirect(self):
"""Test XmlDump correctly parsing whether a page is a redirect."""
- get_entries('article-pyrus.xml', allrevisions=True)
+ get_entries('article-pyrus.xml', revisions='all')
pages = list(xmlreader.XmlDump(
join_xml_data_path('article-pyrus.xml')).parse())
self.assertTrue(pages[0].isredirect)

- def _compare(self, previous, variant, all_revisions):
+ def _compare(self, previous, variant, revisions):
"""Compare the tested variant with the previous (if not None)."""
entries = get_entries('article-pyrus' + variant,
- allrevisions=all_revisions)
+ revisions=revisions)
result = [entry.__dict__ for entry in entries]
if previous:
self.assertEqual(previous, result)
return result

- def _compare_variants(self, all_revisions):
+ def _compare_variants(self, revisions):
"""Compare the different XML file variants."""
previous = None
- previous = self._compare(previous, '.xml', all_revisions)
- previous = self._compare(previous, '-utf16.xml', all_revisions)
- previous = self._compare(previous, '.xml.bz2', all_revisions)
- self._compare(previous, '-utf16.xml.bz2', all_revisions)
+ previous = self._compare(previous, '.xml', revisions)
+ previous = self._compare(previous, '-utf16.xml', revisions)
+ previous = self._compare(previous, '.xml.bz2', revisions)
+ self._compare(previous, '-utf16.xml.bz2', revisions)

def test_XmlDump_compare_all(self):
"""Compare the different XML files using all revisions."""
- self._compare_variants(True)
+ self._compare_variants('all')

def test_XmlDump_compare_single(self):
"""Compare the different XML files using only a single revision."""
- self._compare_variants(False)
+ self._compare_variants('latest')


class ExportDotTenTestCase(TestCase):
@@ -89,7 +118,7 @@

def test_pair(self):
"""Test reading the main page/user talk page pair file."""
- entries = get_entries('pair-0.10.xml', allrevisions=True)
+ entries = get_entries('pair-0.10.xml', revisions='all')
self.assertLength(entries, 4)
for entry in entries:
self.assertEqual(entry.username, 'Carlossuarez46')
@@ -115,7 +144,7 @@

def test_edit_summary_decoding(self):
"""Test edit summaries are decoded."""
- entries = get_entries('pair-0.10.xml', allrevisions=True)
+ entries = get_entries('pair-0.10.xml', revisions='all')
articles = [entry for entry in entries if entry.ns == '0']

# It does not decode the edit summary

To view, visit change 934432. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Icfc5628a7efda744c53fc30cfc81e9156468b460
Gerrit-Change-Number: 934432
Gerrit-PatchSet: 21
Gerrit-Owner: ElSeiver <ellis.seiver@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: Matěj Suchánek <matejsuchanek97@gmail.com>
Gerrit-CC: Mpaa <mpaa.wiki@gmail.com>
Gerrit-MessageType: merged