Xqt submitted this change.

View Change


Approvals: Xqt: Verified; Looks good to me, approved
[IMPR]: lazy load imageinfo metadata

For large pdf/djvu files, a lot of data is transmitted because the text
layer is stored as metadata. Load metadata lazily, only when needed.

Bug: T253591
Change-Id: I742dfb7e0baf22c00698235a87ff805c47e996e9
---
M pywikibot/__init__.py
M tests/site_generators_tests.py
M pywikibot/page/_filepage.py
M pywikibot/site/_apisite.py
M tests/file_tests.py
M pywikibot/data/api/_generators.py
6 files changed, 97 insertions(+), 18 deletions(-)

diff --git a/pywikibot/__init__.py b/pywikibot/__init__.py
index 69bf268..02e7e2f 100644
--- a/pywikibot/__init__.py
+++ b/pywikibot/__init__.py
@@ -72,8 +72,8 @@
stdout,
warning,
)
-from pywikibot.site import APISite, BaseSite
from pywikibot.time import Timestamp
+from pywikibot.site import APISite, BaseSite
from pywikibot.tools import PYTHON_VERSION, normalize_username


diff --git a/pywikibot/data/api/_generators.py b/pywikibot/data/api/_generators.py
index 07f0338..45a1e57 100644
--- a/pywikibot/data/api/_generators.py
+++ b/pywikibot/data/api/_generators.py
@@ -701,7 +701,7 @@
and 'protection' in parameters['inprop']):
append_params(parameters, 'inprop', 'protection')
append_params(parameters, 'iiprop',
- 'timestamp|user|comment|url|size|sha1|metadata')
+ 'timestamp|user|comment|url|size|sha1')
append_params(parameters, 'iilimit', 'max') # T194233
parameters['generator'] = generator
super().__init__(**kwargs)
diff --git a/pywikibot/page/_filepage.py b/pywikibot/page/_filepage.py
index f9347ad..e95da06 100644
--- a/pywikibot/page/_filepage.py
+++ b/pywikibot/page/_filepage.py
@@ -65,6 +65,12 @@
)

def _load_file_revisions(self, imageinfo) -> None:
+ """
+ Store an Image revision of FilePage (a FileInfo object) in local cache.
+
+ Metadata shall be added lazily to the revision already present
+ in cache.
+ """
for file_rev in imageinfo:
# filemissing in API response indicates most fields are missing
# see https://gerrit.wikimedia.org/r/c/mediawiki/core/+/533482/
@@ -72,8 +78,13 @@
pywikibot.warning(
f"File '{self.title()}' contains missing revisions")
continue
- file_revision = FileInfo(file_rev)
- self._file_revisions[file_revision.timestamp] = file_revision
+
+ ts_key = pywikibot.Timestamp.fromISOformat(file_rev['timestamp'])
+ file_revision = self._file_revisions.setdefault(
+ ts_key, FileInfo(file_rev, self))
+
+ # add new imageinfo attributes since last request.
+ file_revision.update(file_rev)

@property
def latest_file_info(self):
@@ -105,6 +116,20 @@
oldest_ts = min(self._file_revisions)
return self._file_revisions[oldest_ts]

+ def get_file_info(self, ts) -> dict:
+ """
+ Retrieve and store information of a specific Image rev. of FilePage.
+
+ This function will load also metadata.
+ It is also used as a helper in FileInfo to load metadata lazily.
+
+ :param ts: timestamp of the Image rev. to retrieve
+
+ :return: instance of FileInfo()
+ """
+ self.site.loadimageinfo(self, history=False, timestamp=ts)
+ return self._file_revisions[ts]
+
def get_file_history(self) -> dict:
"""
Return the file's version history.
@@ -431,7 +456,7 @@
Attributes can be retrieved both as self['key'] or self.key.

Following attributes will be returned:
- - timestamp, user, comment, url, size, sha1, mime, metadata
+ - timestamp, user, comment, url, size, sha1, mime, metadata (lazily)
- archivename (not for latest revision)

see :meth:`Site.loadimageinfo()
@@ -442,12 +467,23 @@
.. versionchanged:: 7.7
raises KeyError instead of AttributeError if FileInfo is used as
Mapping.
+ .. versionchanged:: 8.6.0
+ Metadata are loaded lazily.
+ Added *filepage* parameter.
"""

- def __init__(self, file_revision) -> None:
+ def __init__(self, file_revision, filepage) -> None:
"""Initiate the class using the dict from ``APISite.loadimageinfo``."""
- self.__dict__.update(file_revision)
- self.timestamp = pywikibot.Timestamp.fromISOformat(self.timestamp)
+ self.filepage = filepage
+ self._metadata = None
+ self.update(file_revision)
+
+ def update(self, file_revision):
+ """Update FileInfo with new values."""
+ for k, v in file_revision.items():
+ if k == 'timestamp':
+ v = pywikibot.Timestamp.fromISOformat(v)
+ setattr(self, k, v)

def __getitem__(self, key):
"""Give access to class values by key."""
@@ -464,3 +500,15 @@
def __eq__(self, other) -> bool:
"""Test if two FileInfo objects are equal."""
return self.__dict__ == other.__dict__
+
+ @property
+ def metadata(self):
+ """Return metadata."""
+ if self._metadata is None:
+ self.filepage.get_file_info(self.timestamp)
+ return self._metadata
+
+ @metadata.setter
+ def metadata(self, value):
+ """Set metadata."""
+ self._metadata = value
diff --git a/pywikibot/site/_apisite.py b/pywikibot/site/_apisite.py
index 332eada..d3f2652 100644
--- a/pywikibot/site/_apisite.py
+++ b/pywikibot/site/_apisite.py
@@ -1362,27 +1362,35 @@
history: bool = False,
url_width: Optional[int] = None,
url_height: Optional[int] = None,
- url_param: Optional[str] = None
+ url_param: Optional[str] = None,
+ timestamp: Optional[pywikibot.Timestamp] = None
) -> None:
"""Load image info from api and save in page attributes.

The following properties are loaded: ``timestamp``, ``user``,
``comment``, ``url``, ``size``, ``sha1``, ``mime``, ``mediatype``,
- ``metadata``, ``archivename`` and ``bitdepth``. If *url_width*,
- *url_height* or *url_param* is given, additional properties
- ``thumbwidth``, ``thumbheight``, ``thumburl`` and
+ ``archivename`` and ``bitdepth``.
+ ``metadata`` is loaded only if history is False.
+ If *url_width*, *url_height* or *url_param* is given, additional
+ properties ``thumbwidth``, ``thumbheight``, ``thumburl`` and
``responsiveUrls`` are given.

.. note:: Parameters validation and error handling left to the
API call.
.. versionchanged:: 8.2
*mediatype* and *bitdepth* properties were added.
+ .. versionchanged:: 8.6.0
+ Added *timestamp* parameter.
+ Metadata are loaded only if history is False.
.. seealso:: :api:`Imageinfo`

:param history: if true, return the image's version history
:param url_width: get info for a thumbnail with given width
:param url_height: get info for a thumbnail with given height
:param url_param: get info for a thumbnail with given param
+ :param timestamp: timestamp of the image's version to retrieve.
+ It has effect only if history is False.
+ If omitted, the latest version will be fetched.
"""
args = {
'titles': page.title(with_section=False),
@@ -1391,11 +1399,15 @@
'iiurlparam': url_param,
'iiprop': [
'timestamp', 'user', 'comment', 'url', 'size', 'sha1', 'mime',
- 'mediatype', 'metadata', 'archivename', 'bitdepth',
+ 'mediatype', 'archivename', 'bitdepth',
]
}
if not history:
args['total'] = 1
+ args['iiprop'].append('metadata')
+ if timestamp:
+ args['iistart'] = args['iiend'] = timestamp.isoformat()
+
query = self._generator(api.PropertyGenerator,
type_arg='imageinfo',
**args)
diff --git a/tests/file_tests.py b/tests/file_tests.py
index adf1d92..0457925 100755
--- a/tests/file_tests.py
+++ b/tests/file_tests.py
@@ -229,6 +229,14 @@
super().setUp()
self.image = pywikibot.FilePage(self.site, self.file_name)

+ def test_lazyload_metadata(self):
+ """Test metadata lazy load."""
+ self.assertTrue(self.image.exists())
+
+ rev = self.image.latest_file_info
+ self.assertIsNone(rev._metadata)
+ self.assertIsNotNone(rev.metadata)
+
def test_get_file_url(self):
"""Get File url."""
self.assertTrue(self.image.exists())
diff --git a/tests/site_generators_tests.py b/tests/site_generators_tests.py
index ec63ae4..fd22d31 100755
--- a/tests/site_generators_tests.py
+++ b/tests/site_generators_tests.py
@@ -181,8 +181,7 @@
'continue': [True],
'inprop': ['protection'],
'iilimit': ['max'],
- 'iiprop': ['timestamp', 'user', 'comment', 'url', 'size', 'sha1',
- 'metadata'],
+ 'iiprop': ['timestamp', 'user', 'comment', 'url', 'size', 'sha1'],
'indexpageids': [True],
'generator': ['templates'], 'action': ['query'],
'prop': ['info', 'imageinfo', 'categoryinfo'],
@@ -218,7 +217,7 @@
'inprop': ['protection'],
'iilimit': ['max'],
'iiprop': ['timestamp', 'user', 'comment', 'url', 'size',
- 'sha1', 'metadata'], 'generator': ['links'],
+ 'sha1'], 'generator': ['links'],
'prop': ['info', 'imageinfo', 'categoryinfo'],
'redirects': [False],
}
@@ -1108,8 +1107,7 @@
expected_params = {
'prop': ['info', 'imageinfo', 'categoryinfo'],
'inprop': ['protection'],
- 'iiprop': ['timestamp', 'user', 'comment', 'url', 'size', 'sha1',
- 'metadata'],
+ 'iiprop': ['timestamp', 'user', 'comment', 'url', 'size', 'sha1'],
'iilimit': ['max'], 'generator': ['search'], 'action': ['query'],
'indexpageids': [True], 'continue': [True],
'gsrnamespace': [0], 'gsrsearch': ['wiki'], 'gsrwhat': ['title']}

To view, visit change 980007. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I742dfb7e0baf22c00698235a87ff805c47e996e9
Gerrit-Change-Number: 980007
Gerrit-PatchSet: 8
Gerrit-Owner: Mpaa <mpaa.wiki@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged