Xqt has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/980007 )
Change subject: [IMPR]: lazy load imageinfo metadata ......................................................................
[IMPR]: lazy load imageinfo metadata
For large PDF/DjVu files, a lot of data is transmitted because the text layer is stored as metadata. Load metadata lazily, only when it is needed.
Bug: T253591 Change-Id: I742dfb7e0baf22c00698235a87ff805c47e996e9 --- M pywikibot/__init__.py M tests/site_generators_tests.py M pywikibot/page/_filepage.py M pywikibot/site/_apisite.py M tests/file_tests.py M pywikibot/data/api/_generators.py 6 files changed, 97 insertions(+), 18 deletions(-)
Approvals: Xqt: Verified; Looks good to me, approved
diff --git a/pywikibot/__init__.py b/pywikibot/__init__.py index 69bf268..02e7e2f 100644 --- a/pywikibot/__init__.py +++ b/pywikibot/__init__.py @@ -72,8 +72,8 @@ stdout, warning, ) -from pywikibot.site import APISite, BaseSite from pywikibot.time import Timestamp +from pywikibot.site import APISite, BaseSite from pywikibot.tools import PYTHON_VERSION, normalize_username
diff --git a/pywikibot/data/api/_generators.py b/pywikibot/data/api/_generators.py index 07f0338..45a1e57 100644 --- a/pywikibot/data/api/_generators.py +++ b/pywikibot/data/api/_generators.py @@ -701,7 +701,7 @@ and 'protection' in parameters['inprop']): append_params(parameters, 'inprop', 'protection') append_params(parameters, 'iiprop', - 'timestamp|user|comment|url|size|sha1|metadata') + 'timestamp|user|comment|url|size|sha1') append_params(parameters, 'iilimit', 'max') # T194233 parameters['generator'] = generator super().__init__(**kwargs) diff --git a/pywikibot/page/_filepage.py b/pywikibot/page/_filepage.py index f9347ad..e95da06 100644 --- a/pywikibot/page/_filepage.py +++ b/pywikibot/page/_filepage.py @@ -65,6 +65,12 @@ )
def _load_file_revisions(self, imageinfo) -> None: + """ + Store an Image revision of FilePage (a FileInfo object) in local cache. + + Metadata shall be added lazily to the revision already present + in cache. + """ for file_rev in imageinfo: # filemissing in API response indicates most fields are missing # see https://gerrit.wikimedia.org/r/c/mediawiki/core/+/533482/ @@ -72,8 +78,13 @@ pywikibot.warning( f"File '{self.title()}' contains missing revisions") continue - file_revision = FileInfo(file_rev) - self._file_revisions[file_revision.timestamp] = file_revision + + ts_key = pywikibot.Timestamp.fromISOformat(file_rev['timestamp']) + file_revision = self._file_revisions.setdefault( + ts_key, FileInfo(file_rev, self)) + + # add new imageinfo attributes since last request. + file_revision.update(file_rev)
@property def latest_file_info(self): @@ -105,6 +116,20 @@ oldest_ts = min(self._file_revisions) return self._file_revisions[oldest_ts]
+ def get_file_info(self, ts) -> dict: + """ + Retrieve and store information of a specific Image rev. of FilePage. + + This function will load also metadata. + It is also used as a helper in FileInfo to load metadata lazily. + + :param ts: timestamp of the Image rev. to retrieve + + :return: instance of FileInfo() + """ + self.site.loadimageinfo(self, history=False, timestamp=ts) + return self._file_revisions[ts] + def get_file_history(self) -> dict: """ Return the file's version history. @@ -431,7 +456,7 @@ Attributes can be retrieved both as self['key'] or self.key.
Following attributes will be returned: - - timestamp, user, comment, url, size, sha1, mime, metadata + - timestamp, user, comment, url, size, sha1, mime, metadata (lazily) - archivename (not for latest revision)
see :meth:`Site.loadimageinfo() @@ -442,12 +467,23 @@ .. versionchanged:: 7.7 raises KeyError instead of AttributeError if FileInfo is used as Mapping. + .. versionchanged:: 8.6.0 + Metadata are loaded lazily. + Added *filepage* parameter. """
- def __init__(self, file_revision) -> None: + def __init__(self, file_revision, filepage) -> None: """Initiate the class using the dict from ``APISite.loadimageinfo``.""" - self.__dict__.update(file_revision) - self.timestamp = pywikibot.Timestamp.fromISOformat(self.timestamp) + self.filepage = filepage + self._metadata = None + self.update(file_revision) + + def update(self, file_revision): + """Update FileInfo with new values.""" + for k, v in file_revision.items(): + if k == 'timestamp': + v = pywikibot.Timestamp.fromISOformat(v) + setattr(self, k, v)
def __getitem__(self, key): """Give access to class values by key.""" @@ -464,3 +500,15 @@ def __eq__(self, other) -> bool: """Test if two FileInfo objects are equal.""" return self.__dict__ == other.__dict__ + + @property + def metadata(self): + """Return metadata.""" + if self._metadata is None: + self.filepage.get_file_info(self.timestamp) + return self._metadata + + @metadata.setter + def metadata(self, value): + """Set metadata.""" + self._metadata = value diff --git a/pywikibot/site/_apisite.py b/pywikibot/site/_apisite.py index 332eada..d3f2652 100644 --- a/pywikibot/site/_apisite.py +++ b/pywikibot/site/_apisite.py @@ -1362,27 +1362,35 @@ history: bool = False, url_width: Optional[int] = None, url_height: Optional[int] = None, - url_param: Optional[str] = None + url_param: Optional[str] = None, + timestamp: Optional[pywikibot.Timestamp] = None ) -> None: """Load image info from api and save in page attributes.
The following properties are loaded: ``timestamp``, ``user``, ``comment``, ``url``, ``size``, ``sha1``, ``mime``, ``mediatype``, - ``metadata``, ``archivename`` and ``bitdepth``. If *url_width*, - *url_height* or *url_param* is given, additional properties - ``thumbwidth``, ``thumbheight``, ``thumburl`` and + ``archivename`` and ``bitdepth``. + ``metadata`` is loaded only if history is False. + If *url_width*, *url_height* or *url_param* is given, additional + properties ``thumbwidth``, ``thumbheight``, ``thumburl`` and ``responsiveUrls`` are given.
.. note:: Parameters validation and error handling left to the API call. .. versionchanged:: 8.2 *mediatype* and *bitdepth* properties were added. + .. versionchanged:: 8.6.0 + Added *timestamp* parameter. + Metadata are loaded only if history is False. .. seealso:: :api:`Imageinfo`
:param history: if true, return the image's version history :param url_width: get info for a thumbnail with given width :param url_height: get info for a thumbnail with given height :param url_param: get info for a thumbnail with given param + :param timestamp: timestamp of the image's version to retrieve. + It has effect only if history is False. + If omitted, the latest version will be fetched. """ args = { 'titles': page.title(with_section=False), @@ -1391,11 +1399,15 @@ 'iiurlparam': url_param, 'iiprop': [ 'timestamp', 'user', 'comment', 'url', 'size', 'sha1', 'mime', - 'mediatype', 'metadata', 'archivename', 'bitdepth', + 'mediatype', 'archivename', 'bitdepth', ] } if not history: args['total'] = 1 + args['iiprop'].append('metadata') + if timestamp: + args['iistart'] = args['iiend'] = timestamp.isoformat() + query = self._generator(api.PropertyGenerator, type_arg='imageinfo', **args) diff --git a/tests/file_tests.py b/tests/file_tests.py index adf1d92..0457925 100755 --- a/tests/file_tests.py +++ b/tests/file_tests.py @@ -229,6 +229,14 @@ super().setUp() self.image = pywikibot.FilePage(self.site, self.file_name)
+ def test_lazyload_metadata(self): + """Test metadata lazy load.""" + self.assertTrue(self.image.exists()) + + rev = self.image.latest_file_info + self.assertIsNone(rev._metadata) + self.assertIsNotNone(rev.metadata) + def test_get_file_url(self): """Get File url.""" self.assertTrue(self.image.exists()) diff --git a/tests/site_generators_tests.py b/tests/site_generators_tests.py index ec63ae4..fd22d31 100755 --- a/tests/site_generators_tests.py +++ b/tests/site_generators_tests.py @@ -181,8 +181,7 @@ 'continue': [True], 'inprop': ['protection'], 'iilimit': ['max'], - 'iiprop': ['timestamp', 'user', 'comment', 'url', 'size', 'sha1', - 'metadata'], + 'iiprop': ['timestamp', 'user', 'comment', 'url', 'size', 'sha1'], 'indexpageids': [True], 'generator': ['templates'], 'action': ['query'], 'prop': ['info', 'imageinfo', 'categoryinfo'], @@ -218,7 +217,7 @@ 'inprop': ['protection'], 'iilimit': ['max'], 'iiprop': ['timestamp', 'user', 'comment', 'url', 'size', - 'sha1', 'metadata'], 'generator': ['links'], + 'sha1'], 'generator': ['links'], 'prop': ['info', 'imageinfo', 'categoryinfo'], 'redirects': [False], } @@ -1108,8 +1107,7 @@ expected_params = { 'prop': ['info', 'imageinfo', 'categoryinfo'], 'inprop': ['protection'], - 'iiprop': ['timestamp', 'user', 'comment', 'url', 'size', 'sha1', - 'metadata'], + 'iiprop': ['timestamp', 'user', 'comment', 'url', 'size', 'sha1'], 'iilimit': ['max'], 'generator': ['search'], 'action': ['query'], 'indexpageids': [True], 'continue': [True], 'gsrnamespace': [0], 'gsrsearch': ['wiki'], 'gsrwhat': ['title']}