jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/676040 )
Change subject: [IMPR] move interwiki.PageTree class to tools ......................................................................
[IMPR] move interwiki.PageTree class to tools
- add new SizedKeyCollection to tools which is similar to a defaultdict
  except
  - len() gives the number of all values, not only the keys
  - items cannot be set, the append method must be used
  - any attribute or method of the given values can be used to extract
    the key, not only the site property
- rename add() method to append() because the order counts
- rename removeSite() to a more general remove_key()
- rename siteCounts() to a more general iter_values_len()
- introduce a clear() method to remove all entries
- getattr and repr are also provided
- tests are written as doctests
Change-Id: Ie847ec35cebc95f50a42551c32d721466a28c26f --- M pywikibot/tools/__init__.py M scripts/interwiki.py 2 files changed, 136 insertions(+), 66 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/tools/__init__.py b/pywikibot/tools/__init__.py index 60138c4..4893aed 100644 --- a/pywikibot/tools/__init__.py +++ b/pywikibot/tools/__init__.py @@ -19,7 +19,8 @@ import time import types
-from collections.abc import Iterator, Mapping +from collections.abc import Container, Iterable, Iterator, Mapping, Sized +from collections import defaultdict from contextlib import suppress from datetime import datetime from distutils.version import LooseVersion, Version @@ -27,7 +28,7 @@ from importlib import import_module from inspect import getfullargspec from ipaddress import ip_address -from itertools import zip_longest +from itertools import chain, zip_longest from typing import Optional from warnings import catch_warnings, showwarning, warn
@@ -273,6 +274,113 @@ return '{}({!r})'.format(self.__class__.__name__, self.__data)
# Collection is not provided with Python 3.5; use Container, Iterable, Sized
class SizedKeyCollection(Container, Iterable, Sized):

    """Structure to hold values where the key is given by the value itself.

    A structure like a defaultdict but the key is given by the value
    itself and cannot be assigned directly. It returns the number of all
    items with len() but not the number of keys.

    Looking up or removing something that is not stored never creates a
    key, and a key disappears as soon as its last value is removed.

    Samples:

        >>> from pywikibot.tools import SizedKeyCollection
        >>> data = SizedKeyCollection('title')
        >>> data.append('foo')
        >>> data.append('bar')
        >>> data.append('Foo')
        >>> list(data)
        ['foo', 'Foo', 'bar']
        >>> len(data)
        3
        >>> 'Foo' in data
        True
        >>> 'foo' in data
        False
        >>> data['Foo']
        ['foo', 'Foo']
        >>> list(data.keys())
        ['Foo', 'Bar']
        >>> data['Baz']
        []
        >>> 'Baz' in data
        False
        >>> data.remove_key('Foo')
        >>> list(data)
        ['bar']
        >>> data.clear()
        >>> list(data)
        []
    """

    def __init__(self, keyattr: str):
        """Initializer.

        @param keyattr: an attribute or method of the values to be held
            with this collection which will be used as key.
        """
        self.keyattr = keyattr
        self.clear()

    def __contains__(self, key) -> bool:
        """Return whether any value is stored for the given key."""
        return key in self.data

    def __getattr__(self, key):
        """Delegate Mapping methods to self.data.

        @raises AttributeError: key is not one of 'keys', 'values' or
            'items'
        """
        if key in ('keys', 'values', 'items'):
            return getattr(self.data, key)
        # None of the base classes defines __getattr__, so raise the
        # usual error here instead of delegating to super().
        raise AttributeError("'{}' object has no attribute '{}'"
                             .format(type(self).__name__, key))

    def __getitem__(self, key) -> list:
        """Return the list of values for key; empty if key is unknown."""
        # Indexing the defaultdict with an unknown key would insert an
        # empty list and make the key show up in keys() and ``in`` tests.
        if key in self.data:
            return self.data[key]
        return []

    def __iter__(self):
        """Iterate through all items of the tree."""
        yield from chain.from_iterable(self.data.values())

    def __len__(self) -> int:
        """Return the number of all values."""
        return self.size

    def __repr__(self) -> str:
        """Return the defaultdict representation with the class name."""
        # Built explicitly rather than via str.replace() so that the
        # text 'defaultdict' inside a stored value's repr is left alone.
        return '{}({!r}, {!r})'.format(type(self).__name__,
                                       self.data.default_factory,
                                       dict(self.data))

    def _key_of(self, value):
        """Return the key of value: its keyattr, called if callable."""
        key = getattr(value, self.keyattr)
        if callable(key):
            key = key()
        return key

    def append(self, value):
        """Add a value to the collection."""
        self.data[self._key_of(value)].append(value)
        self.size += 1

    def remove(self, value):
        """Remove a value from the container.

        Nothing happens if the value is not part of the collection.
        """
        key = self._key_of(value)
        # dict.get() does not trigger the default factory, so an unknown
        # key is not created as a side effect.
        values = self.data.get(key)
        if values is None:
            return
        try:
            values.remove(value)
        except ValueError:
            return
        self.size -= 1
        if not values:
            # do not keep keys without any value
            del self.data[key]

    def remove_key(self, key):
        """Remove all values for a given key."""
        self.size -= len(self.data.pop(key, ()))

    def clear(self):
        """Remove all elements from SizedKeyCollection."""
        self.data = defaultdict(list)
        self.size = 0

    def filter(self, key):
        """Iterate over items for a given key."""
        yield from self.data.get(key, ())

    def iter_values_len(self):
        """Yield key, len(values) pairs."""
        for key, values in self.data.items():
            yield key, len(values)
""" diff --git a/scripts/interwiki.py b/scripts/interwiki.py index 684e2fd..e920bd2 100755 --- a/scripts/interwiki.py +++ b/scripts/interwiki.py @@ -340,7 +340,6 @@
from collections import defaultdict from contextlib import suppress -from itertools import chain from textwrap import fill
import pywikibot @@ -350,7 +349,7 @@
from pywikibot.bot import OptionHandler, ListOption, StandardOption from pywikibot.cosmetic_changes import moved_links -from pywikibot.tools import first_upper +from pywikibot.tools import first_upper, SizedKeyCollection from pywikibot.tools.formatter import color_format
docuReplacements = { @@ -542,7 +541,7 @@ return True
-class PageTree: +class PageTree(SizedKeyCollection):
""" Structure to manipulate a set of pages. @@ -564,49 +563,12 @@ list of pages to the user when he'll be asked to resolve conflicts.
- @ivar tree: dictionary with Site as keys and list of page as values. - All pages found within Site are kept in self.tree[site]. + @ivar data: dictionary with Site as keys and list of page as values. + All pages found within Site are kept in self.data[site].
- @type tree: dict + @type data: defaultdict(list) """ - self.tree = defaultdict(list) - self.size = 0 - - def filter(self, site): - """Iterate over pages that are in Site site.""" - with suppress(KeyError): - yield from self.tree[site] - - def __len__(self): - """Length of the object.""" - return self.size - - def add(self, page): - """Add a page to the tree.""" - site = page.site - self.tree[site].append(page) - self.size += 1 - - def remove(self, page): - """Remove a page from the tree.""" - with suppress(ValueError): - self.tree[page.site].remove(page) - self.size -= 1 - - def removeSite(self, site): - """Remove all pages from Site site.""" - with suppress(KeyError): - self.size -= len(self.tree[site]) - del self.tree[site] - - def siteCounts(self): - """Yield (Site, number of pages in site) pairs.""" - for site, d in self.tree.items(): - yield site, len(d) - - def __iter__(self): - """Iterate through all items of the tree.""" - yield from chain.from_iterable(self.tree.values()) + super().__init__('site')
class Subject(interwiki_graph.Subject): @@ -689,7 +651,7 @@ # Mark the origin page as todo. self.todo = PageTree() if origin: - self.todo.add(origin) + self.todo.append(origin)
# done is a list of all pages that have been analyzed and that # are known to belong to this subject. @@ -782,7 +744,7 @@
for link in links: page = pywikibot.Page(link) - self.todo.add(page) + self.todo.append(page) self.found_in[page] = [None] if keephintedsites: self.hintedsites.add(page.site) @@ -795,7 +757,7 @@ * site is a site where we still have work to do on * count is the number of items in that Site that need work on """ - return self.todo.siteCounts() + return self.todo.iter_values_len()
def whatsNextPageBatch(self, site): """ @@ -815,19 +777,19 @@ # Prepare a list of suitable pages result = [] for page in self.todo.filter(site): - self.pending.add(page) + self.pending.append(page) result.append(page)
- self.todo.removeSite(site) + self.todo.remove_key(site)
# If there are any, return them. Otherwise, nothing is in progress. return result
def makeForcedStop(self, counter): """End work on the page before the normal end.""" - for site, count in self.todo.siteCounts(): + for site, count in self.todo.iter_values_len(): counter.minus(site, count) - self.todo = PageTree() + self.todo.clear() self.forcedStop = True
def addIfNew(self, page, counter, linkingPage): @@ -860,7 +822,7 @@ return False
self.found_in[page] = [linkingPage] - self.todo.add(page) + self.todo.append(page) counter.plus(page.site) return True
@@ -1134,7 +1096,7 @@ # Loop over all the pages that should have been taken care of for page in self.pending: # Mark the page as done - self.done.add(page) + self.done.append(page)
# make sure that none of the linked items is an auto item if self.conf.skipauto: @@ -1163,12 +1125,12 @@ if page == self.origin: # The page we are working on is the page that does not # exist. No use in doing any work on it in that case. - for site, count in self.todo.siteCounts(): + for site, count in self.todo.iter_values_len(): counter.minus(site, count) - self.todo = PageTree() + self.todo.clear() # In some rare cases it might be we already did check some # 'automatic' links - self.done = PageTree() + self.done.clear() continue
if page.isRedirectPage() or page.isCategoryRedirect(): @@ -1189,15 +1151,15 @@ if not redirectTargetPage.isRedirectPage() \ and not redirectTargetPage.isCategoryRedirect(): self.origin = redirectTargetPage - self.todo.add(redirectTargetPage) + self.todo.append(redirectTargetPage) counter.plus(redirectTargetPage.site) else: # This is a redirect page to the origin. We don't need # to follow the redirection. # In this case we can also stop all hints! - for site, count in self.todo.siteCounts(): + for site, count in self.todo.iter_values_len(): counter.minus(site, count) - self.todo = PageTree() + self.todo.clear() elif not self.conf.followredirect: self.conf.note('not following {}redirects.'.format(redir)) elif page.isStaticRedirect(): @@ -1220,10 +1182,10 @@ self.conf.remove.append(str(page)) self.conf.note('{} is empty. Skipping.'.format(page)) if page == self.origin: - for site, count in self.todo.siteCounts(): + for site, count in self.todo.iter_values_len(): counter.minus(site, count) - self.todo = PageTree() - self.done = PageTree() + self.todo.clear() + self.done.clear() self.origin = None continue
@@ -1339,7 +1301,7 @@ break
# These pages are no longer 'in progress' - self.pending = PageTree() + self.pending.clear() # Check whether we need hints and the user offered to give them if self.untranslated and not self.hintsAsked: self.reportInterwikilessPage(page) @@ -1347,7 +1309,7 @@
def isDone(self): """Return True if all the work for this subject has completed.""" - return len(self.todo) == 0 + return not self.todo
def problem(self, txt, createneed=True): """Report a problem with the resolution of this subject."""