jenkins-bot submitted this change.

View Change

Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
[IMPR] move interwiki.PageTree class to tools

- add new SizedKeyCollection to tools which is similar to a defaultdict
except
- len() gives the number of all values, not only the keys
- items cannot be set, append method must be used
- any attribute or method of the given values can be used to extract
the key, not only the site property
- rename add() method to append() because the order counts
- rename removeSite() to a more general remove_key()
- rename siteCounts() to a more general iter_values_len()
- introduce a clear() method to remove all entries
- getattr and repr are also provided
- tests are written as doctests

Change-Id: Ie847ec35cebc95f50a42551c32d721466a28c26f
---
M pywikibot/tools/__init__.py
M scripts/interwiki.py
2 files changed, 136 insertions(+), 66 deletions(-)

diff --git a/pywikibot/tools/__init__.py b/pywikibot/tools/__init__.py
index 60138c4..4893aed 100644
--- a/pywikibot/tools/__init__.py
+++ b/pywikibot/tools/__init__.py
@@ -19,7 +19,8 @@
import time
import types

-from collections.abc import Iterator, Mapping
+from collections.abc import Container, Iterable, Iterator, Mapping, Sized
+from collections import defaultdict
from contextlib import suppress
from datetime import datetime
from distutils.version import LooseVersion, Version
@@ -27,7 +28,7 @@
from importlib import import_module
from inspect import getfullargspec
from ipaddress import ip_address
-from itertools import zip_longest
+from itertools import chain, zip_longest
from typing import Optional
from warnings import catch_warnings, showwarning, warn

@@ -273,6 +274,113 @@
return '{}({!r})'.format(self.__class__.__name__, self.__data)


+# Collection is not provided with Python 3.5; use Container, Iterable, Sized
+class SizedKeyCollection(Container, Iterable, Sized):
+
+ """Structure to hold values where the key is given by the value itself.
+
+ A structure like a defaultdict but the key is given by the value
+ itself and cannot be assigned directly. It returns the number of all
+ items with len() but not the number of keys.
+
+ Samples:
+
+ >>> from pywikibot.tools import SizedKeyCollection
+ >>> data = SizedKeyCollection('title')
+ >>> data.append('foo')
+ >>> data.append('bar')
+ >>> data.append('Foo')
+ >>> list(data)
+ ['foo', 'Foo', 'bar']
+ >>> len(data)
+ 3
+ >>> 'Foo' in data
+ True
+ >>> 'foo' in data
+ False
+ >>> data['Foo']
+ ['foo', 'Foo']
+ >>> list(data.keys())
+ ['Foo', 'Bar']
+ >>> data.remove_key('Foo')
+ >>> list(data)
+ ['bar']
+ >>> data.clear()
+ >>> list(data)
+ []
+ """
+
+ def __init__(self, keyattr: str):
+ """Initializer.
+
+ @param keyattr: an attribute or method of the values to be held
+ with this collection which will be used as key.
+ """
+ self.keyattr = keyattr
+ self.clear()
+
+ def __contains__(self, key) -> bool:
+ return key in self.data
+
+ def __getattr__(self, key):
+ """Delegate Mapping methods to self.data."""
+ if key in ('keys', 'values', 'items'):
+ return getattr(self.data, key)
+ return super().__getattr__(key)
+
+ def __getitem__(self, key) -> list:
+ return self.data[key]
+
+ def __iter__(self):
+ """Iterate through all items of the tree."""
+ yield from chain.from_iterable(self.data.values())
+
+ def __len__(self) -> int:
+ """Return the number of all values."""
+ return self.size
+
+ def __repr__(self) -> str:
+ return str(self.data).replace('defaultdict', self.__class__.__name__)
+
+ def append(self, value):
+ """Add a value to the collection."""
+ key = getattr(value, self.keyattr)
+ if callable(key):
+ key = key()
+ self.data[key].append(value)
+ self.size += 1
+
+ def remove(self, value):
+ """Remove a value from the container."""
+ key = getattr(value, self.keyattr)
+ if callable(key):
+ key = key()
+ with suppress(ValueError):
+ self.data[key].remove(value)
+ self.size -= 1
+
+ def remove_key(self, key):
+ """Remove all values for a given key."""
+ with suppress(KeyError):
+ self.size -= len(self.data[key])
+ del self.data[key]
+
+ def clear(self):
+ """Remove all elements from SizedKeyCollection."""
+ self.data = defaultdict(list)
+ self.size = 0
+
+ def filter(self, key):
+ """Iterate over items for a given key."""
+ with suppress(KeyError):
+ yield from self.data[key]
+
+ def iter_values_len(self):
+ """Yield key, len(values) pairs."""
+ for key, values in self.data.items():
+ yield key, len(values)
+
+
class LazyRegex:

"""
diff --git a/scripts/interwiki.py b/scripts/interwiki.py
index 684e2fd..e920bd2 100755
--- a/scripts/interwiki.py
+++ b/scripts/interwiki.py
@@ -340,7 +340,6 @@

from collections import defaultdict
from contextlib import suppress
-from itertools import chain
from textwrap import fill

import pywikibot
@@ -350,7 +349,7 @@

from pywikibot.bot import OptionHandler, ListOption, StandardOption
from pywikibot.cosmetic_changes import moved_links
-from pywikibot.tools import first_upper
+from pywikibot.tools import first_upper, SizedKeyCollection
from pywikibot.tools.formatter import color_format

docuReplacements = {
@@ -542,7 +541,7 @@
return True


-class PageTree:
+class PageTree(SizedKeyCollection):

"""
Structure to manipulate a set of pages.
@@ -564,49 +563,12 @@
list of pages to the user when he'll be asked to resolve
conflicts.

- @ivar tree: dictionary with Site as keys and list of page as values.
- All pages found within Site are kept in self.tree[site].
+ @ivar data: dictionary with Site as keys and list of pages as values.
+ All pages found within Site are kept in self.data[site].

- @type tree: dict
+ @type data: defaultdict(list)
"""
- self.tree = defaultdict(list)
- self.size = 0
-
- def filter(self, site):
- """Iterate over pages that are in Site site."""
- with suppress(KeyError):
- yield from self.tree[site]
-
- def __len__(self):
- """Length of the object."""
- return self.size
-
- def add(self, page):
- """Add a page to the tree."""
- site = page.site
- self.tree[site].append(page)
- self.size += 1
-
- def remove(self, page):
- """Remove a page from the tree."""
- with suppress(ValueError):
- self.tree[page.site].remove(page)
- self.size -= 1
-
- def removeSite(self, site):
- """Remove all pages from Site site."""
- with suppress(KeyError):
- self.size -= len(self.tree[site])
- del self.tree[site]
-
- def siteCounts(self):
- """Yield (Site, number of pages in site) pairs."""
- for site, d in self.tree.items():
- yield site, len(d)
-
- def __iter__(self):
- """Iterate through all items of the tree."""
- yield from chain.from_iterable(self.tree.values())
+ super().__init__('site')


class Subject(interwiki_graph.Subject):
@@ -689,7 +651,7 @@
# Mark the origin page as todo.
self.todo = PageTree()
if origin:
- self.todo.add(origin)
+ self.todo.append(origin)

# done is a list of all pages that have been analyzed and that
# are known to belong to this subject.
@@ -782,7 +744,7 @@

for link in links:
page = pywikibot.Page(link)
- self.todo.add(page)
+ self.todo.append(page)
self.found_in[page] = [None]
if keephintedsites:
self.hintedsites.add(page.site)
@@ -795,7 +757,7 @@
* site is a site where we still have work to do on
* count is the number of items in that Site that need work on
"""
- return self.todo.siteCounts()
+ return self.todo.iter_values_len()

def whatsNextPageBatch(self, site):
"""
@@ -815,19 +777,19 @@
# Prepare a list of suitable pages
result = []
for page in self.todo.filter(site):
- self.pending.add(page)
+ self.pending.append(page)
result.append(page)

- self.todo.removeSite(site)
+ self.todo.remove_key(site)

# If there are any, return them. Otherwise, nothing is in progress.
return result

def makeForcedStop(self, counter):
"""End work on the page before the normal end."""
- for site, count in self.todo.siteCounts():
+ for site, count in self.todo.iter_values_len():
counter.minus(site, count)
- self.todo = PageTree()
+ self.todo.clear()
self.forcedStop = True

def addIfNew(self, page, counter, linkingPage):
@@ -860,7 +822,7 @@
return False

self.found_in[page] = [linkingPage]
- self.todo.add(page)
+ self.todo.append(page)
counter.plus(page.site)
return True

@@ -1134,7 +1096,7 @@
# Loop over all the pages that should have been taken care of
for page in self.pending:
# Mark the page as done
- self.done.add(page)
+ self.done.append(page)

# make sure that none of the linked items is an auto item
if self.conf.skipauto:
@@ -1163,12 +1125,12 @@
if page == self.origin:
# The page we are working on is the page that does not
# exist. No use in doing any work on it in that case.
- for site, count in self.todo.siteCounts():
+ for site, count in self.todo.iter_values_len():
counter.minus(site, count)
- self.todo = PageTree()
+ self.todo.clear()
# In some rare cases it might be we already did check some
# 'automatic' links
- self.done = PageTree()
+ self.done.clear()
continue

if page.isRedirectPage() or page.isCategoryRedirect():
@@ -1189,15 +1151,15 @@
if not redirectTargetPage.isRedirectPage() \
and not redirectTargetPage.isCategoryRedirect():
self.origin = redirectTargetPage
- self.todo.add(redirectTargetPage)
+ self.todo.append(redirectTargetPage)
counter.plus(redirectTargetPage.site)
else:
# This is a redirect page to the origin. We don't need
# to follow the redirection.
# In this case we can also stop all hints!
- for site, count in self.todo.siteCounts():
+ for site, count in self.todo.iter_values_len():
counter.minus(site, count)
- self.todo = PageTree()
+ self.todo.clear()
elif not self.conf.followredirect:
self.conf.note('not following {}redirects.'.format(redir))
elif page.isStaticRedirect():
@@ -1220,10 +1182,10 @@
self.conf.remove.append(str(page))
self.conf.note('{} is empty. Skipping.'.format(page))
if page == self.origin:
- for site, count in self.todo.siteCounts():
+ for site, count in self.todo.iter_values_len():
counter.minus(site, count)
- self.todo = PageTree()
- self.done = PageTree()
+ self.todo.clear()
+ self.done.clear()
self.origin = None
continue

@@ -1339,7 +1301,7 @@
break

# These pages are no longer 'in progress'
- self.pending = PageTree()
+ self.pending.clear()
# Check whether we need hints and the user offered to give them
if self.untranslated and not self.hintsAsked:
self.reportInterwikilessPage(page)
@@ -1347,7 +1309,7 @@

def isDone(self):
"""Return True if all the work for this subject has completed."""
- return len(self.todo) == 0
+ return not self.todo

def problem(self, txt, createneed=True):
"""Report a problem with the resolution of this subject."""

To view, visit change 676040. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ie847ec35cebc95f50a42551c32d721466a28c26f
Gerrit-Change-Number: 676040
Gerrit-PatchSet: 9
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: JJMC89 <JJMC89.Wikimedia@gmail.com>
Gerrit-MessageType: merged