jenkins-bot submitted this change.

View Change

Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
[IMPR] use textfile for interwiki dumps

Also enable -restore:all option

The pickle format was introduced with the core branch for interwiki dumps,
but a plain text format is more practical for bot owners. Go back to the
old compat text format.

- Introduce a new interwikidumps.py maintenance script which converts
  dump files from the old pickle format to the new txt format
- Do all dump handling in the new InterwikiDumps class
- Remove the InterwikiBot.dump() method, which is no longer needed
- Change the InterwikiBot.dump_titles property into a generator yielding
  all page titles to be dumped (as links) instead of returning a list

Bug: T74943
Bug: T213624
Change-Id: Ie7380d587aab42ace158335de4f41fe9a5709700
---
M docs/scripts/scripts.maintenance.rst
M scripts/README.rst
M scripts/interwiki.py
A scripts/maintenance/interwikidumps.py
M tox.ini
5 files changed, 245 insertions(+), 63 deletions(-)

diff --git a/docs/scripts/scripts.maintenance.rst b/docs/scripts/scripts.maintenance.rst
index b240a49..c7e5bf9 100644
--- a/docs/scripts/scripts.maintenance.rst
+++ b/docs/scripts/scripts.maintenance.rst
@@ -22,6 +22,11 @@

.. automodule:: scripts.maintenance.compat2core

+scripts.maintenance.interwikidumps script
+-----------------------------------------
+
+.. automodule:: scripts.maintenance.interwikidumps
+
scripts.maintenance.make\_i18n\_dict script
-------------------------------------------

diff --git a/scripts/README.rst b/scripts/README.rst
index 5130594..c8df3da 100644
--- a/scripts/README.rst
+++ b/scripts/README.rst
@@ -281,6 +281,8 @@
| compat2core.py | Helper script to convert compat 1.0 scripts to the core |
| | 3.0 framework. Also works for newer Pywikibot releases. |
+------------------------+---------------------------------------------------------+
+ | interwikidumps.py | Convert interwiki dumps from pickle to txt format. |
+ +------------------------+---------------------------------------------------------+
| make_i18n_dict.py | Generate a i18n file from a given script. |
+------------------------+---------------------------------------------------------+
| sorting_order.py | Updates interwiki sorting order in family.py file. |
diff --git a/scripts/interwiki.py b/scripts/interwiki.py
index 6cf62dd..08ab1e3 100755
--- a/scripts/interwiki.py
+++ b/scripts/interwiki.py
@@ -335,7 +335,6 @@
#
import codecs
import os
-import pickle
import re
import socket
import sys
@@ -350,7 +349,7 @@
from pywikibot import config, i18n, pagegenerators, textlib, interwiki_graph
from pywikibot import titletranslate

-from pywikibot.bot import ListOption, StandardOption
+from pywikibot.bot import OptionHandler, ListOption, StandardOption
from pywikibot.cosmetic_changes import moved_links
from pywikibot.tools import first_upper
from pywikibot.tools.formatter import color_format
@@ -1935,26 +1934,8 @@

@property
def dump_titles(self):
- """Return list of titles for dump file."""
- return [s.origin.title() for s in self.subjects]
-
- def dump(self, append=True):
- """Write dump file."""
- site = pywikibot.Site()
- dumpfn = pywikibot.config.datafilepath(
- 'data',
- 'interwiki-dumps',
- '{0}-{1}.pickle'.format(site.family.name, site.code)
- )
- if append:
- mode = 'appended'
- else:
- mode = 'written'
- with open(dumpfn, mode[0] + 'b') as f:
- pickle.dump(self.dump_titles, f, protocol=config.pickle_protocol)
- pywikibot.output('Dump {0} ({1}) {2}.'
- .format(site.code, site.family.name, mode))
- return dumpfn
+ """Return generator of titles for dump file."""
+ return (s.origin.title(as_link=True) for s in self.subjects)

def generateMore(self, number):
"""Generate more subjects.
@@ -2264,6 +2245,141 @@
return False


+class InterwikiDumps(OptionHandler):
+
+ """Handle interwiki dumps."""
+
+ available_options = {
+ 'do_continue': False,
+ 'restore_all': False
+ }
+
+ FILE_PATTERN = '{site.family.name}-{site.code}.txt'
+
+ def __init__(self, **kwargs):
+ """Initializer.
+
+ @keyword do_continue: If true, continue alphabetically starting at the
+ last of the dumped pages.
+ """
+ self.site = kwargs.pop('site', pywikibot.Site())
+ super().__init__(**kwargs)
+
+ self.restored_files = set()
+ self._next_page = '!'
+ self._next_namespace = 0
+ self.path = pywikibot.config.datafilepath('data', 'interwiki-dumps')
+
+ @property
+ def next_page(self):
+ """Return next page title string for continue option."""
+ if self._next_page == '!':
+ pywikibot.output('Dump file is empty! Starting at the beginning.')
+ return self._next_page
+
+ @property
+ def next_namespace(self):
+ """Return next page namespace for continue option."""
+ return self._next_namespace
+
+ def remove(self, filename: str):
+ """Remove filename from restored files.
+
+ @param filename: A filename to be removed from restored set.
+ """
+ with suppress(KeyError):
+ self.restored_files.remove(filename)
+
+ def get_files(self, mode='txt'):
+ """Get dump files from directory."""
+ pattern = (r'(?P<file>\A(?P<fam>[a-z]+)-(?P<code>[a-z]+)\.{}\Z)'
+ .format(mode))
+ for filename in os.listdir(self.path):
+ found = re.match(pattern, filename)
+ if found:
+ yield (found['file'],
+ pywikibot.Site(found['code'], found['fam']))
+
+ @property
+ def files(self):
+ """Return file generator depending on restore_all option.
+
+ @rtype: generator
+ """
+ if self.opt.restore_all:
+ return self.get_files()
+ return iter([(self.FILE_PATTERN.format(site=self.site), self.site)])
+
+ def read_dump(self):
+ """Read the dump file.
+
+ @rtype: generator
+ """
+ for tail, site in self.files:
+ filename = os.path.join(self.path, tail)
+
+ if not os.path.exists(filename):
+ pywikibot.output(tail + ' does not exist.')
+ else:
+ pywikibot.output('Retrieving pages from dump file ' + tail)
+ for page in pagegenerators.TextfilePageGenerator(
+ filename, site):
+ if site == self.site:
+ self._next_page = page.title(with_ns=False) + '!'
+ self._next_namespace = page.namespace()
+ yield page
+ else:
+ self.restored_files.add(filename)
+
+ if self.opt.do_continue:
+ yield from self.site.allpages(start=self.next_page,
+ namespace=self.next_namespace,
+ filterredir=False)
+
+ def write_dump(self, iterable, append: bool = True):
+ """Write dump file.
+
+ @param iterable: an iterable of page titles to be dumped.
+ @type iterable: iterable
+ @param append: if a dump already exists, append the page titles to it
+ if True else overwrite it.
+ """
+ filename = os.path.join(self.path,
+ self.FILE_PATTERN.format(site=self.site))
+ mode = 'appended' if append else 'written'
+ with codecs.open(filename, mode[0], 'utf-8') as f:
+ f.write('\r\n'.join(iterable))
+ f.write('\r\n')
+ pywikibot.output('Dump {site.code} ({site.family.name}) {mode}.'
+ .format(site=self.site, mode=mode))
+ self.remove(filename)
+
+ def delete_dumps(self):
+ """Delete processed dumps."""
+ for filename in self.restored_files:
+ tail = os.path.split(filename)[-1]
+ try:
+ os.remove(filename)
+ pywikibot.output('Dumpfile {0} deleted'.format(tail))
+ except OSError as e:
+ pywikibot.error('Cannot delete {} due to\n{}\nDo it manually.'
+ .format(tail, e))
+
+ def old_dumps_found(self) -> bool:
+ """Check whether dumps are in old format.
+
+ @return: True if there are dumps in pickle format, False otherwise
+ """
+ try:
+ next(self.get_files(mode='pickle'))
+ except StopIteration:
+ return False
+ pywikibot.warning(fill(
+ 'The pickle format is deprecated. Use maintenance script '
+ 'interwikidumps.py to convert pickle files into text files.'))
+ return True
+
+
def main(*args):
"""
Process command line arguments and invoke bot.
@@ -2284,8 +2400,6 @@
hintlessPageGen = None
optContinue = False
optRestore = False
- restoredFiles = []
- dumpFileName = ''
append = True
newPages = None

@@ -2356,6 +2470,9 @@
mainpagename = site.siteinfo['mainpage']
iwconf.skip.add(pywikibot.Page(site, mainpagename))

+ dump = InterwikiDumps(site=site, do_continue=optContinue,
+ restore_all=iwconf.restore_all)
+
if newPages is not None:
if len(namespaces) == 0:
ns = 0
@@ -2374,34 +2491,10 @@
namespaces=ns)

elif optRestore or optContinue or iwconf.restore_all:
- dumpFileName = pywikibot.config.datafilepath(
- 'data',
- 'interwiki-dumps',
- '{0}-{1}.pickle'.format(site.family.name, site.code)
- )
- try:
- with open(dumpFileName, 'rb') as f:
- dumpedTitles = pickle.load(f)
- except (EOFError, IOError):
- dumpedTitles = []
- pages = [pywikibot.Page(site, title) for title in dumpedTitles]
-
- hintlessPageGen = iter(pages)
- if optContinue:
- if pages:
- last = pages[-1]
- nextPage = last.title(with_ns=False) + '!'
- namespace = last.namespace()
- else:
- pywikibot.output(
- 'Dump file is empty?! Starting at the beginning.')
- nextPage = '!'
- namespace = 0
- gen2 = site.allpages(start=nextPage,
- namespace=namespace,
- filterredir=False)
- hintlessPageGen = chain(hintlessPageGen, gen2)
- restoredFiles.append(dumpFileName)
+ if dump.old_dumps_found():
+ # There are dumps in pickle format; they must be converted first.
+ return
+ hintlessPageGen = dump.read_dump()

bot = InterwikiBot(iwconf)

@@ -2426,21 +2519,14 @@
try:
bot.run()
except KeyboardInterrupt:
- dumpFileName = bot.dump(append)
+ dump.write_dump(bot.dump_titles, append)
except Exception:
pywikibot.exception()
- dumpFileName = bot.dump(append)
+ dump.write_dump(bot.dump_titles, append)
else:
pywikibot.output('Script terminated sucessfully.')
finally:
- if dumpFileName:
- with suppress(ValueError):
- restoredFiles.remove(dumpFileName)
- for dumpFileName in restoredFiles:
- with suppress(OSError):
- os.remove(dumpFileName)
- pywikibot.output('Dumpfile {0} deleted'
- .format(dumpFileName.split('\\')[-1]))
+ dump.delete_dumps()


if __name__ == '__main__':
diff --git a/scripts/maintenance/interwikidumps.py b/scripts/maintenance/interwikidumps.py
new file mode 100644
index 0000000..7a1b5cf
--- /dev/null
+++ b/scripts/maintenance/interwikidumps.py
@@ -0,0 +1,89 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""Script to convert interwiki dumps from pickle format to txt format."""
+#
+# (C) Pywikibot team, 2019-2020
+#
+# Distributed under the terms of the MIT license.
+#
+import codecs
+import os
+import pickle
+import re
+
+import pywikibot
+
+from pywikibot import config2 as config
+
+
+def pickle_files(path):
+ """Retrieve pickle files."""
+ pattern = r'(?P<old>(?P<new>\A(?P<fam>[a-z]+)-(?P<code>[a-z]+)\.)pickle\Z)'
+ for filename in os.listdir(path):
+ found = re.match(pattern, filename)
+ if not found:
+ continue
+
+ old = found['old']
+ if os.path.exists(os.path.join(path, old)):
+ yield (old, found['new'] + 'txt',
+ pywikibot.Site(found['code'], found['fam']))
+
+
+def read_content(filename):
+ """Read content of pickle file."""
+ try:
+ with open(filename, 'rb') as f:
+ titles = pickle.load(f)
+ except (EOFError, IOError):
+ pywikibot.exception()
+ titles = None
+ return titles
+
+
+def write_content(filename, site, content):
+ """Write content to txt file."""
+ titles = [pywikibot.Page(site, title).title(as_link=True)
+ for title in content]
+ with codecs.open(filename, 'w', 'utf-8') as f:
+ f.write('\r\n'.join(titles))
+ f.write('\r\n')
+
+
+def convert_dumps():
+ """Convert interwikidump from pickle format to txt format."""
+ folder = config.datafilepath('data', 'interwiki-dumps')
+ for old_file, new_file, site in pickle_files(folder):
+ # read old file
+ pywikibot.output('\nReading {}...'.format(old_file))
+ old_filepath = os.path.join(folder, old_file)
+ titles = read_content(old_filepath)
+
+ if not titles:
+ pywikibot.error('Unable to read ' + old_file)
+ continue
+
+ # write new file
+ pywikibot.output('Writing {}...'.format(new_file))
+ write_content(os.path.join(folder, new_file), site, titles)
+
+ # delete old file
+ try:
+ os.remove(old_filepath)
+ pywikibot.output('Old dumpfile {} deleted'.format(old_file))
+ except OSError as e:
+ pywikibot.error('Cannot delete {} due to\n{}\nDo it manually.'
+ .format(old_file, e))
+
+
+def main(*args):
+ """Main function."""
+ args = pywikibot.argvu[1:]
+ if args and args[0] == '-help':
+ pywikibot.output(__doc__)
+ else:
+ convert_dumps()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/tox.ini b/tox.ini
index 6f62d87..bdac3fa 100644
--- a/tox.ini
+++ b/tox.ini
@@ -175,7 +175,7 @@
scripts/imagerecat.py : N803, N806, N802
scripts/imagetransfer.py : N803, N806, N816
scripts/imageuncat.py: N802, N816
- scripts/interwiki.py : N802, N803, N806, N816
+ scripts/interwiki.py : N802, N803, N806, N815, N816
scripts/isbn.py : N802, N803, N806, N816
scripts/maintenance/* : T001
scripts/maintenance/download_dump.py : N815

To view, visit change 483939. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ie7380d587aab42ace158335de4f41fe9a5709700
Gerrit-Change-Number: 483939
Gerrit-PatchSet: 15
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: Dalba <dalba.wiki@gmail.com>
Gerrit-Reviewer: Dvorapa <dvorapa@seznam.cz>
Gerrit-Reviewer: JAn Dudík <jan.dudik@gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged