jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/483939 )
Change subject: [IMPR] use textfile for interwiki dumps ......................................................................
[IMPR] use textfile for interwiki dumps
Also enable -restore:all option
The pickle format was introduced with the core branch for interwiki dumps, but the text format is more practical for bot owners. Go back to the old compat format.
- Introduce a new interwikidumps.py maintenance script which converts dump files from the old pickle format to the new txt format
- All dump handling is done by the new InterwikiDumps class
- Remove the InterwikiBot.dump() method, which is no longer needed
- Add a new InterwikiBot.dump_titles property, which is a generator yielding all page titles to be dumped
Bug: T74943
Bug: T213624
Change-Id: Ie7380d587aab42ace158335de4f41fe9a5709700
---
M docs/scripts/scripts.maintenance.rst
M scripts/README.rst
M scripts/interwiki.py
A scripts/maintenance/interwikidumps.py
M tox.ini
5 files changed, 245 insertions(+), 63 deletions(-)
Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/docs/scripts/scripts.maintenance.rst b/docs/scripts/scripts.maintenance.rst index b240a49..c7e5bf9 100644 --- a/docs/scripts/scripts.maintenance.rst +++ b/docs/scripts/scripts.maintenance.rst @@ -22,6 +22,11 @@
.. automodule:: scripts.maintenance.compat2core
+scripts.maintenance.interwikidumps script +----------------------------------------- + +.. automodule:: scripts.maintenance.interwikidumps + scripts.maintenance.make_i18n_dict script -------------------------------------------
diff --git a/scripts/README.rst b/scripts/README.rst index 5130594..c8df3da 100644 --- a/scripts/README.rst +++ b/scripts/README.rst @@ -281,6 +281,8 @@ | compat2core.py | Helper script to convert compat 1.0 scripts to the core | | | 3.0 framework. Also works for newer Pywikibot releases. | +------------------------+---------------------------------------------------------+ + | interwikidumps.py | Convert interwiki dumps from pickle to txt format. | + +------------------------+---------------------------------------------------------+ | make_i18n_dict.py | Generate a i18n file from a given script. | +------------------------+---------------------------------------------------------+ | sorting_order.py | Updates interwiki sorting order in family.py file. | diff --git a/scripts/interwiki.py b/scripts/interwiki.py index 6cf62dd..08ab1e3 100755 --- a/scripts/interwiki.py +++ b/scripts/interwiki.py @@ -335,7 +335,6 @@ # import codecs import os -import pickle import re import socket import sys @@ -350,7 +349,7 @@ from pywikibot import config, i18n, pagegenerators, textlib, interwiki_graph from pywikibot import titletranslate
-from pywikibot.bot import ListOption, StandardOption +from pywikibot.bot import OptionHandler, ListOption, StandardOption from pywikibot.cosmetic_changes import moved_links from pywikibot.tools import first_upper from pywikibot.tools.formatter import color_format @@ -1935,26 +1934,8 @@
@property def dump_titles(self): - """Return list of titles for dump file.""" - return [s.origin.title() for s in self.subjects] - - def dump(self, append=True): - """Write dump file.""" - site = pywikibot.Site() - dumpfn = pywikibot.config.datafilepath( - 'data', - 'interwiki-dumps', - '{0}-{1}.pickle'.format(site.family.name, site.code) - ) - if append: - mode = 'appended' - else: - mode = 'written' - with open(dumpfn, mode[0] + 'b') as f: - pickle.dump(self.dump_titles, f, protocol=config.pickle_protocol) - pywikibot.output('Dump {0} ({1}) {2}.' - .format(site.code, site.family.name, mode)) - return dumpfn + """Return generator of titles for dump file.""" + return (s.origin.title(as_link=True) for s in self.subjects)
def generateMore(self, number): """Generate more subjects. @@ -2264,6 +2245,141 @@ return False
+class InterwikiDumps(OptionHandler): + + """Handle interwiki dumps.""" + + available_options = { + 'do_continue': False, + 'restore_all': False + } + + FILE_PATTERN = '{site.family.name}-{site.code}.txt' + + def __init__(self, **kwargs): + """Initializer. + + @keyword do_continue: If true, continue alphabetically starting at the + last of the dumped pages. + """ + self.site = kwargs.pop('site', pywikibot.Site()) + super().__init__(**kwargs) + + self.restored_files = set() + self._next_page = '!' + self._next_namespace = 0 + self.path = pywikibot.config.datafilepath('data', 'interwiki-dumps') + + @property + def next_page(self): + """Return next page title string for continue option.""" + if self._next_page == '!': + pywikibot.output('Dump file is empty! Starting at the beginning.') + return self._next_page + + @property + def next_namespace(self): + """Return next page namespace for continue option.""" + return self._next_namespace + + def remove(self, filename: str): + """Remove filename from restored files. + + @param filename: A filename to be removed from restored set. + """ + with suppress(KeyError): + self.restored_files.remove(filename) + + def get_files(self, mode='txt'): + """Get dump files from directory.""" + pattern = (r'(?P<file>\A(?P<fam>[a-z]+)-(?P<code>[a-z]+).{}\Z)' + .format(mode)) + for filename in os.listdir(self.path): + found = re.match(pattern, filename) + if found: + yield (found['file'], + pywikibot.Site(found['code'], found['fam'])) + + @property + def files(self): + """Return file generator depending on restore_all option. + + rtype: generator + """ + if self.opt.restore_all: + return self.get_files() + return iter([(self.FILE_PATTERN.format(site=self.site), self.site)]) + + def read_dump(self): + """Read the dump file. 
+ + @rtype: generator + """ + for tail, site in self.files: + filename = os.path.join(self.path, tail) + + if not os.path.exists(filename): + pywikibot.output(tail + ' does not exist.') + else: + pywikibot.output('Retrieving pages from dump file ' + tail) + for page in pagegenerators.TextfilePageGenerator( + filename, site): + if site == self.site: + self._next_page = page.title(with_ns=False) + '!' + self._next_namespace = page.namespace() + yield page + else: + self.restored_files.add(filename) + + if self.opt.do_continue: + yield from self.site.allpages(start=self.next_page, + namespace=self.next_namespace, + filterredir=False) + + def write_dump(self, iterable, append: bool = True): + """Write dump file. + + @param iterable: an iterable of page titles to be dumped. + @type iterable: iterable + @param append: if a dump already exits, append the page titles to it + if True else overwrite it. + """ + filename = os.path.join(self.path, + self.FILE_PATTERN.format(site=self.site)) + mode = 'appended' if append else 'written' + with codecs.open(filename, mode[0], 'utf-8') as f: + f.write('\r\n'.join(iterable)) + f.write('\r\n') + pywikibot.output('Dump {site.code} ({site.family.name}) {mode}.' + .format(site=self.site, mode=mode)) + self.remove(filename) + + def delete_dumps(self): + """Delete processed dumps.""" + for filename in self.restored_files: + tail = os.path.split(filename)[-1] + try: + os.remove(filename) + pywikibot.output('Dumpfile {0} deleted'.format(tail)) + except OSError as e: + pywikibot.error('Cannot delete {} due to\n{}\nDo it manually.' + .format(tail, e)) + + def old_dumps_found(self) -> bool: + """Check whether dumps are in old format. + + @return: True if there are dumps in pickle format, False otherwise + """ + try: + next(self.get_files(mode='pickle')) + except StopIteration: + return False + pywikibot.warning(fill( + 'The pickle format is deprecated. 
Use maintenance script ' + 'interwikidumps.py to convert pickle files into text files.')) + return True + + def main(*args): """ Process command line arguments and invoke bot. @@ -2284,8 +2400,6 @@ hintlessPageGen = None optContinue = False optRestore = False - restoredFiles = [] - dumpFileName = '' append = True newPages = None
@@ -2356,6 +2470,9 @@ mainpagename = site.siteinfo['mainpage'] iwconf.skip.add(pywikibot.Page(site, mainpagename))
+ dump = InterwikiDumps(site=site, do_continue=optContinue, + restore_all=iwconf.restore_all) + if newPages is not None: if len(namespaces) == 0: ns = 0 @@ -2374,34 +2491,10 @@ namespaces=ns)
elif optRestore or optContinue or iwconf.restore_all: - dumpFileName = pywikibot.config.datafilepath( - 'data', - 'interwiki-dumps', - '{0}-{1}.pickle'.format(site.family.name, site.code) - ) - try: - with open(dumpFileName, 'rb') as f: - dumpedTitles = pickle.load(f) - except (EOFError, IOError): - dumpedTitles = [] - pages = [pywikibot.Page(site, title) for title in dumpedTitles] - - hintlessPageGen = iter(pages) - if optContinue: - if pages: - last = pages[-1] - nextPage = last.title(with_ns=False) + '!' - namespace = last.namespace() - else: - pywikibot.output( - 'Dump file is empty?! Starting at the beginning.') - nextPage = '!' - namespace = 0 - gen2 = site.allpages(start=nextPage, - namespace=namespace, - filterredir=False) - hintlessPageGen = chain(hintlessPageGen, gen2) - restoredFiles.append(dumpFileName) + if dump.old_dumps_found(): + # There are dumps is pickle format; they must be converted first. + return + hintlessPageGen = dump.read_dump()
bot = InterwikiBot(iwconf)
@@ -2426,21 +2519,14 @@ try: bot.run() except KeyboardInterrupt: - dumpFileName = bot.dump(append) + dump.write_dump(bot.dump_titles, append) except Exception: pywikibot.exception() - dumpFileName = bot.dump(append) + dump.write_dump(bot.dump_titles, append) else: pywikibot.output('Script terminated sucessfully.') finally: - if dumpFileName: - with suppress(ValueError): - restoredFiles.remove(dumpFileName) - for dumpFileName in restoredFiles: - with suppress(OSError): - os.remove(dumpFileName) - pywikibot.output('Dumpfile {0} deleted' - .format(dumpFileName.split('\')[-1])) + dump.delete_dumps()
if __name__ == '__main__': diff --git a/scripts/maintenance/interwikidumps.py b/scripts/maintenance/interwikidumps.py new file mode 100644 index 0000000..7a1b5cf --- /dev/null +++ b/scripts/maintenance/interwikidumps.py @@ -0,0 +1,89 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +"""Script to convert interwiki dumps from pickle format to txt format.""" +# +# (C) Pywikibot team, 2019-2020 +# +# Distributed under the terms of the MIT license. +# +import codecs +import os +import pickle +import re + +import pywikibot + +from pywikibot import config2 as config + + +def pickle_files(path): + """Retrieve pickle files.""" + pattern = r'(?P<old>(?P<new>\A(?P<fam>[a-z]+)-(?P<code>[a-z]+).)pickle\Z)' + for filename in os.listdir(path): + found = re.match(pattern, filename) + if not found: + continue + + old = found['old'] + if os.path.exists(os.path.join(path, old)): + yield (old, found['new'] + 'txt', + pywikibot.Site(found['code'], found['fam'])) + + +def read_content(filename): + """Read content of pickle file.""" + try: + with open(filename, 'rb') as f: + titles = pickle.load(f) + except (EOFError, IOError): + pywikibot.exception() + titles = None + return titles + + +def write_content(filename, site, content): + """Write content to txt file.""" + titles = [pywikibot.Page(site, title).title(as_link=True) + for title in content] + with codecs.open(filename, 'w', 'utf-8') as f: + f.write('\r\n'.join(titles)) + f.write('\r\n') + + +def convert_dumps(): + """Convert interwikidump from pickle format to txt format.""" + folder = config.datafilepath('data', 'interwiki-dumps') + for old_file, new_file, site in pickle_files(folder): + # read old file + pywikibot.output('\nReading {}...'.format(old_file)) + old_filepath = os.path.join(folder, old_file) + titles = read_content(old_filepath) + + if not titles: + pywikibot.error('Unable to read ' + old_file) + continue + + # write new file + pywikibot.output('Writing {}...'.format(new_file)) + write_content(os.path.join(folder, 
new_file), site, titles) + + # delete old file + try: + os.remove(old_filepath) + pywikibot.output('Old dumpfile {} deleted'.format(old_file)) + except OSError as e: + pywikibot.error('Cannot delete {} due to\n{}\nDo it manually.' + .format(old_file, e)) + + +def main(*args): + """Main function.""" + args = pywikibot.argvu[1:] + if args and args[0] == '-help': + pywikibot.output(__doc__) + else: + convert_dumps() + + +if __name__ == '__main__': + main() diff --git a/tox.ini b/tox.ini index 6f62d87..bdac3fa 100644 --- a/tox.ini +++ b/tox.ini @@ -175,7 +175,7 @@ scripts/imagerecat.py : N803, N806, N802 scripts/imagetransfer.py : N803, N806, N816 scripts/imageuncat.py: N802, N816 - scripts/interwiki.py : N802, N803, N806, N816 + scripts/interwiki.py : N802, N803, N806, N815, N816 scripts/isbn.py : N802, N803, N806, N816 scripts/maintenance/* : T001 scripts/maintenance/download_dump.py : N815