jenkins-bot has submitted this change and it was merged.
Change subject: Implement item-centric WikidataBot.run ......................................................................
Implement item-centric WikidataBot.run
All wikidata scripts need to get the item for a page. Convert them to using run() and treat() semantics. Adds 'treat_missing_item' keyword, so newitem.py can indicate that it wants to perform custom handling of missing items for a page.
Also fix bug 66523 for all scripts, including category.py. The bug was introduced by 431cb77.
And fix all pep257 errors except missing docstrings.
Bug: 66523 Change-Id: Iaaaf3fa583a3e299899197ac9c67530f1972a861 --- M pywikibot/bot.py M scripts/category.py M scripts/claimit.py M scripts/coordinate_import.py M scripts/harvest_template.py M scripts/illustrate_wikidata.py M scripts/newitem.py M tox.ini 8 files changed, 232 insertions(+), 189 deletions(-)
Approvals: XZise: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/bot.py b/pywikibot/bot.py index 2be8a2f..941f4b0 100644 --- a/pywikibot/bot.py +++ b/pywikibot/bot.py @@ -980,3 +980,38 @@ source = pywikibot.Claim(self.repo, 'P143') source.setTarget(self.source_values.get(site.family.name).get(site.code)) return source + + def run(self): + """Process all pages in generator.""" + if not hasattr(self, 'generator'): + raise NotImplementedError('Variable %s.generator not set.' + % self.__class__.__name__) + + treat_missing_item = hasattr(self, 'treat_missing_item') + + try: + for page in self.generator: + if not page.exists(): + pywikibot.output('%s doesn't exist.' % page) + try: + item = pywikibot.ItemPage.fromPage(page) + except pywikibot.NoPage: + item = None + if not item: + if not treat_missing_item: + pywikibot.output( + '%s doesn't have a wikidata item.' % page) + #TODO FIXME: Add an option to create the item + continue + self.treat(page, item) + except QuitKeyboardInterrupt: + pywikibot.output('\nUser quit %s bot run...' % + self.__class__.__name__) + except KeyboardInterrupt: + if config.verbose_output: + raise + else: + pywikibot.output('\nKeyboardInterrupt during %s bot run...' % + self.__class__.__name__) + except Exception as e: + pywikibot.exception(msg=e, tb=True) diff --git a/scripts/category.py b/scripts/category.py index 09f960f..12beae3 100755 --- a/scripts/category.py +++ b/scripts/category.py @@ -697,8 +697,11 @@ Do not use this function from outside the class. 
""" if self.oldcat.exists(): - item = pywikibot.ItemPage.fromPage(self.oldcat) - if item.exists(): + try: + item = pywikibot.ItemPage.fromPage(self.oldcat) + except pywikibot.NoPage: + item = None + if item and item.exists(): comment = i18n.twtranslate(self.site, 'category-was-moved', {'newcat': self.newcat.title(), 'title': self.newcat.title()}) diff --git a/scripts/claimit.py b/scripts/claimit.py index 39fcf11..d389c93 100755 --- a/scripts/claimit.py +++ b/scripts/claimit.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- """ -This script adds claims to Wikidata items based on categories. +A script that adds claims to Wikidata items based on categories.
------------------------------------------------------------------------------
@@ -68,11 +68,13 @@
class ClaimRobot(WikidataBot): - """ - A bot to add Wikidata claims - """ + + """A bot to add Wikidata claims.""" + def __init__(self, generator, claims, exists_arg=''): """ + Constructor. + Arguments: * generator - A generator that yields Page objects. * claims - A list of wikidata claims @@ -84,18 +86,14 @@ self.exists_arg = exists_arg self.repo = pywikibot.Site().data_repository() self.cacheSources() - - def run(self): - """Starts the robot.""" if self.exists_arg: pywikibot.output(''exists' argument set to '%s'' % self.exists_arg) - for page in self.generator: - self.current_page = page - item = pywikibot.ItemPage.fromPage(page) - if not item.exists(): - # TODO FIXME: We should provide an option to create the page - pywikibot.output('%s doesn't have a wikidata item :(' % page) - continue + + def treat(self, page, item): + """Treat each page.""" + self.current_page = page + + if item: for claim in self.claims: skip = False # If claim with same property already exists... @@ -137,7 +135,8 @@
def listsEqual(list1, list2): """ - Returns true if the lists are probably equal, ignoring order. + Return true if the lists are probably equal, ignoring order. + Works for lists of unhashable items (like dictionaries). """ if len(list1) != len(list2): diff --git a/scripts/coordinate_import.py b/scripts/coordinate_import.py index 322db55..71cef9c 100644 --- a/scripts/coordinate_import.py +++ b/scripts/coordinate_import.py @@ -1,6 +1,8 @@ #!/usr/bin/python # -*- coding: utf-8 -*- """ +Coordinate importing script. + Usage:
python coordinate_import.py -lang:en -family:wikipedia -cat:Category:Coordinates_not_on_Wikidata @@ -30,11 +32,13 @@
class CoordImportRobot(WikidataBot): - """ - A bot to import coordinates to Wikidata - """ + + """A bot to import coordinates to Wikidata.""" + def __init__(self, generator): """ + Constructor. + Arguments: * generator - A generator that yields Page objects.
@@ -60,41 +64,41 @@ if self.prop in claim.qualifiers: return prop
- def run(self): - """Start the robot.""" - for page in self.generator: - self.current_page = page - item = pywikibot.ItemPage.fromPage(page) + def treat(self, page, item): + """Treat page/item.""" + self.current_page = page
- if item.exists(): - pywikibot.output(u'Found %s' % item.title()) - coordinate = page.coordinates(primary_only=True) + coordinate = page.coordinates(primary_only=True)
- if coordinate: - claims = item.get().get('claims') - if self.prop in claims: - pywikibot.output(u'Item %s already contains coordinates (%s)' - % (item.title(), self.prop)) - else: - prop = self.has_coord_qualifier(claims) - if prop: - pywikibot.output(u'Item %s already contains coordinates' - u' (%s) as qualifier for %s' - % (item.title(), self.prop, prop)) - else: - newclaim = pywikibot.Claim(self.repo, self.prop) - newclaim.setTarget(coordinate) - pywikibot.output(u'Adding %s, %s to %s' % (coordinate.lat, - coordinate.lon, - item.title())) - try: - item.addClaim(newclaim) + if not coordinate: + return
- source = self.getSource(page.site) - if source: - newclaim.addSource(source, bot=True) - except CoordinateGlobeUnknownException as e: - pywikibot.output(u'Skipping unsupported globe: %s' % e.args) + claims = item.get().get('claims') + if self.prop in claims: + pywikibot.output(u'Item %s already contains coordinates (%s)' + % (item.title(), self.prop)) + return + + prop = self.has_coord_qualifier(claims) + if prop: + pywikibot.output(u'Item %s already contains coordinates' + u' (%s) as qualifier for %s' + % (item.title(), self.prop, prop)) + return + + newclaim = pywikibot.Claim(self.repo, self.prop) + newclaim.setTarget(coordinate) + pywikibot.output(u'Adding %s, %s to %s' % (coordinate.lat, + coordinate.lon, + item.title())) + try: + item.addClaim(newclaim) + + source = self.getSource(page.site) + if source: + newclaim.addSource(source, bot=True) + except CoordinateGlobeUnknownException as e: + pywikibot.output(u'Skipping unsupported globe: %s' % e.args)
def main(): diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py index d26245b..cdb1275 100755 --- a/scripts/harvest_template.py +++ b/scripts/harvest_template.py @@ -1,6 +1,8 @@ #!/usr/bin/python # -*- coding: utf-8 -*- """ +Template harvesting script. + Usage:
python harvest_template.py -transcludes:"..." template_parameter PID [template_parameter PID] @@ -38,11 +40,13 @@
class HarvestRobot(WikidataBot): - """ - A bot to add Wikidata claims - """ + + """A bot to add Wikidata claims.""" + def __init__(self, generator, templateTitle, fields): """ + Constructor. + Arguments: * generator - A generator that yields Page objects. * templateTitle - The template to work on @@ -55,18 +59,10 @@ self.fields = fields self.repo = pywikibot.Site().data_repository() self.cacheSources() - - def run(self): - """Starts the robot.""" self.templateTitles = self.getTemplateSynonyms(self.templateTitle) - for page in self.generator: - try: - self.processPage(page) - except Exception as e: - pywikibot.exception(msg=e, tb=True)
def getTemplateSynonyms(self, title): - """Fetches redirects of the title, so we can check against them.""" + """Fetch redirects of the title, so we can check against them.""" temp = pywikibot.Page(pywikibot.Site(), title, ns=10) if not temp.exists(): pywikibot.error(u'Template %s does not exist.' % temp.title()) @@ -94,9 +90,12 @@ if linked_page.isRedirectPage(): linked_page = linked_page.getRedirectTarget()
- linked_item = pywikibot.ItemPage.fromPage(linked_page) + try: + linked_item = pywikibot.ItemPage.fromPage(linked_page) + except pywikibot.NoPage: + linked_item = None
- if not linked_item.exists(): + if not item or not linked_item.exists(): pywikibot.output(u'%s doesn't have a wikidata item to link with. Skipping' % (linked_page)) return
@@ -106,35 +105,31 @@
return linked_item
- def processPage(self, page): - """Process a single page.""" - item = pywikibot.ItemPage.fromPage(page) + def treat(self, page, item): + """Process a single page/item.""" self.current_page = page - if not item.exists(): - pywikibot.output('%s doesn't have a wikidata item :(' % page) - #TODO FIXME: We should provide an option to create the page - return item.get() if set(self.fields.values()) <= set(item.claims.keys()): pywikibot.output(u'%s item %s has claims for all properties. Skipping' % (page, item.title())) - else: - pagetext = page.get() - templates = textlib.extract_templates_and_params(pagetext) - for (template, fielddict) in templates: - # Clean up template - try: - template = pywikibot.Page(page.site, template, - ns=10).title(withNamespace=False) - except pywikibot.exceptions.InvalidTitle: - pywikibot.error(u"Failed parsing template; '%s' should be the template name." % template) - continue - # We found the template we were looking for - if template in self.templateTitles: - for field, value in fielddict.items(): - field = field.strip() - value = value.strip() - if not field or not value: - continue + return + + pagetext = page.get() + templates = textlib.extract_templates_and_params(pagetext) + for (template, fielddict) in templates: + # Clean up template + try: + template = pywikibot.Page(page.site, template, + ns=10).title(withNamespace=False) + except pywikibot.exceptions.InvalidTitle: + pywikibot.error(u"Failed parsing template; '%s' should be the template name." % template) + return + # We found the template we were looking for + if template in self.templateTitles: + for field, value in fielddict.items(): + field = field.strip() + value = value.strip() + if not field or not value: + return
# This field contains something useful for us if field in self.fields: @@ -153,12 +148,12 @@ match = re.search(pywikibot.link_regex, value) if not match: pywikibot.output(u'%s field %s value %s isnt a wikilink. Skipping' % (claim.getID(), field, value)) - continue + return
link_text = match.group(1) linked_item = self._template_link_target(item, link_text) if not linked_item: - continue + return
claim.setTarget(linked_item) elif claim.type == 'string': @@ -171,11 +166,11 @@ image = pywikibot.FilePage(image.getRedirectTarget()) if not image.exists(): pywikibot.output('[[%s]] doesn\'t exist so I can\'t link to it' % (image.title(),)) - continue + return claim.setTarget(image) else: pywikibot.output("%s is not a supported datatype." % claim.type) - continue + return
pywikibot.output('Adding %s --> %s' % (claim.getID(), claim.getTarget())) item.addClaim(claim) diff --git a/scripts/illustrate_wikidata.py b/scripts/illustrate_wikidata.py index 21b1a36..0a4dc1c 100644 --- a/scripts/illustrate_wikidata.py +++ b/scripts/illustrate_wikidata.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- """ Bot to add images to Wikidata items. The image is extracted from the page_props. + For this to be available the PageImages extension (https://www.mediawiki.org/wiki/Extension:PageImages) needs to be installed
@@ -27,11 +28,13 @@
class IllustrateRobot(WikidataBot): - """ - A bot to add Wikidata image claims - """ + + """A bot to add Wikidata image claims.""" + def __init__(self, generator, wdproperty=u'P18'): """ + Constructor. + Arguments: * generator - A generator that yields Page objects. * wdproperty - The property to add. Should be of type commonsMedia @@ -47,38 +50,40 @@ raise ValueError(u'%s is of type %s, should be commonsMedia' % (self.wdproperty, claim.type))
- def run(self): - """Starts the bot.""" - for page in self.generator: - self.current_page = page - item = pywikibot.ItemPage.fromPage(page) + def treat(self, page, item): + """Treat a page / item.""" + self.current_page = page
- if item.exists(): - pywikibot.output(u'Found %s' % item.title()) - imagename = page.properties().get('page_image') + pywikibot.output(u'Found %s' % item.title()) + imagename = page.properties().get('page_image')
- if imagename: - claims = item.get().get('claims') - if self.wdproperty in claims: - pywikibot.output(u'Item %s already contains image (%s)' % (item.title(), self.wdproperty)) - else: - newclaim = pywikibot.Claim(self.repo, self.wdproperty) - commonssite = pywikibot.Site("commons", "commons") - imagelink = pywikibot.Link(imagename, source=commonssite, defaultNamespace=6) - image = pywikibot.FilePage(imagelink) - if image.isRedirectPage(): - image = pywikibot.FilePage(image.getRedirectTarget()) - if not image.exists(): - pywikibot.output('[[%s]] doesn't exist so I can't link to it' % (image.title(),)) - continue - newclaim.setTarget(image) - pywikibot.output('Adding %s --> %s' % (newclaim.getID(), newclaim.getTarget())) - item.addClaim(newclaim) + if not imagename: + return
- # A generator might yield pages from multiple sites - source = self.getSource(page.site) - if source: - newclaim.addSource(source, bot=True) + claims = item.get().get('claims') + if self.wdproperty in claims: + pywikibot.output(u'Item %s already contains image (%s)' % (item.title(), self.wdproperty)) + return + + newclaim = pywikibot.Claim(self.repo, self.wdproperty) + commonssite = pywikibot.Site("commons", "commons") + imagelink = pywikibot.Link(imagename, source=commonssite, defaultNamespace=6) + image = pywikibot.FilePage(imagelink) + if image.isRedirectPage(): + image = pywikibot.FilePage(image.getRedirectTarget()) + + if not image.exists(): + pywikibot.output('[[%s]] doesn't exist so I can't link to it' % (image.title(),)) + return + + newclaim.setTarget(image) + pywikibot.output('Adding %s --> %s' % (newclaim.getID(), newclaim.getTarget())) + item.addClaim(newclaim) + + # A generator might yield pages from multiple sites + source = self.getSource(page.site) + if source: + newclaim.addSource(source, bot=True)
def main(): diff --git a/scripts/newitem.py b/scripts/newitem.py index 7210d2a..b6ea11f 100644 --- a/scripts/newitem.py +++ b/scripts/newitem.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- """ This script creates new items on Wikidata based on certain criteria. + * When was the (Wikipedia) page created? * When was the last edit on the page? * Does the page contain interwiki's? @@ -27,12 +28,13 @@ #
import pywikibot -from pywikibot import pagegenerators +from pywikibot import pagegenerators, WikidataBot from datetime import timedelta
-class NewItemRobot(pywikibot.Bot): - """ A bot to create new items """ +class NewItemRobot(WikidataBot): + + """ A bot to create new items. """
def __init__(self, generator, **kwargs): """Only accepts options defined in availableOptions.""" @@ -49,9 +51,7 @@ self.lastEdit = self.getOption('lastedit') self.pageAgeBefore = self.repo.getcurrenttime() - timedelta(days=self.pageAge) self.lastEditBefore = self.repo.getcurrenttime() - timedelta(days=self.lastEdit) - - def run(self): - """ Start the bot. """ + self.treat_missing_item = True pywikibot.output('Page age is set to %s days so only pages created' '\nbefore %s will be considered.' % (self.pageAge, self.pageAgeBefore.isoformat())) @@ -59,65 +59,63 @@ '\nbefore %s will be considered.' % (self.lastEdit, self.lastEditBefore.isoformat()))
- for page in self.generator: - self.current_page = page - if not page.exists(): - pywikibot.output(u'%s does not exist anymore. Skipping...' - % page) - continue - try: - item = pywikibot.ItemPage.fromPage(page) - except pywikibot.NoPage: - pass - else: - pywikibot.output(u'%s already has an item: %s.' % (page, item)) - if self.getOption('touch'): - pywikibot.output(u'Doing a null edit on the page.') - page.put(page.text) - continue + def treat(self, page, item): + """ Treat page/item. """ + if item and item.exists(): + pywikibot.output(u'%s already has an item: %s.' % (page, item)) + if self.getOption('touch'): + pywikibot.output(u'Doing a null edit on the page.') + page.put(page.text) + return
- if page.isRedirectPage(): - pywikibot.output(u'%s is a redirect page. Skipping.' % page) - elif page.editTime() > self.lastEditBefore: - pywikibot.output( - u'Last edit on %s was on %s.\nToo recent. Skipping.' - % (page, page.editTime().isoformat())) - else: - (revId, revTimestamp, revUser, - revComment) = page.getVersionHistory(reverseOrder=True, - total=1)[0] - if revTimestamp > self.pageAgeBefore: - pywikibot.output( - u'Page creation of %s on %s is too recent. Skipping.' - % (page, page.editTime().isoformat())) - elif page.langlinks(): - # FIXME: Implement this - pywikibot.output( - "Found language links (interwiki links).\n" - "Haven't implemented that yet so skipping.") - else: - # FIXME: i18n - summary = (u'Bot: New item with sitelink from %s' - % page.title(asLink=True, insite=self.repo)) + self.current_page = page
- data = {'sitelinks': - {page.site.dbName(): - {'site': page.site.dbName(), - 'title': page.title()} - }, - 'labels': - {page.site.lang: - {'language': page.site.lang, - 'value': page.title()} - } - } - pywikibot.output(summary) + if page.isRedirectPage(): + pywikibot.output(u'%s is a redirect page. Skipping.' % page) + return + if page.editTime() > self.lastEditBefore: + pywikibot.output( + u'Last edit on %s was on %s.\nToo recent. Skipping.' + % (page, page.editTime().isoformat())) + return
- # Create empty item object and add 'data' - item = pywikibot.ItemPage(page.site.data_repository()) - item.editEntity(data, summary=summary) - # And do a null edit to force update - page.put(page.text) + (revId, revTimestamp, revUser, + revComment) = page.getVersionHistory(reverseOrder=True, total=1)[0] + if revTimestamp > self.pageAgeBefore: + pywikibot.output( + u'Page creation of %s on %s is too recent. Skipping.' + % (page, page.editTime().isoformat())) + return + + if page.langlinks(): + # FIXME: Implement this + pywikibot.output( + "Found language links (interwiki links).\n" + "Haven't implemented that yet so skipping.") + return + + # FIXME: i18n + summary = (u'Bot: New item with sitelink from %s' + % page.title(asLink=True, insite=self.repo)) + + data = {'sitelinks': + {page.site.dbName(): + {'site': page.site.dbName(), + 'title': page.title()} + }, + 'labels': + {page.site.lang: + {'language': page.site.lang, + 'value': page.title()} + } + } + + pywikibot.output(summary) + + item = pywikibot.ItemPage(page.site.data_repository()) + item.editEntity(data, summary=summary) + # And do a null edit to force update + page.put(page.text)
def main(): diff --git a/tox.ini b/tox.ini index 3772be9..9877c43 100644 --- a/tox.ini +++ b/tox.ini @@ -36,7 +36,11 @@ ./pywikibot/data/__init__.py \ ./pywikibot/compat/userlib.py ./pywikibot/compat/catlib.py \ ./pywikibot/compat/query.py \ + ./scripts/claimit.py ./scripts/coordinate_import.py \ + ./scripts/harvest_template.py ./scripts/illustrate_wikidata.py \ + ./scripts/newitem.py \ ./tests/aspects.py + deps = flake8-docstrings
[testenv:nose]
pywikibot-commits@lists.wikimedia.org