http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10840
Revision: 10840
Author:   multichill
Date:     2012-12-29 14:21:28 +0000 (Sat, 29 Dec 2012)
Log Message:
-----------
Add JSON support. This is used for the RCE batch upload.
Modified Paths:
--------------
    trunk/pywikipedia/data_ingestion.py
Modified: trunk/pywikipedia/data_ingestion.py
===================================================================
--- trunk/pywikipedia/data_ingestion.py	2012-12-29 13:16:16 UTC (rev 10839)
+++ trunk/pywikipedia/data_ingestion.py	2012-12-29 14:21:28 UTC (rev 10840)
@@ -4,11 +4,11 @@
 A generic bot to do data ingestion (batch uploading) to Commons
 
 '''
-import pywikibot
+import pywikibot, upload
 import posixpath, urlparse
 import urllib
 import hashlib, base64
-import StringIO
+import StringIO, json
 
 class Photo(object):
     '''
@@ -64,7 +64,7 @@
         params = {}
         params.update(self.metadata)
         params.update(extraparams)
-        description = u'{{%s\n' % template
+        description = u'{{subst:%s|subst=subst:\n' % template
         for key in sorted(params.keys()):
             value = params[key]
             if not key.startswith("_"):
@@ -83,6 +83,80 @@
     for line in reader:
         yield Photo(line[urlcolumn], line)
+
+def JSONReader(baseurl, start=0, end=100, JSONBase=None, metadataFunction=None, fileurl=u'fileurl'):
+    '''
+    Loops over a series of JSON pages.
+    For each JSON page you can rebase it to skip the parts you don't need.
+    You can apply a custom metadata function to modify and validate the metadata.
+    By default the field 'fileurl' is expected in the metadata to contain the file. You can change this.
+
+    Yields a Photo object with metadata.
+    '''
+    if baseurl:
+        for i in range(start, end):
+            # How to do recursion?
+            JSONPage = urllib.urlopen(baseurl % (i,))
+            JSONData = json.load(JSONPage)
+            JSONPage.close()
+
+            # Rebase based on JSONBase
+            if JSONBase:
+                JSONData = JSONRebase(JSONData, JSONBase)
+
+            if JSONData:
+                # If rebasing worked, get the metadata
+                metadata = dict()
+                fieldlist = [u'']
+                metadata = JSONTree(metadata, [], JSONData)
+
+                # If a metadataFunction is set, apply it
+                if metadataFunction:
+                    metadata = metadataFunction(metadata)
+
+                # If the metadataFunction didn't return None (something was wrong), yield the photo
+                if metadata:
+                    yield Photo(metadata.get(fileurl), metadata)
+
+def JSONRebase(JSONData, JSONBase):
+    '''
+    Moves the base of the JSON object to the part you're interested in.
+    JSONBase is a list to crawl the tree. If one of the steps is not found, return None.
+    '''
+    for step in JSONBase:
+        if JSONData:
+            if type(JSONData) == dict:
+                JSONData = JSONData.get(step)
+            elif type(JSONData) == list:
+                # FIXME: Needs error, length etc checking
+                JSONData = JSONData[step]
+
+    return JSONData
+
+
+def JSONTree(metadata, fieldlist, record):
+    '''
+    metadata: Dict with the end result
+    fieldlist: List of keys encountered so far; joined with '_' to form the flattened key
+    record: Record to work on
+    '''
+    if type(record) == list:
+        for r in record:
+            metadata = JSONTree(metadata, fieldlist, r)
+    elif type(record) == dict:
+        for k, v in record.items():
+            metadata = JSONTree(metadata, fieldlist + [k], v)
+    elif type(record) == unicode:
+        key = u'_'.join(fieldlist)
+        if key not in metadata:
+            metadata[key] = record
+        else:
+            newkey = key + u'_2'
+            if newkey not in metadata:
+                metadata[newkey] = record
+
+    return metadata
+
 class DataIngestionBot:
     def __init__(self, reader, titlefmt, pagefmt, site=pywikibot.getSite(u'commons', u'commons')):
         self.reader = reader
@@ -93,7 +93,6 @@
     def _doUpload(self, photo):
         duplicates = photo.findDuplicateImages(self.site)
         if duplicates:
-            pywikibot.output(u"Skipping duplicate of %r" % (duplicates, ))
             return duplicates[0]
 
         title = photo.getTitle(self.titlefmt)
@@ -104,6 +177,7 @@
                                  useFilename = title,
                                  keepFilename = True,
                                  verifyDescription = False,
+                                 ignoreWarning=True,
                                  targetSite = self.site)
         bot._contents = photo.downloadPhoto().getvalue()
         bot._retrieved = True
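
The getDescription change above wraps the page template in subst: so it is expanded at save time. A minimal sketch of the header the method now builds; the template name and params are hypothetical, and the per-field layout is an assumption since that part of getDescription sits outside this hunk:

    # -*- coding: utf-8 -*-
    # Sketch of the description header after this change (Python 2).
    # 'ExampleTemplate', params and the '|key=value' layout are assumed.
    template = u'ExampleTemplate'
    params = {u'creator': u'Anonymous', u'_internal': u'skipped'}

    description = u'{{subst:%s|subst=subst:\n' % template
    for key in sorted(params.keys()):
        if not key.startswith("_"):
            # Fields starting with '_' are internal and never emitted.
            description += u'|%s=%s\n' % (key, params[key])
    description += u'}}'

    print description
    # {{subst:ExampleTemplate|subst=subst:
    # |creator=Anonymous
    # }}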
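To see what the two new JSON helpers do together, here is a self-contained demo on a made-up record; it assumes trunk/pywikipedia is on sys.path so data_ingestion imports:

    # -*- coding: utf-8 -*-
    # Demo of JSONRebase/JSONTree on a hypothetical record (Python 2).
    from data_ingestion import JSONRebase, JSONTree

    data = {
        u'response': {
            u'records': [
                {u'title': u'Example photo',
                 u'fileurl': u'http://example.org/a.jpg',
                 u'creator': {u'name': u'Anonymous'}},
            ],
        },
    }

    # Crawl down to the list of records.
    records = JSONRebase(data, [u'response', u'records'])

    # Flatten the first record: nested dict keys are joined with '_'.
    metadata = JSONTree({}, [], records[0])
    print metadata
    # {u'title': u'Example photo',
    #  u'fileurl': u'http://example.org/a.jpg',
    #  u'creator_name': u'Anonymous'}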
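And a hedged sketch of wiring JSONReader into DataIngestionBot for a batch run. The API URL, JSONBase, title format and page template are invented for illustration, and the loop drives _doUpload directly since that method appears in the diff above:

    # -*- coding: utf-8 -*-
    # Hypothetical batch-upload wiring (Python 2); all names are examples only.
    import data_ingestion

    def checkMetadata(metadata):
        # Returning None makes JSONReader skip the record.
        if not metadata.get(u'fileurl'):
            return None
        return metadata

    reader = data_ingestion.JSONReader(u'http://api.example.org/records/%d.json',
                                       start=0, end=100,
                                       JSONBase=[u'record'],
                                       metadataFunction=checkMetadata)
    bot = data_ingestion.DataIngestionBot(reader,
                                          u'%(title)s.jpg',     # assumed %-style title format
                                          u'ExampleIngestionTemplate')
    for photo in bot.reader:
        bot._doUpload(photo)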