http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10841
Revision: 10841 Author: multichill Date: 2012-12-29 15:54:22 +0000 (Sat, 29 Dec 2012) Log Message: ----------- Split JSONReader
Modified Paths: -------------- trunk/pywikipedia/data_ingestion.py
Modified: trunk/pywikipedia/data_ingestion.py =================================================================== --- trunk/pywikipedia/data_ingestion.py 2012-12-29 14:21:28 UTC (rev 10840) +++ trunk/pywikipedia/data_ingestion.py 2012-12-29 15:54:22 UTC (rev 10841) @@ -86,38 +86,52 @@
def JSONReader(baseurl, start=0, end=100, JSONBase=None, metadataFunction=None, fileurl=u'fileurl'): ''' - Loops over a bunch of json objects. - For each json page you can rebase it to not get all the crap + Loops over a bunch of json page and process them with processJSONPage(). + + Will yield Photo objects with metadata + ''' + if baseurl: + for i in range(start , end): + url = baseurl % (i,) + photo = processJSONPage(url, JSONBase=JSONBase, metadataFunction=metadataFunction, fileurl=u'fileurl') + if photo: + yield photo + + + +def processJSONPage(url, JSONBase=None, metadataFunction=None, fileurl=u'fileurl'): + ''' + Process a single JSON page. + For the JSON page you can rebase it to not get all the crap You can apply a custom metadata function to do some modification on the metadata and checking By default the field 'fileurl' is expected in the metadata to contain the file. You can change this.
- Will a Photo object with metadata + Will a return Photo object with metadata or None if something is wrong ''' - if baseurl: - for i in range(start , end): - # How to do recursion? - JSONPage = urllib.urlopen(baseurl % (i,)) - JSONData = json.load(JSONPage) - JSONPage.close() + JSONPage = urllib.urlopen(url) + JSONData = json.load(JSONPage) + JSONPage.close()
- # Rebase based on jsonBase - if JSONBase: - JSONData = JSONRebase(JSONData, JSONBase) + # Rebase based on jsonBase + if JSONBase: + JSONData = JSONRebase(JSONData, JSONBase)
- if JSONData: - # If rebasing worked, get the metadata - metadata = dict() - fieldlist = [u''] - metadata = JSONTree(metadata, [], JSONData) + if JSONData: + # If rebasing worked, get the metadata + metadata = dict() + fieldlist = [u''] + metadata = JSONTree(metadata, [], JSONData)
- # If a metadataFunction is set, apply it - if metadataFunction: - metadata = metadataFunction(metadata) + # If a metadataFunction is set, apply it + if metadataFunction: + metadata = metadataFunction(metadata)
- # If the metadataFunction didn't return none (something was wrong). Yield the photo - if metadata: - yield Photo(metadata.get(fileurl), metadata) + # If the metadataFunction didn't return none (something was wrong). Return the photo + if metadata: + return Photo(metadata.get(fileurl), metadata)
+ return False + def JSONRebase(JSONData, JSONBase): ''' Moves the base of the JSON object to the part you're intrested in.
pywikipedia-svn@lists.wikimedia.org