Revision: 7194
Author: purodha
Date: 2009-08-31 21:52:10 +0000 (Mon, 31 Aug 2009)
Log Message:
-----------
Applying patch #2809532 by nakor-wikipedia, fixing the interwiki sort order on the fy.wikipedia.
Modified Paths:
--------------
trunk/pywikipedia/families/wikipedia_family.py
Modified: trunk/pywikipedia/families/wikipedia_family.py
===================================================================
--- trunk/pywikipedia/families/wikipedia_family.py 2009-08-31 20:49:50 UTC (rev 7193)
+++ trunk/pywikipedia/families/wikipedia_family.py 2009-08-31 21:52:10 UTC (rev 7194)
@@ -928,6 +928,10 @@
# Order for fy: alphabetical by code, but y counts as i
def fycomp(x,y):
+ if x == 'nb':
+ x = 'no'
+ if y == 'nb':
+ y = 'no'
x = x.replace("y","i")+x.count("y")*"!"
y = y.replace("y","i")+y.count("y")*"!"
return cmp(x,y)
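For reference, the comparator reads as follows once the patch is applied; this is a standalone sketch (Python 2, matching the codebase; the sample code list is hypothetical). 'nb' is compared as if it were 'no', and every 'y' is compared as 'i'; the '!' markers appended per 'y' keep a code containing 'y' sorted just after the same code spelled with 'i'.

def fycomp(x, y):
    if x == 'nb':
        x = 'no'
    if y == 'nb':
        y = 'no'
    x = x.replace("y", "i") + x.count("y") * "!"
    y = y.replace("y", "i") + y.count("y") * "!"
    return cmp(x, y)

codes = ['is', 'yi', 'nb', 'ia']        # hypothetical sample codes
print sorted(codes, cmp=fycomp)         # ['ia', 'yi', 'is', 'nb']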
Revision: 7192
Author: alexsh
Date: 2009-08-31 20:00:38 +0000 (Mon, 31 Aug 2009)
Log Message:
-----------
Page().getVersionHistory():
* Remove duplicated code.
* Use a temporary list to collect revisions, then dump it into self._versionhistory or self._versionhistoryearliest once processing is done.
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2009-08-31 15:40:11 UTC (rev 7191)
+++ trunk/pywikipedia/wikipedia.py 2009-08-31 20:00:38 UTC (rev 7192)
@@ -2283,7 +2283,6 @@
unless getAll is True.
"""
- site = self.site()
# regular expression matching one edit in the version history.
# results will have 4 groups: oldid, edit date/time, user name, and edit
@@ -2297,6 +2296,7 @@
startFromPage = None
thisHistoryDone = False
skip = False # Used in determining whether we need to skip the first page
+ dataQuery = []
RLinkToNextPage = re.compile('&offset=(.*?)&')
@@ -2309,6 +2309,7 @@
# Cause a reload, or at least make the loop run
thisHistoryDone = False
skip = True
+ dataQuery = self._versionhistoryearliest
else:
thisHistoryDone = True
elif not hasattr(self, '_versionhistory') or forceReload:
@@ -2317,11 +2318,12 @@
# Cause a reload, or at least make the loop run
thisHistoryDone = False
skip = True
+ dataQuery = self._versionhistory
else:
thisHistoryDone = True
while not thisHistoryDone:
- path = site.family.version_history_address(self.site().language(), self.urlname(), revCount)
+ path = self.site().family.version_history_address(self.site().language(), self.urlname(), revCount)
if reverseOrder:
path += '&dir=prev'
@@ -2341,108 +2343,60 @@
else:
output(u'Getting version history of %s' % self.aslink(forceInterwiki = True))
- txt = site.getUrl(path)
+ txt = self.site().getUrl(path)
# save a copy of the text
self_txt = txt
- if reverseOrder:
- # If we are getting all of the page history...
- if getAll:
- if len(self._versionhistoryearliest) == 0:
- matchObj = RLinkToNextPage.search(self_txt)
- if matchObj:
- startFromPage = matchObj.group(1)
- else:
- thisHistoryDone = True
-
- edits = editR.findall(self_txt)
- edits.reverse()
- for edit in edits:
- self._versionhistoryearliest.append(edit)
- if len(edits) < revCount:
- thisHistoryDone = True
- else:
- if not skip:
- edits = editR.findall(self_txt)
- edits.reverse()
- for edit in edits:
- self._versionhistoryearliest.append(edit)
- if len(edits) < revCount:
- thisHistoryDone = True
-
- matchObj = RLinkToNextPage.search(self_txt)
- if matchObj:
- startFromPage = matchObj.group(1)
- else:
- thisHistoryDone = True
-
- else:
- # Skip the first page only,
- skip = False
-
- matchObj = RLinkToNextPage.search(self_txt)
- if matchObj:
- startFromPage = matchObj.group(1)
- else:
- thisHistoryDone = True
+ # If we are getting all of the page history...
+ if getAll:
+ #Find the nextPage link; if it does not exist, this is the last history page
+ matchObj = RLinkToNextPage.search(self_txt)
+ if matchObj:
+ startFromPage = matchObj.group(1)
else:
- # If we are not getting all, we stop on the first page.
- for edit in editR.findall(self_txt):
- self._versionhistoryearliest.append(edit)
- self._versionhistoryearliest.reverse()
-
thisHistoryDone = True
- else:
- # If we are getting all of the page history...
- if getAll:
- if len(self._versionhistory) == 0:
- matchObj = RLinkToNextPage.search(self_txt)
- if matchObj:
- startFromPage = matchObj.group(1)
- else:
- thisHistoryDone = True
+ if len(dataQuery) == 0:
+ edits = editR.findall(self_txt)
+ if reverseOrder:
+ edits.reverse()
+ #for edit in edits:
+ dataQuery.extend([edit for edit in edits])
+ if len(edits) < revCount:
+ thisHistoryDone = True
+ else:
+ if not skip:
edits = editR.findall(self_txt)
- for edit in edits:
- self._versionhistory.append(edit)
+ if reverseOrder:
+ edits.reverse()
+ #for edit in edits:
+ dataQuery.extend([edit for edit in edits])
if len(edits) < revCount:
thisHistoryDone = True
else:
- if not skip:
- edits = editR.findall(self_txt)
- for edit in edits:
- self._versionhistory.append(edit)
- if len(edits) < revCount:
- thisHistoryDone = True
+ # Skip the first page only,
+ skip = False
+ else:
+ # If we are not getting all, we stop on the first page.
+ #for edit in editR.findall(self_txt):
+ dataQuery.extend([edit for edit in editR.findall(self_txt)] )
+ if reverseOrder:
+ dataQuery.reverse()
+ thisHistoryDone = True
- matchObj = RLinkToNextPage.findall(self_txt)
- if len(matchObj) >= 2:
- startFromPage = matchObj[1]
- else:
- thisHistoryDone = True
- else:
- # Skip the first page only,
- skip = False
-
- matchObj = RLinkToNextPage.search(self_txt)
- if matchObj:
- startFromPage = matchObj.group(1)
- else:
- thisHistoryDone = True
- else:
- # If we are not getting all, we stop on the first page.
- for edit in editR.findall(self_txt):
- self._versionhistory.append(edit)
-
- thisHistoryDone = True
-
if reverseOrder:
# Return only revCount edits, even if the version history is extensive
+ if dataQuery != []:
+ self._versionhistoryearliest = dataQuery
+ del dataQuery
if len(self._versionhistoryearliest) > revCount and not getAll:
return self._versionhistoryearliest[0:revCount]
return self._versionhistoryearliest
+ if dataQuery != []:
+ self._versionhistory = dataQuery
+ del dataQuery
# Return only revCount edits, even if the version history is extensive
if len(self._versionhistory) > revCount and not getAll:
return self._versionhistory[0:revCount]
Revision: 7191
Author: multichill
Date: 2009-08-31 15:40:11 +0000 (Mon, 31 Aug 2009)
Log Message:
-----------
More comments and prevent naming collisions
Modified Paths:
--------------
trunk/pywikipedia/flickrripper.py
Modified: trunk/pywikipedia/flickrripper.py
===================================================================
--- trunk/pywikipedia/flickrripper.py 2009-08-31 15:09:31 UTC (rev 7190)
+++ trunk/pywikipedia/flickrripper.py 2009-08-31 15:40:11 UTC (rev 7191)
@@ -42,6 +42,9 @@
def getPhoto(flickr = None, photo_id = ''):
'''
Get the photo info and the photo sizes so we can use these later on
+
+ TODO: Add exception handling
+
'''
photoInfo = flickr.photos_getInfo(photo_id=photo_id)
#xml.etree.ElementTree.dump(photoInfo)
@@ -52,6 +55,8 @@
def isAllowedLicense(photoInfo = None):
'''
Check if the image contains the right license
+
+ TODO: Maybe add more licenses
'''
license = photoInfo.find('photo').attrib['license']
if license == '4' or license == '5':
@@ -72,10 +77,20 @@
return url
def downloadPhoto(photoUrl=''):
+ '''
+ Download the photo and store it in a StringIO.StringIO object.
+
+ TODO: Add exception handling
+ '''
imageFile=urllib.urlopen(photoUrl).read()
return StringIO.StringIO(imageFile)
def findDuplicateImages(photo=None, site=wikipedia.getSite()):
+ '''
+ Takes the photo, calculates the SHA1 hash and asks the MediaWiki API for a list of duplicates.
+
+ TODO: Add exception handling, fix site thing
+ '''
result = []
hashObject = hashlib.sha1()
hashObject.update(photo.getvalue())
@@ -87,7 +102,7 @@
'aisha1' : sha1Hash,
'aiprop' : '',
}
- data = query.GetData(params, wikipedia.getSite(), useAPI = True, encodeTitle = False)
+ data = query.GetData(params, site=wikipedia.getSite(), useAPI = True, encodeTitle = False)
for image in data['query']['allimages']:
result.append(image['name'])
return result
@@ -105,6 +120,8 @@
def getFlinfoDescription(photo_id = 0):
'''
Get the description from http://wikipedia.ramselehof.de/flinfo.php
+
+ TODO: Add exception handling, try a couple of times
'''
parameters = urllib.urlencode({'id' : photo_id, 'raw' : 'on'})
@@ -113,9 +130,11 @@
#print rawDescription.decode('utf-8')
return rawDescription.decode('utf-8')
-def getFilename(photoInfo=None):
+def getFilename(photoInfo=None, site=wikipedia.getSite()):
'''
- Build a good filename for the upload based on the username and the title
+ Build a good filename for the upload based on the username and the title.
+ Prevents naming collisions.
+
'''
username = photoInfo.find('photo').find('owner').attrib['username']
title = photoInfo.find('photo').find('title').text
@@ -124,9 +143,20 @@
else:
title = u''
- return u'Flickr - %s - %s.jpg' % (username, title)
+ if (wikipedia.Page(title=u'File:Flickr - %s - %s.jpg' % (username, title), site=wikipedia.getSite()).exists()):
+ i = 1
+ while True:
+ if (wikipedia.Page(title=u'File:Flickr - %s - %s (%s).jpg' % (username, title, str(i)), site=wikipedia.getSite()).exists()):
+ i = i + 1
+ else:
+ return u'Flickr - %s - %s (%s).jpg' % (username, title, str(i))
+ else:
+ return u'Flickr - %s - %s.jpg' % (username, title)
def cleanUpTitle(title):
+ '''
+ Clean up the title of a potential MediaWiki page; otherwise the software might not allow the title.
+ '''
title = title.strip()
title = re.sub("[<{\\[]", "(", title)
@@ -168,6 +198,9 @@
return description
def processPhoto(flickr=None, photo_id=u'', flickrreview=False, reviewer=u'', override=u''):
+ '''
+ Process a single Flickr photo
+ '''
if(photo_id):
print photo_id
(photoInfo, photoSizes) = getPhoto(flickr=flickr, photo_id=photo_id)
@@ -202,6 +235,9 @@
return 0
class Tkdialog:
+ '''
+ The user dialog.
+ '''
def __init__(self, photoDescription, photo, filename):
self.root=Tk()
#"%dx%d%+d%+d" % (width, height, xoffset, yoffset)
@@ -257,6 +293,9 @@
self.descriptionScrollbar.grid(row=14, column=5)
def getImage(self, photo, width, height):
+ '''
+ Take the StringIO object and build an ImageTk thumbnail
+ '''
image = Image.open(photo)
image.thumbnail((width, height))
imageTk = ImageTk.PhotoImage(image)
@@ -285,8 +324,12 @@
return (self.photoDescription, self.filename, self.skip)
def getPhotos(flickr=None, user_id=u'', group_id=u'', photoset_id=u'', tags=u''):
+ '''
+ Loop over a set of Flickr photos.
+ '''
result = []
# http://www.flickr.com/services/api/flickr.groups.pools.getPhotos.html
+ # Get the photos in a group
if(group_id):
#First get the total number of photos in the group
photos = flickr.groups_pools_getPhotos(group_id=group_id, user_id=user_id, tags=tags, per_page='100', page='1')
@@ -297,6 +340,7 @@
yield photo.attrib['id']
# http://www.flickr.com/services/api/flickr.photosets.getPhotos.html
+ # Get the photos in a photoset
elif(photoset_id):
photos = flickr.photosets_getPhotos(photoset_id=photoset_id, per_page='100', page='1')
pages = photos.find('photos').attrib['pages']
@@ -306,6 +350,7 @@
yield photo.attrib['id']
# http://www.flickr.com/services/api/flickr.people.getPublicPhotos.html
+ # Get the (public) photos uploaded by a user
elif(user_id):
photos = flickr.people_getPublicPhotos(user_id=user_id, per_page='100', page='1')
pages = photos.find('photos').attrib['pages']
@@ -316,6 +361,11 @@
return
def usage():
+ '''
+ Print usage information
+
+ TODO : Need more.
+ '''
wikipedia.output(u"Flickrripper is a tool to transfer flickr photos to Wikimedia Commons")
wikipedia.output(u"-group_id:<group_id>\n")
wikipedia.output(u"-photoset_id:<photoset_id>\n")
Revision: 7189
Author: multichill
Date: 2009-08-31 11:00:07 +0000 (Mon, 31 Aug 2009)
Log Message:
-----------
Raised the delay to 2 hours.
Modified Paths:
--------------
trunk/pywikipedia/imageuncat.py
Modified: trunk/pywikipedia/imageuncat.py
===================================================================
--- trunk/pywikipedia/imageuncat.py 2009-08-31 09:30:38 UTC (rev 7188)
+++ trunk/pywikipedia/imageuncat.py 2009-08-31 11:00:07 UTC (rev 7189)
@@ -1254,7 +1254,7 @@
return pagegenerators.PagesFromTitlesGenerator(result, site)
-def recentChanges(site = None, delay=60, block=70):
+def recentChanges(site = None, delay=0, block=70):
'''
Return a pagegenerator containing all the images edited in a certain timespan.
The delay is the number of minutes to wait, and the block is the timespan to return images in.
@@ -1348,7 +1348,7 @@
if arg.startswith('-yesterday'):
generator = uploadedYesterday(site)
elif arg.startswith('-recentchanges'):
- generator = recentChanges(site)
+ generator = recentChanges(site=site, delay=120)
else:
genFactory.handleArg(arg)
Revision: 7185
Author: a_engels
Date: 2009-08-28 20:52:34 +0000 (Fri, 28 Aug 2009)
Log Message:
-----------
Adding command-line option -lack, to work only on pages lacking an interwiki link to a specified language.
Modified Paths:
--------------
trunk/pywikipedia/interwiki.py
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2009-08-28 17:09:46 UTC (rev 7184)
+++ trunk/pywikipedia/interwiki.py 2009-08-28 20:52:34 UTC (rev 7185)
@@ -81,6 +81,11 @@
-skipauto use to skip all pages that can be translated automatically,
like dates, centuries, months, etc. (note: without ending colon)
+ -lack: used as -lack:xx with xx a language code: only work on pages
+ without links to language xx. You can also add a number nn
+ like -lack:xx:nn, so that the bot only works on pages with
+ at least nn interwiki links (the default value for nn is 1).
+
These arguments are useful to provide hints to the bot:
-hint: used as -hint:de:Anweisung to give the robot a hint
@@ -506,6 +511,8 @@
nobackonly = False
hintsareright = False
contentsondisk = config.interwiki_contents_on_disk
+ lacklanguage = None
+ minlinks = 0
class StoredPage(wikipedia.Page):
"""
@@ -745,6 +752,7 @@
self.untranslated = None
self.hintsAsked = False
self.forcedStop = False
+ self.workonme = True
def getFoundDisambig(self, site):
"""
@@ -1007,6 +1015,9 @@
f.close()
def askForHints(self, counter):
+ if not self.workonme:
+ # Do not ask hints for pages that we don't work on anyway
+ return
if (self.untranslated or globalvar.askhints) and not self.hintsAsked and not self.originPage.isRedirectPage():
# Only once!
self.hintsAsked = True
@@ -1138,6 +1149,13 @@
if globalvar.untranslatedonly:
# Ignore the interwiki links.
iw = ()
+ if globalvar.lacklanguage:
+ if globalvar.lacklanguage in [link.site().language() for link in iw]:
+ iw = ()
+ self.workonme = False
+ if len(iw) < globalvar.minlinks:
+ iw = ()
+ self.workonme = False
elif globalvar.autonomous and duplicate:
@@ -1324,6 +1342,8 @@
be told to make another get request first."""
if not self.isDone():
raise "Bugcheck: finish called before done"
+ if not self.workonme:
+ return
if self.forcedStop:
wikipedia.output(u"======Aborted processing %s======" % self.originPage.aslink(True))
return
@@ -2080,6 +2100,13 @@
globalvar.minsubjects = int(arg[7:])
elif arg.startswith('-query:'):
globalvar.maxquerysize = int(arg[7:])
+ elif arg.startswith('-lack:'):
+ remainder = arg[6:].split(':')
+ globalvar.lacklanguage = remainder[0]
+ if len(remainder) > 1:
+ globalvar.minlinks = int(remainder[1])
+ else:
+ globalvar.minlinks = 1
elif arg == '-back':
globalvar.nobackonly = True
else:
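For clarity, the new '-lack' parsing as a standalone sketch (variable names from the diff; the surrounding option loop is omitted):

def parseLack(arg):
    # '-lack:xx' or '-lack:xx:nn'; nn defaults to 1.
    remainder = arg[6:].split(':')
    lacklanguage = remainder[0]
    if len(remainder) > 1:
        minlinks = int(remainder[1])
    else:
        minlinks = 1
    return lacklanguage, minlinks

print parseLack('-lack:de')      # -> ('de', 1)
print parseLack('-lack:de:3')    # -> ('de', 3)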