[Pywikipedia-l] SVN: [4662] trunk/pywikipedia/checkimages.py
filnik at svn.wikimedia.org
filnik at svn.wikimedia.org
Sun Dec 9 14:23:05 UTC 2007
Revision: 4662
Author: filnik
Date: 2007-12-09 14:23:00 +0000 (Sun, 09 Dec 2007)
Log Message:
-----------
Updated. Now it uses pagegenerator. Rewrite of some parts
Modified Paths:
--------------
trunk/pywikipedia/checkimages.py
Modified: trunk/pywikipedia/checkimages.py
===================================================================
--- trunk/pywikipedia/checkimages.py 2007-12-09 14:01:24 UTC (rev 4661)
+++ trunk/pywikipedia/checkimages.py 2007-12-09 14:23:00 UTC (rev 4662)
@@ -127,27 +127,27 @@
# if the file has an unknown extension it will be tagged with this template.
# In reality, there aren't unknown extensions; they are only not allowed... ^__^
delete_immediately = {
- 'commons':"{{db-meta|The file has .%s as extension.}}",
- 'en' :"{{db-meta|The file has .%s as extension.}}",
- 'it' :'{{cancella subito|motivo=Il file ha come estensione ".%s"}}',
- 'hu' :u'{{azonnali|A fájlnak .%s a kiterjesztése}}',
- }
+ 'commons':"{{db-meta|The file has .%s as extension.}}",
+ 'en' :"{{db-meta|The file has .%s as extension.}}",
+ 'it' :'{{cancella subito|motivo=Il file ha come estensione ".%s"}}',
+ 'hu' :u'{{azonnali|A fájlnak .%s a kiterjesztése}}',
+ }
# The header of the Unknown extension's message.
delete_immediately_head = {
- 'commons':"\n== Unknown extension! ==\n",
- 'en' :"\n== Unknown extension! ==\n",
- 'it' :'\n== File non specificato ==\n',
- 'hu' :u'\n== Ismeretlen kiterjesztésű fájl ==\n',
- }
+ 'commons':"\n== Unknown extension! ==\n",
+ 'en' :"\n== Unknown extension! ==\n",
+ 'it' :'\n== File non specificato ==\n',
+ 'hu' :u'\n== Ismeretlen kiterjesztésű fájl ==\n',
+ }
# Text that will be added if the bot finds an unknown extension.
delete_immediately_notification = {
- 'commons':'The [[:Image:%s]] file has a wrong extension, please check. ~~~~',
- 'en' :'The [[:Image:%s]] file has a wrong extension, please check. ~~~~',
- 'it' :'{{subst:Utente:Filbot/Ext|%s}}',
- 'hu' :u'A [[:Kép:%s]] fájlnak rossz a kiterjesztése, kérlek ellenőrízd. ~~~~',
- }
+ 'commons':'The [[:Image:%s]] file has a wrong extension, please check. ~~~~',
+ 'en' :'The [[:Image:%s]] file has a wrong extension, please check. ~~~~',
+ 'it' :'{{subst:Utente:Filbot/Ext|%s}}',
+ 'hu' :u'A [[:Kép:%s]] fájlnak rossz a kiterjesztése, kérlek ellenőrízd. ~~~~',
+ }
# Summary of the delete immediately. (e.g.: Adding {{db-meta|The file has .%s as extension.}})
del_comm = {
'commons':'Bot: Adding %s',
@@ -242,6 +242,7 @@
class NothingFound(wikipedia.Error):
""" An exception indicating that a regex has return [] instead of results."""
+# When the page is not a wiki-page (as for untagged generator) you need that function
def pageText(url):
try:
request = urllib2.Request(url)
@@ -252,7 +253,7 @@
response.close()
# When you load too many users, urllib2 can give this error.
except urllib2.HTTPError:
- wikipedia.output(u"Server error. Pausing for 10 seconds... " + time.strftime("%d %b %Y %H:%M:%S (UTC)", time.gmtime()) )
+ wikipedia.output(u"Server error. Pausing for 10 seconds... %s" % time.strftime("%d %b %Y %H:%M:%S (UTC)", time.gmtime()) )
time.sleep(10)
request = urllib2.Request(url)
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.12) Gecko/20050915 Firefox/1.0.7'
@@ -264,9 +265,9 @@
# Here there is the main class.
class main:
- def __init__(self, site):
+ def __init__(self, site, logFulNumber = 25000):
self.site = site
- self.logFulNumber = 25000
+ self.logFulNumber = logFulNumber
def general(self, newtext, image, notification, head, botolist):
""" This class can be called for two reason. So I need two different __init__, one with common data
and another with the data that I required... maybe it can be added on the other function, but in this way
@@ -307,7 +308,7 @@
self.report_image(rep_page, self.image, com, repme)
return False
luser = wikipedia.url2link(nick, self.site, self.site)
- pagina_discussione = self.site.namespace(3) + ':' + luser
+ pagina_discussione = "%s:%s" % (self.site.namespace(3), luser)
# Defining the talk page (pagina_discussione = talk_page ^__^ )
talk_page = wikipedia.Page(self.site, pagina_discussione)
self.talk_page = talk_page
@@ -332,7 +333,7 @@
history = talk_page.getVersionHistory(False, False, False)
latest_edit = history[0]
latest_user = latest_edit[2]
- wikipedia.output(u'The latest user that has written something is: ' + latest_user)
+ wikipedia.output(u'The latest user that has written something is: %s' % latest_user)
else:
wikipedia.output(u'The user page is blank')
@@ -360,53 +361,28 @@
else:
commentox = commx
if second_text == True:
- talk_page.put(testoattuale + "\n\n:" + notification2, comment = commentox, minorEdit = False)
+ talk_page.put("%s\n\n:%s" % (testoattuale, notification2), comment = commentox, minorEdit = False)
elif second_text == False:
talk_page.put(testoattuale + head + notification, comment = commentox, minorEdit = False)
- def run_bot(self, textrun, rep_page, com):
- # Search regular expression to find links like this (and the class attribute is optional too)
- # class="new" title="Immagine:Soldatino2.jpg">Immagine:Soldatino2.jpg</a>" <span class="comment">
- regexp = r'(class=\"new\" |)title=\"' + image_namespace + '(.*?)\.(\w\w\w|jpeg)\">.*?</a>\".*?<span class=\"comment\">'
- pos = 0
- done = list()
- ext_list = list()
- r = re.compile(regexp, re.UNICODE)
- while 1:
- m = r.search(textrun, pos)
- if m == None:
- wikipedia.output(u"\t\t>> All images checked. <<")
- break
- pos = m.end()
- new = m.group(1)
- im = m.group(2)
- ext = m.group(3)
- # This prevent pages with strange characters. They will be loaded without problem.
- image = im + "." + ext
- if new != '':
- wikipedia.output(u"Skipping %s because it has been deleted." % image)
- done.append(image)
- if image not in done:
- done.append(image)
- yield image
- #continue
-
+
def untaggedGenerator(self, untaggedProject, rep_page, com):
lang = untaggedProject.split('.', 1)[0]
- project = '.' + untaggedProject.split('.', 1)[1]
+ project = '.%s' % untaggedProject.split('.', 1)[1]
if lang == 'commons':
link = 'http://tools.wikimedia.de/~daniel/WikiSense/UntaggedImages.php?wikifam=commons.wikimedia.org&since=-100d&until=&img_user_text=&order=img_timestamp&max=100&order=img_timestamp&format=html'
else:
- link = 'http://tools.wikimedia.de/~daniel/WikiSense/UntaggedImages.php?wikilang=' + lang + '&wikifam=' + project + '&order=img_timestamp&max=' + str(limit) + '&ofs=0&max=' + str(limit)
+ link = 'http://tools.wikimedia.de/~daniel/WikiSense/UntaggedImages.php?wikilang=%s&wikifam=%s&order=img_timestamp&max=%s&ofs=0&max=%s' % (lang, project, limit, limit)
text = pageText(link)
#print text
- regexp = r"""<td valign='top' title='Name'><a href='http://.*?\..*?\.org/w/index\.php\?title=(.*?)'>.*?</a></td>"""
+ regexp = r"""<td valign='top' title='Name'><a href='http://.*?\.org/w/index\.php\?title=(.*?)'>.*?</a></td>"""
results = re.findall(regexp, text)
if results == []:
print link
raise NothingFound('Nothing found! Try to use the tool by yourself to be sure that it works!')
else:
for result in results:
- yield wikipedia.Page(self.site, result)
+ wikiPage = wikipedia.Page(self.site, result)
+ yield wikiPage
def regexGenerator(self, regexp, textrun):
pos = 0
@@ -430,9 +406,9 @@
# title="Immagine:Nvidia.jpg"
wikipedia.output(u'Checking if %s is on commons...' % image)
commons = wikipedia.getSite('commons', 'commons')
- if wikipedia.Page(commons, u'Image:' + image).exists():
+ if wikipedia.Page(commons, u'Image:%s' % image).exists():
wikipedia.output(u'%s is on commons!' % image)
- imagePage = wikipedia.ImagePage(self.site, 'Image:' + image)
+ imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % image)
on_commons_text = imagePage.getImagePageHtml()
if "<div class='sharedUploadNotice'>" in on_commons_text:
wikipedia.output(u"But, the image doesn't exist on your project! Skip...")
@@ -459,7 +435,7 @@
else:
text_get = str()
if len(text_get) >= self.logFulNumber:
- raise LogIsFull("The log page (%s) is full! Please delete the old images reported." % another_page.title())
+ raise LogIsFull("The log page (%s) is full! Please delete the old images reported." % another_page.title())
pos = 0
# The talk page includes "_" between the two names, in this way i replace them to " "
regex = image
@@ -622,7 +598,7 @@
firstPageTitle = str(wikipedia.input(u'From witch page do you want to start?'))
elif len(arg) > 6:
firstPageTitle = str(arg[7:])
- generator = wikipedia.getSite().allpages(start='Image:'+firstPageTitle)
+ generator = wikipedia.getSite().allpages(start='Image:%s' % firstPageTitle)
repeat = False
elif arg.startswith('-page:'):
if len(arg) == 6:
@@ -651,7 +627,7 @@
catName = str(wikipedia.input(u'In which category do I work?'))
elif len(arg) > 4:
catName = str(arg[5:])
- catSelected = catlib.Category(wikipedia.getSite(), 'Category:'+catName)
+ catSelected = catlib.Category(wikipedia.getSite(), 'Category:%s' % catName)
generator = pagegenerators.CategorizedPageGenerator(catSelected)
repeat = False
elif arg.startswith('-untagged'):
@@ -666,10 +642,7 @@
generator
except NameError:
normal = True
-
- # URL of the log of newimages. (http:/en.wikipedia.org/ will generated according to the project... you won't see it in the url)
- url = "/w/index.php?title=Special:Log&type=upload&user=&page=&pattern=&limit=%d&offset=0" % limit
-
+
# Define the site.
site = wikipedia.getSite()
@@ -679,7 +652,7 @@
# Block of text to translate the parameters set above.
image_n = site.image_namespace()
- image_namespace = image_n + ":"
+ image_namespace = "%s:" % image_n
unvertext = wikipedia.translate(site, n_txt)
commento = wikipedia.translate(site, comm)
commento2 = wikipedia.translate(site, comm2)
@@ -715,7 +688,7 @@
wikipedia.output(u"Your project is not supported by this script. You have to edit the script and add it!")
wikipedia.stopme()
- di = '\n' + di
+ di = '\n%s' % di
dels = dels % di
# Reading the log of the new images
@@ -725,18 +698,13 @@
else:
wikipedia.output(u"Retrieving the lastest %d files for checking..." % limit)
- while 1:
- # If I use the standard way, I have to download the page to parse it.
- if normal == True:
- textrun = site.getUrl(url)
-
+ while 1:
mainClass = main(site)
if untagged == True:
generator = mainClass.untaggedGenerator(projectUntagged, rep_page, com)
normal = False
if normal == True:
- generator = mainClass.run_bot(textrun, rep_page, com)
-
+ generator = pagegenerators.newImages(limit, site)
if urlUsed == True and regexGen == True:
textRegex = pagetext(regexPageUrl)
elif regexGen == True:
@@ -764,28 +732,27 @@
wikipedia.output(u"No additional settings found!")
if skip == True:
skip_list = list()
- wikipedia.output(u'Skipping the first ' + str(skip_number) + u' images:\n')
+ wikipedia.output(u'Skipping the first %s images:\n' % skip_number)
else:
wikipedia.output(u'\t\t>> No images to skip...<<')
skipok = False
for image in generator:
if normal == False and regexGen == False:
- if image_namespace not in image.title():
+ if image_namespace.lower() not in image.title().lower() and \
+ 'image:' not in image.title().lower():
continue
- image = image.title().split(image_namespace)[1]
- elif regexGen == True:
- image = image.split(image_namespace)[1]
+ imageName = image.title().split(image_namespace)[1]
if skip == True:
if len(skip_list) < skip_number:
- wikipedia.output(u'Skipping %s...' % image)
- skip_list.append(image)
+ wikipedia.output(u'Skipping %s...' % imageName)
+ skip_list.append(imageName)
continue
else:
if skipok == False:
wikipedia.output('')
skipok = True
if commonsActive == True:
- response = mainClass.checkImage(image)
+ response = mainClass.checkImage(imageName)
if response == False:
continue
if tupla_written != None:
@@ -794,17 +761,17 @@
parentesi = False
delete = False
tagged = False
- extension = image.split('.')[-1]
- page = image_namespace + image
- p = wikipedia.ImagePage(site, page)
+ extension = imageName.split('.')[-1]
+ # Page => ImagePage
+ p = wikipedia.ImagePage(site, image.title())
# Skip deleted images
try:
g = p.get()
except wikipedia.NoPage:
- wikipedia.output(u"Skipping %s because it has been deleted." % image)
+ wikipedia.output(u"Skipping %s because it has been deleted." % imageName)
continue
except wikipedia.IsRedirectPage:
- wikipedia.output(u"The file description for %s is a redirect?!" % image )
+ wikipedia.output(u"The file description for %s is a redirect?!" % imageName )
continue
for l in hiddentemplate:
if l.lower() in g.lower():
@@ -841,7 +808,7 @@
summary = tupla[5]
head_2 = tupla[6]
text = tupla[7]
- text = text % image
+ text = text % imageName
mexCatched = tupla[8]
wikipedia.setAction(summary)
del tupla[0:8]
@@ -867,9 +834,9 @@
mex_used = mexCatched
continue
if p.exists():
- # Here there is the checkin ^^
+ # Here begins the check block.
if tagged == True:
- wikipedia.output(image + u' is already tagged... ' + time.strftime("%H:%M:%S", time.localtime()))
+ wikipedia.output(u'%s is already tagged... %s' % (imageName, time.strftime("%H:%M:%S", time.localtime())))
continue
if some_problem == True:
if mex_used in g:
@@ -884,13 +851,13 @@
reported = True
if reported == True:
#if imagestatus_used == True:
- report(mex_used, image, text_used, "\n" + head_used + "\n", None, imagestatus_used, summary_used)
+ report(mex_used, imageName, text_used, "\n%s\n" % head_used, None, imagestatus_used, summary_used)
else:
wikipedia.output(u"Skipping the image...")
some_problem = False
continue
elif parentesi == True:
- wikipedia.output(image + u" seems ok, " + time.strftime("%H:%M:%S", time.localtime()))
+ wikipedia.output(u"%s seems ok, %s" % (imageName, time.strftime("%H:%M:%S", time.localtime())))
# It works also without this... but i want only to be sure ^^
parentesi = False
continue
@@ -901,7 +868,7 @@
canctext = di % extension
notification = din % image
head = dih
- report(canctext, image, notification, head)
+ report(canctext, imageName, notification, head)
delete = False
continue
elif g in nothing:
@@ -914,7 +881,7 @@
notification = nn
else:
notification = nn % image
- report(unvertext, image, notification, head, smwl)
+ report(unvertext, imageName, notification, head, smwl)
continue
else:
wikipedia.output(u"%s has only text and not the specific license..." % image)
@@ -926,11 +893,11 @@
notification = nn
else:
notification = nn % image
- report(unvertext, image, notification, head, smwl)
+ report(unvertext, imageName, notification, head, smwl)
continue
# A little block to perform the repeat or to break.
if repeat == True:
- wikipedia.output(u"Waiting for " + str(time_sleep) + u" seconds, " + time.strftime("%d %b %Y %H:%M:%S (UTC)", time.localtime()) )
+ wikipedia.output(u"Waiting for %s seconds, %s" % (time_sleep, time.strftime("%d %b %Y %H:%M:%S (UTC)", time.localtime())))
time.sleep(time_sleep)
elif repeat == False:
wikipedia.output(u"\t\t\t>> STOP! <<")
More information about the Pywikipedia-l
mailing list