http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10988
Revision: 10988
Author: xqt
Date: 2013-01-27 11:10:19 +0000 (Sun, 27 Jan 2013)
Log Message:
-----------
A new parent class for all wikimedia families
Modified Paths:
--------------
trunk/pywikipedia/family.py
Modified: trunk/pywikipedia/family.py
===================================================================
--- trunk/pywikipedia/family.py 2013-01-27 10:47:27 UTC (rev 10987)
+++ trunk/pywikipedia/family.py 2013-01-27 11:10:19 UTC (rev 10988)
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
-# (C) Pywikipedia bot team, 2004-2012
+# (C) Pywikipedia bot team, 2004-2013
#
# Distributed under the terms of the MIT license.
#
@@ -14,9 +14,9 @@
import config
import wikipedia as pywikibot
+
# Parent class for all wiki families
-
-class Family:
+class Family(object):
def __init__(self):
self.name = None
# For interwiki sorting order see
@@ -3234,7 +3234,7 @@
#},
}
- self.namespacesWithSubpage = [2] + range(1, 16, 2)
+ self.namespacesWithSubpage = [2] + range(1, 16, 2)
# letters that can follow a wikilink and are regarded as part of
# this link
@@ -4374,3 +4374,11 @@
"""Does a conversion on the text to insert on the wiki
i.e. Esperanto X-conversion """
return putText
+
+
+# Parent class for all wikimedia families
+class WikimediaFamily(Family):
+ def __init__(self):
+ super(WikimediaFamily, self).__init__()
+
+ self.namespacesWithSubpage.extend([4, 12])
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10987
Revision: 10987
Author: xqt
Date: 2013-01-27 10:47:27 +0000 (Sun, 27 Jan 2013)
Log Message:
-----------
enable shortened (relative) links for namespaces with subpages and return the right page with linkedPage() (bug #3602219)
Modified Paths:
--------------
trunk/pywikipedia/family.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/family.py
===================================================================
--- trunk/pywikipedia/family.py 2013-01-26 17:23:40 UTC (rev 10986)
+++ trunk/pywikipedia/family.py 2013-01-27 10:47:27 UTC (rev 10987)
@@ -3234,6 +3234,8 @@
#},
}
+ self.namespacesWithSubpage = [2] + range(1, 16, 2)
+
# letters that can follow a wikilink and are regarded as part of
# this link
# This depends on the linktrail setting in LanguageXx.php and on
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2013-01-26 17:23:40 UTC (rev 10986)
+++ trunk/pywikipedia/wikipedia.py 2013-01-27 10:47:27 UTC (rev 10987)
@@ -2717,6 +2717,14 @@
for match in Rlink.finditer(thistxt):
title = match.group('title')
title = title.replace("_", " ").strip(" ")
+ if self.namespace() in self.site.namespacesWithSubpage:
+ # convert relative link to absolute link
+ if title.startswith(".."):
+ parts = self.title().split('/')
+ parts.pop()
+ title = u'/'.join(parts) + title[2:]
+ elif title.startswith("/"):
+ title = u'%s/%s' % (self.title(), title[1:])
if title.startswith("#"):
# this is an internal section link
continue
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10985
Revision: 10985
Author: xqt
Date: 2013-01-25 15:11:32 +0000 (Fri, 25 Jan 2013)
Log Message:
-----------
remove obsolete code, obsolete comments and commented-out code;
some PEP 8
Modified Paths:
--------------
trunk/pywikipedia/checkimages.py
Modified: trunk/pywikipedia/checkimages.py
===================================================================
--- trunk/pywikipedia/checkimages.py 2013-01-25 14:33:32 UTC (rev 10984)
+++ trunk/pywikipedia/checkimages.py 2013-01-25 15:11:32 UTC (rev 10985)
@@ -541,25 +541,7 @@
pywikibot.output(u"%s%s" % (message, time_zone))
-class Global(object):
- # default environment settings
- # Command line configurable parameters
- repeat = True # Restart after having check all the images?
- limit = 80 # How many images check?
- time_sleep = 30 # How many time sleep after the check?
- skip_number = 0 # How many images to skip before checking?
- waitTime = 0 # How many time sleep before the check?
- commonsActive = False # Check if on commons there's an image with the same name?
- normal = False # Check the new images or use another generator?
- urlUsed = False # Use the url-related function instead of the new-pages generator
- regexGen = False # Use the regex generator
- untagged = False # Use the untagged generator
- duplicatesActive = False # Use the duplicate option
- duplicatesReport = False # Use the duplicate-report option
- sendemailActive = False # Use the send-email
- logFullError = True # Raise an error when the log is full
-
class checkImagesBot(object):
def __init__(self, site, logFulNumber=25000, sendemailActive=False,
duplicatesReport=False, logFullError=True):
@@ -706,7 +688,6 @@
u"Seems that %s has only the description and not the file..."
% self.image_to_report)
repme = u"\n*[[:File:%s]] problems '''with the APIs'''"
- # We have a problem! Report and exit!
self.report_image(self.image_to_report, self.rep_page, self.com,
repme)
return False
@@ -786,7 +767,7 @@
try:
emailText = emailPage.get()
except (pywikibot.NoPage, pywikibot.IsRedirectPage):
- return # Exit
+ return
if self.sendemailActive:
text_to_send = re.sub(r'__user-nickname__', r'%s'
% self.luser, emailText)
@@ -795,7 +776,7 @@
emailClass.sendMail(emailSubj, text_to_send)
except userlib.UserActionRefuse:
pywikibot.output("User is not mailable, aborted")
- return # exit
+ return
def untaggedGenerator(self, untaggedProject, limit):
""" Generator that yield the files without license. It's based on a
@@ -842,9 +823,6 @@
self.hiddentemplates.add(pywikibot.Page(self.site,
u'Template:%s' % langK))
- # The template #if: and #switch: aren't something to care about
- #self.hiddentemplates.extend([u'#if:', u'#switch:']) FIXME
-
# Hidden template loading
if self.pageHidden:
try:
@@ -921,7 +899,8 @@
def checkImageOnCommons(self):
""" Checking if the file is on commons """
- pywikibot.output(u'Checking if [[%s]] is on commons...' % self.imageName)
+ pywikibot.output(u'Checking if [[%s]] is on commons...'
+ % self.imageName)
commons_site = pywikibot.getSite('commons', 'commons')
regexOnCommons = r"\[\[:File:%s\]\] is also on '''Commons''': \[\[commons:File:.*?\]\](?: \(same name\)|)$" \
% re.escape(self.imageName)
@@ -1079,7 +1058,8 @@
# Two iteration: report the "problem" to the user only once (the last)
if len(images_to_tag_list) > 1:
for image_to_tag in images_to_tag_list[:-1]:
- already_reported_in_past = self.countEdits(u'File:%s' % image_to_tag, self.botolist)
+ already_reported_in_past = self.countEdits(
+ u'File:%s' % image_to_tag, self.botolist)
# if you want only one edit, the edit found should be more than 0 -> num - 1
if already_reported_in_past > duplicates_rollback - 1:
only_report = True
@@ -1100,7 +1080,8 @@
from_regex = r'\n\*\[\[:File:%s\]\]' \
% re.escape(self.convert_to_url(self.imageName))
# Delete the image in the list where we're write on
- text_for_the_report = re.sub(from_regex, '', text_for_the_report)
+ text_for_the_report = re.sub(from_regex, '',
+ text_for_the_report)
# if you want only one edit, the edit found should be more than 0 -> num - 1
if already_reported_in_past > duplicates_rollback - 1:
only_report = True
@@ -1227,7 +1208,6 @@
pywikibot.output(u"The settings' page doesn't exist!")
self.settingsData = None
except pywikibot.Error:
- # Error? Settings = None
pywikibot.output(
u'Problems with loading the settigs, run without them.')
self.settingsData = None
@@ -1297,7 +1277,8 @@
return True
if template in self.hiddentemplates:
- # if the whitetemplate is not in the images description, we don't care
+ # if the whitetemplate is not in the images description, we don't
+ # care
try:
self.allLicenses.remove(template)
except ValueError:
@@ -1341,10 +1322,8 @@
self.whiteTemplatesFound = False
regex_find_licenses = re.compile(
r'(?<!\{)\{\{(?:[Tt]emplate:|)([^{]+?)[|\n<}]', re.DOTALL)
- # see below to understand the use of this regex
regex_are_licenses = re.compile(
r'(?<!\{)\{\{(?:[Tt]emplate:|)([^{]+?)\}\}', re.DOTALL)
- #dummy_edit = False
while True:
self.hiddentemplates = self.loadHiddenTemplates()
self.licenses_found = self.image.getTemplates()
@@ -1374,20 +1353,8 @@
== self.convert_to_url(
templateReal.title()).lower().replace('template%3a',
''):
- if templateReal not in self.allLicenses: # don't put the same template, twice.
+ if templateReal not in self.allLicenses:
self.allLicenses.append(templateReal)
- # perform a dummy edit, sometimes there are problems with the Job queue
- # it happends that there is listed only the template used and not all the template that are in the templates used in the page
- # for example: there's only self, and not GFDL and the other licenses.
- #if self.allLicenses == self.licenses_found and not dummy_edit and self.licenses_found != []:
- # pywikibot.output(u"Seems that there's a problem regarding the Job queue, trying with a dummy edit to solve the problem.")
- # try:
- # self.imageCheckText = self.image.get()
- # self.image.put(self.imageCheckText, 'Bot: Dummy edit,if you see this comment write [[User talk:%s|here]].' % self.botnick)
- # except (pywikibot.NoPage, pywikibot.IsRedirectPage):
- # return (None, list())
- # dummy_edit = True
- #else:
break
if self.licenses_found:
@@ -1482,7 +1449,8 @@
if skip_number == 1:
pywikibot.output(u'Skipping the first file:\n')
else:
- pywikibot.output(u'Skipping the first %s files:\n' % skip_number)
+ pywikibot.output(u'Skipping the first %s files:\n'
+ % skip_number)
# If we still have pages to skip:
if len(self.skip_list) < skip_number:
pywikibot.output(u'Skipping %s...' % self.imageName)
@@ -1572,7 +1540,6 @@
return True
elif i.lower() in self.imageCheckText:
return True
-
return False # Nothing Found
def findAdditionalProblems(self):
@@ -1654,11 +1621,7 @@
di = u'\n%s' % di
dels = dels % di
- # Page => ImagePage
- # Get the text in the image (called imageCheckText)
try:
- # the checkText will be modified in order to make the check phase
- # easier
self.imageCheckText = self.image.get()
except pywikibot.NoPage:
pywikibot.output(u"Skipping %s because it has been deleted."
@@ -1673,7 +1636,7 @@
regex_pre = re.compile(r'<pre>(.*?)</pre>', re.DOTALL)
self.imageCheckText = regex_nowiki.sub('', self.imageCheckText)
self.imageCheckText = regex_pre.sub('', self.imageCheckText)
- # Deleting the useless template from the description (before adding something
+ # Deleting the useless template from the description (before adding sth
# in the image the original text will be reloaded, don't worry).
if self.isTagged():
printWithTimeZone(u'%s is already tagged...' % self.imageName)
@@ -1731,9 +1694,7 @@
return True
-gbv = Global()
-
-def checkbot():
+def main():
""" Main function """
# Command line configurable parameters
repeat = True # Restart after having check all the images?
@@ -1866,13 +1827,15 @@
# Define the site.
site = pywikibot.getSite()
- # If the images to skip are 0, set the skip variable to False (the same for the wait time)
+ # If the images to skip are 0, set the skip variable to False (the same for
+ # the wait time)
if skip_number == 0:
skip = False
if waitTime == 0:
wait = False
- # A little block-statement to ensure that the bot will not start with en-parameters
+ # A little block-statement to ensure that the bot will not start with
+ # en-parameters
if site.lang not in project_inserted:
pywikibot.output(u"Your project is not supported by this script.\n"
u"You have to edit the script and add it!")
@@ -1885,7 +1848,6 @@
else:
pywikibot.output(u"Retrieving the latest %d files for checking..."
% limit)
- # Main Loop
while True:
# Defing the Main Class.
Bot = checkImagesBot(site, sendemailActive=sendemailActive,
@@ -1909,16 +1871,13 @@
except pywikibot.NoPage:
pywikibot.output(u"%s doesn't exist!" % pageRegex.title())
textRegex = '' # No source, so the bot will quit later.
- # If generator is the regex' one, use your own Generator using an url or page and a regex.
+ # If generator is the regex' one, use your own Generator using an url
+ # or page and a regex.
if generator == 'regex' and regexGen:
generator = Bot.regexGenerator(regexpToUse, textRegex)
- # Ok, We (should) have a generator, so let's go on.
- # Take the additional settings for the Project
+
Bot.takesettings()
- # Not the main, but the most important loop.
- #parsed = False
if wait:
- # Let's sleep...
generator = Bot.wait(waitTime, generator, normal, limit)
generator = pg.NamespaceFilterPageGenerator(generator, 6, site)
for image in generator:
@@ -1939,26 +1898,23 @@
continue
if Bot.checkStep():
continue
- # A little block to perform the repeat or to break.
+
if repeat:
- printWithTimeZone(u"Waiting for %s seconds," % time_sleep)
+ pywikibot.output(u"Waiting for %s seconds," % time_sleep)
time.sleep(time_sleep)
else:
- pywikibot.output(u"\t\t\t>> STOP! <<")
- break # Exit
+ break
-# Main loop will take all the (name of the) images and then i'll check them.
if __name__ == "__main__":
- #timezones are UTC
old = datetime.datetime.strptime(
str(datetime.datetime.utcnow()).split('.')[0], "%Y-%m-%d %H:%M:%S")
try:
- checkbot()
+ main()
finally:
final = datetime.datetime.strptime(
str(datetime.datetime.utcnow()).split('.')[0], "%Y-%m-%d %H:%M:%S")
delta = final - old
secs_of_diff = delta.seconds
- pywikibot.output("Execution time: %s" % secs_of_diff)
+ pywikibot.output("Execution time: %s seconds\n" % secs_of_diff)
pywikibot.stopme()
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10984
Revision: 10984
Author: xqt
Date: 2013-01-25 14:33:32 +0000 (Fri, 25 Jan 2013)
Log Message:
-----------
fix for wait generator
Modified Paths:
--------------
trunk/pywikipedia/checkimages.py
Modified: trunk/pywikipedia/checkimages.py
===================================================================
--- trunk/pywikipedia/checkimages.py 2013-01-25 13:10:38 UTC (rev 10983)
+++ trunk/pywikipedia/checkimages.py 2013-01-25 14:33:32 UTC (rev 10984)
@@ -1500,34 +1500,32 @@
first x seconds.
"""
imagesToSkip = 0
- # if normal, we can take as many images as "limit" has told us, otherwise, sorry, nope.
+ # if normal, we can take as many images as "limit" has told us,
+ # otherwise, sorry, nope.
if normal:
- printWithTimeZone(u'Skipping the files uploaded less than %s seconds ago..' % waitTime)
+ printWithTimeZone(
+ u'Skipping the files uploaded less than %s seconds ago..'
+ % waitTime)
imagesToSkip = 0
while True:
loadOtherImages = True # ensure that all the images loaded aren't to skip!
for image in generator:
- if normal:
- imageData = image
- image = imageData[0]
- #20100511133318L --- 15:33, 11 mag 2010 e 18 sec
- b = str(imageData[1]) # use b as variable to make smaller the timestamp-formula used below..
- # fixing the timestamp to the format that we normally use..
- timestamp = "%s-%s-%sT%s:%s:%sZ" % (b[0:4], b[4:6], b[6:8], b[8:10], b[10:12], b[12:14])
- else:
- #http://pytz.sourceforge.net/ <- maybe useful?
- # '2008-06-18T08:04:29Z'
- timestamp = image.getLatestUploader()[1]
- img_time = datetime.datetime.strptime(timestamp, u"%Y-%m-%dT%H:%M:%SZ") #not relative to localtime
+ timestamp = image.getLatestUploader()[1]
+ img_time = datetime.datetime.strptime(timestamp,
+ u"%Y-%m-%dT%H:%M:%SZ") #not relative to localtime
- now = datetime.datetime.strptime(str(datetime.datetime.utcnow()).split('.')[0], "%Y-%m-%d %H:%M:%S") #timezones are UTC
+ now = datetime.datetime.strptime(
+ str(datetime.datetime.utcnow()).split('.')[0],
+ "%Y-%m-%d %H:%M:%S") #timezones are UTC
# + seconds to be sure that now > img_time
while now < img_time:
now = (now + datetime.timedelta(seconds=1))
delta = now - img_time
secs_of_diff = delta.seconds
if waitTime > secs_of_diff:
- pywikibot.output(u'Skipping %s, uploaded %s seconds ago..' % (image.title(), int(secs_of_diff)))
+ pywikibot.output(
+ u'Skipping %s, uploaded %s seconds ago..'
+ % (image.title(), int(secs_of_diff)))
imagesToSkip += 1
continue # Still wait
else:
@@ -1535,7 +1533,9 @@
break # No ok, continue
# if yes, we have skipped all the images given!
if loadOtherImages:
- generator = self.site.newimages(number = limit, lestart = timestamp)
+ generator = (x[0] for x in
+ self.site.newimages(number=limit,
+ lestart=timestamp))
imagesToSkip = 0
# continue to load images! continue
continue
@@ -1544,26 +1544,17 @@
newGen = list()
imagesToSkip += 1 # some calcs, better add 1
# Add new images, instead of the images skipped
- newImages = self.site.newimages(number = imagesToSkip, lestart = timestamp)
- for imageData in generator:
- if normal:
- image = imageData[0]
- #20100511133318L --- 15:33, 11 mag 2010 e 18 sec
- b = str(imageData[1]) # use b as variable to make smaller the timestamp-formula used below..
- # fixing the timestamp to the format that we normally use..
- timestamp = "%s-%s-%sT%s:%s:%sZ" % (b[0:4], b[4:6], b[6:8], b[8:10], b[10:12], b[12:14])
- uploader = imageData[2]
- comment = imageData[3]
- newGen.append([image, timestamp, uploader, comment])
- else:
- image = imageData
- newGen.append(image)
- num = 0
+ newImages = self.site.newimages(number=imagesToSkip,
+ lestart=timestamp)
+ for image in generator:
+ newGen.append(image)
for imageData in newImages:
- newGen.append(imageData)
+ newGen.append(imageData[0])
return newGen
else:
- pywikibot.output(u"The wait option is available only with the standard generator.")
+ pywikibot.output(
+ u"The wait option is available only with the standard "
+ u"generator.")
return generator
def isTagged(self):
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10983
Revision: 10983
Author: xqt
Date: 2013-01-25 13:10:38 +0000 (Fri, 25 Jan 2013)
Log Message:
-----------
use a set for self.hiddentemplates to prevent excessive memory usage
Modified Paths:
--------------
trunk/pywikipedia/checkimages.py
Modified: trunk/pywikipedia/checkimages.py
===================================================================
--- trunk/pywikipedia/checkimages.py 2013-01-25 12:11:39 UTC (rev 10982)
+++ trunk/pywikipedia/checkimages.py 2013-01-25 13:10:38 UTC (rev 10983)
@@ -574,8 +574,8 @@
self.com = pywikibot.translate(self.site, msg_comm10)
hiddentemplatesRaw = pywikibot.translate(self.site, HiddenTemplate,
fallback=False)
- self.hiddentemplates = [pywikibot.Page(self.site, tmp)
- for tmp in hiddentemplatesRaw]
+ self.hiddentemplates = set([pywikibot.Page(self.site, tmp)
+ for tmp in hiddentemplatesRaw])
self.pageHidden = pywikibot.translate(self.site,
PageWithHiddenTemplates,
fallback=False)
@@ -836,10 +836,11 @@
def loadHiddenTemplates(self):
""" Function to load the white templates """
- # A template as {{en is not a license! Adding also them in the whitelist template...
+ # A template as {{en is not a license! Adding also them in the
+ # whitelist template...
for langK in pywikibot.Family(u'wikipedia').langs.keys():
- self.hiddentemplates.append(pywikibot.Page(self.site,
- u'Template:%s' % langK))
+ self.hiddentemplates.add(pywikibot.Page(self.site,
+ u'Template:%s' % langK))
# The template #if: and #switch: aren't something to care about
#self.hiddentemplates.extend([u'#if:', u'#switch:']) FIXME
@@ -853,7 +854,7 @@
pageHiddenText = ''
for element in self.load(pageHiddenText):
- self.hiddentemplates.append(pywikibot.Page(self.site, element))
+ self.hiddentemplates.add(pywikibot.Page(self.site, element))
return self.hiddentemplates
def returnOlderTime(self, listGiven, timeListGiven):
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10982
Revision: 10982
Author: xqt
Date: 2013-01-25 12:11:39 +0000 (Fri, 25 Jan 2013)
Log Message:
-----------
If we only use re.sub, the object is the same and both aliases self.imageCheckText and self.imageFullText point to image._contents, so we do not need to assign that alias.
Modified Paths:
--------------
trunk/pywikipedia/checkimages.py
Modified: trunk/pywikipedia/checkimages.py
===================================================================
--- trunk/pywikipedia/checkimages.py 2013-01-25 10:48:44 UTC (rev 10981)
+++ trunk/pywikipedia/checkimages.py 2013-01-25 12:11:39 UTC (rev 10982)
@@ -1584,7 +1584,7 @@
return False # Nothing Found
def findAdditionalProblems(self):
- # In every tupla there's a setting configuration
+ # In every tuple there's a setting configuration
for tupla in self.settingsData:
name = tupla[1]
find_tipe = tupla[2]
@@ -1605,7 +1605,8 @@
mexCatched = tupla[8]
for k in find_list:
if find_tipe.lower() == 'findonly':
- searchResults = re.findall(r'%s' % k.lower(), self.imageCheckText.lower())
+ searchResults = re.findall(r'%s' % k.lower(),
+ self.imageCheckText.lower())
if searchResults != []:
if searchResults[0] == self.imageCheckText.lower():
self.some_problem = True
@@ -1664,24 +1665,25 @@
# Page => ImagePage
# Get the text in the image (called imageCheckText)
try:
- # the checkText will be modified in order to make the check phase easier
- # the imageFullText will be used when the full text is needed without changes
+ # the checkText will be modified in order to make the check phase
+ # easier
self.imageCheckText = self.image.get()
- self.imageFullText = self.imageCheckText
except pywikibot.NoPage:
- pywikibot.output(u"Skipping %s because it has been deleted." % self.imageName)
+ pywikibot.output(u"Skipping %s because it has been deleted."
+ % self.imageName)
return True
except pywikibot.IsRedirectPage:
- pywikibot.output(u"Skipping %s because it's a redirect." % self.imageName)
+ pywikibot.output(u"Skipping %s because it's a redirect."
+ % self.imageName)
return True
# Delete the fields where the templates cannot be loaded
regex_nowiki = re.compile(r'<nowiki>(.*?)</nowiki>', re.DOTALL)
regex_pre = re.compile(r'<pre>(.*?)</pre>', re.DOTALL)
- self.imageCheckText = regex_nowiki.sub('', self.imageCheckText); self.imageCheckText = regex_pre.sub('', self.imageCheckText)
+ self.imageCheckText = regex_nowiki.sub('', self.imageCheckText)
+ self.imageCheckText = regex_pre.sub('', self.imageCheckText)
# Deleting the useless template from the description (before adding something
# in the image the original text will be reloaded, don't worry).
if self.isTagged():
- # Tagged? Yes, skip.
printWithTimeZone(u'%s is already tagged...' % self.imageName)
return True
for a_word in something: # something is the array with {{, MIT License and so on.