Revision: 4656
Author: rotem
Date: 2007-12-09 13:28:18 +0000 (Sun, 09 Dec 2007)
Log Message:
-----------
(patch 1843787) catlib _getContentsAndSupercats performance issue (changed patch to avoid possible problems in caching)
Modified Paths:
--------------
trunk/pywikipedia/catlib.py
Modified: trunk/pywikipedia/catlib.py
===================================================================
--- trunk/pywikipedia/catlib.py 2007-12-09 13:25:52 UTC (rev 4655)
+++ trunk/pywikipedia/catlib.py 2007-12-09 13:28:18 UTC (rev 4656)
@@ -83,7 +83,7 @@
return '[[%s]]' % titleWithSortKey
def _getContentsAndSupercats(self, recurse=False, purge=False,
- startFrom=None):
+ startFrom=None, cache=[]):
"""
Cache results of _parseCategory for a second call.
@@ -107,36 +107,46 @@
newrecurse = recurse
if self.completelyCached:
for article in self.articleCache:
- yield ARTICLE, article
+ if article not in cache:
+ cache.append(article)
+ yield ARTICLE, article
for subcat in self.subcatCache:
- yield SUBCATEGORY, subcat
- if recurse:
- # contents of subcategory are cached by calling
- # this method recursively; therefore, do not cache
- # them again
- for item in subcat._getContentsAndSupercats(newrecurse,
+ if subcat not in cache:
+ cache.append(subcat)
+ yield SUBCATEGORY, subcat
+ if recurse:
+ # contents of subcategory are cached by calling
+ # this method recursively; therefore, do not cache
+ # them again
+ for item in subcat._getContentsAndSupercats(newrecurse,
purge):
- if item[0] != SUPERCATEGORY:
- yield item
+ if item[0] != SUPERCATEGORY:
+ yield item
for supercat in self.supercatCache:
yield SUPERCATEGORY, supercat
else:
for tag, page in self._parseCategory(purge, startFrom):
if tag == ARTICLE:
self.articleCache.append(page)
+ if not page in cache:
+ cache.append(page)
+ yield ARTICLE, page
elif tag == SUBCATEGORY:
self.subcatCache.append(page)
- if recurse:
- # contents of subcategory are cached by calling
- # this method recursively; therefore, do not cache
- # them again
- for item in page._getContentsAndSupercats(newrecurse,
+ if not page in cache:
+ cache.append(page)
+ yield SUBCATEGORY, page
+ if recurse:
+ # contents of subcategory are cached by calling
+ # this method recursively; therefore, do not cache
+ # them again
+ for item in page._getContentsAndSupercats(newrecurse,
purge):
- if item[0] != SUPERCATEGORY:
- yield item
+ if item[0] != SUPERCATEGORY:
+ yield item
elif tag == SUPERCATEGORY:
self.supercatCache.append(page)
- yield tag, page
+ yield SUPERCATEGORY, page
if not startFrom:
self.completelyCached = True
Revision: 4655
Author: filnik
Date: 2007-12-09 13:25:52 +0000 (Sun, 09 Dec 2007)
Log Message:
-----------
Bugfix (if repeat is not used it returns only one image)
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-12-09 13:23:16 UTC (rev 4654)
+++ trunk/pywikipedia/wikipedia.py 2007-12-09 13:25:52 UTC (rev 4655)
@@ -4227,21 +4227,22 @@
re.UNICODE)
pos = 0
seen = list()
- ext_list = list()
- for m in regexp.finditer(html):
- new = m.group('new')
- im = m.group('image')
- ext = m.group('ext')
- # This prevent pages with strange characters. They will be loaded without problem.
- image = "%s.%s" % (im, ext)
- if new != '':
- output(u"Skipping %s because it has been deleted." % image)
+ ext_list = list()
+ while True:
+ for m in regexp.finditer(html):
+ new = m.group('new')
+ im = m.group('image')
+ ext = m.group('ext')
+ # This prevent pages with strange characters. They will be loaded without problem.
+ image = "%s.%s" % (im, ext)
+ if new != '':
+ output(u"Skipping %s because it has been deleted." % image)
+ if image not in seen:
+ seen.append(image)
if image not in seen:
seen.append(image)
- if image not in seen:
- seen.append(image)
- page = Page(self, 'Image:%s' % image)
- yield page
+ page = Page(self, 'Image:%s' % image)
+ yield page
if not repeat:
output(u"\t\t>> All images checked. <<")
break
Revision: 4653
Author: filnik
Date: 2007-12-09 13:04:42 +0000 (Sun, 09 Dec 2007)
Log Message:
-----------
Bugfix (wikipedia.output() is only output() in the wikipedia.py file...)
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-12-09 12:28:55 UTC (rev 4652)
+++ trunk/pywikipedia/wikipedia.py 2007-12-09 13:04:42 UTC (rev 4653)
@@ -4235,7 +4235,7 @@
# This prevent pages with strange characters. They will be loaded without problem.
image = "%s.%s" % (im, ext)
if new != '':
- wikipedia.output(u"Skipping %s because it has been deleted." % image)
+ output(u"Skipping %s because it has been deleted." % image)
if image not in seen:
seen.append(image)
if image not in seen:
@@ -4243,7 +4243,7 @@
page = Page(self, 'Image:%s' % image)
yield page
if not repeat:
- wikipedia.output(u"\t\t>> All images checked. <<")
+ output(u"\t\t>> All images checked. <<")
break
def uncategorizedimages(self, number = 10, repeat = False):
Revision: 4652
Author: filnik
Date: 2007-12-09 12:28:55 +0000 (Sun, 09 Dec 2007)
Log Message:
-----------
Adding documentation and a check-block for people who don't set their preferences
Modified Paths:
--------------
trunk/pywikipedia/blockpageschecker.py
Modified: trunk/pywikipedia/blockpageschecker.py
===================================================================
--- trunk/pywikipedia/blockpageschecker.py 2007-12-09 12:10:26 UTC (rev 4651)
+++ trunk/pywikipedia/blockpageschecker.py 2007-12-09 12:28:55 UTC (rev 4652)
@@ -12,8 +12,13 @@
Note: This script uses also genfactory, you can use those generator as default.
-Example of how to use the script:
+--- Warning! ---
+You have to edit this script in order to add your preferences otherwise the script won't work!
+If you have problems, ask on botwiki ( http://botwiki.sno.cc ) or on IRC (#pywikipediabot)
+
+--- Example of how to use the script ---
+
python blockpageschecker.py -always
python blockpageschecker.py -cat:Geography -always
@@ -29,9 +34,13 @@
#
import re
-import wikipedia, catlib, pagegenerators
+import wikipedia, catlib, pagegenerators, config
-# Use only regex!
+#######################################################
+#--------------------- PREFERENCES -------------------#
+################### -- Edit below! -- #################
+
+# Use only regex! - Regex to delete the template
templateToRemove = {
'en':[r'\{\{(?:[Tt]emplate:|)[Pp]p-protected\}\}', r'{\{([Tt]emplate:|)[Pp]p-dispute\}\}',
r'{\{(?:[Tt]emplate:|)[Pp]p-template\}\}', r'{\{([Tt]emplate:|)[Pp]p-usertalk\}\}'],
@@ -41,21 +50,31 @@
],
'it':[r'{\{(?:[Tt]emplate:|)[Aa]vvisobloccoparziale(?:|[ _]scad\|(.*?))\}\}', r'{\{(?:[Tt]emplate:|)[Aa]vvisoblocco(?:|[ _]scad\|(?:.*?))\}\}'],
}
+# Category where the bot will check
categoryToCheck = {
'en':[u'Category:Protected'],
'fr':[u'Category:Page semi-protégée', u'Category:Page protégée'],
'it':[u'Categoria:Pagine semiprotette', u'Categoria:Voci_protette'],
}
-
+# Comment used when the Bot edits
comment = {
'en':u'Bot: Deleting out-dated template',
'fr':u'Robot : Retrait du bandeau protection/semi-protection d\'une page qui ne l\'es plus',
'it':u'Bot: Tolgo template di avviso blocco scaduto',
}
+# Check list to block the users that haven't set their preferences
+project_inserted = ['en', 'fr', 'it']
+#######################################################
+#------------------ END PREFERENCES ------------------#
+################## -- Edit above! -- ##################
+
def main():
# Loading the comments
- global templateToRemove; global categoryToCheck; global comment
+ global templateToRemove; global categoryToCheck; global comment; global project_inserted
+ if config.mylang not in project_inserted:
+ wikipedia.output(u"Your project is not supported by this script. You have to edit the script and add it!")
+ wikipedia.stopme()
# always, define a generator to understand if the user sets one, defining what's genFactory
always = False; generator = False; genFactory = pagegenerators.GeneratorFactory()
# To prevent Infinite loops
Bugs item #1843759, was opened at 2007-12-04 02:38
Message generated for change (Comment added) made by rotemliss
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1843759&group_id=93107
Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: other
Group: None
Status: Open
Resolution: None
Priority: 5
Private: No
Submitted By: Pietro Battiston (toobaz)
Assigned to: Nobody/Anonymous (nobody)
Summary: [patch] image.py doesn't work
Initial Comment:
The following command:
python image.py pippo.png
gives the following output:
Checked for running processes. 1 processes currently running, including the current process.
'Page' object has no attribute 'usingPages'
The problem is that oldImagePage is an instance of wikipedia.Page instead of wikipedia.ImagePage.
I attach a very simple patch that fixes it. Then, everything works.
----------------------------------------------------------------------
Comment By: Rotem Liss (rotemliss)
Date: 2007-12-09 13:52
Message:
Logged In: YES
user_id=1327030
Originator: NO
Fixed in r4649.
----------------------------------------------------------------------
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1843759&group_id=93107
Revision: 4648
Author: filnik
Date: 2007-12-09 11:41:22 +0000 (Sun, 09 Dec 2007)
Log Message:
-----------
Adding comments in the code and adding two examples of how to use the script
Modified Paths:
--------------
trunk/pywikipedia/blockpageschecker.py
Modified: trunk/pywikipedia/blockpageschecker.py
===================================================================
--- trunk/pywikipedia/blockpageschecker.py 2007-12-09 11:40:15 UTC (rev 4647)
+++ trunk/pywikipedia/blockpageschecker.py 2007-12-09 11:41:22 UTC (rev 4648)
@@ -10,8 +10,14 @@
-always Doesn't ask every time if the bot should make the change or not, do it always.
-page Work only on one page
-Note: This script uses also genfactory, you can use these generator as default.
+Note: This script uses also genfactory, you can use those generator as default.
+Example of how to use the script:
+
+python blockpageschecker.py -always
+
+python blockpageschecker.py -cat:Geography -always
+
"""
#
# (C) Wikihermit, 2007
@@ -26,7 +32,6 @@
import wikipedia, catlib, pagegenerators
# Use only regex!
-#fr regexes added by Darkoneko 09 oct 07, THEY ARE UNTESTED at the moment, please check !
templateToRemove = {
'en':[r'\{\{(?:[Tt]emplate:|)[Pp]p-protected\}\}', r'{\{([Tt]emplate:|)[Pp]p-dispute\}\}',
r'{\{(?:[Tt]emplate:|)[Pp]p-template\}\}', r'{\{([Tt]emplate:|)[Pp]p-usertalk\}\}'],
@@ -49,12 +54,11 @@
}
def main():
- global templateToRemove
- global categoryToCheck
- global comment
- always = False
- generator = False
- genFactory = pagegenerators.GeneratorFactory()
+ # Loading the comments
+ global templateToRemove; global categoryToCheck; global comment
+ # always, define a generator to understand if the user sets one, defining what's genFactory
+ always = False; generator = False; genFactory = pagegenerators.GeneratorFactory()
+ # To prevent Infinite loops
errorCount = 0
# Loading the default options.
for arg in wikipedia.handleArgs():
@@ -73,16 +77,19 @@
TTR = wikipedia.translate(site, templateToRemove)
category = wikipedia.translate(site, categoryToCheck)
commentUsed = wikipedia.translate(site, comment)
- # Define the category
if not generator:
+ # Define the category if no other generator has been setted
for CAT in category:
cat = catlib.Category(site, CAT)
# Define the generator
generator = pagegenerators.CategorizedPageGenerator(cat)
+ # Main Loop
for page in generator:
pagename = page.title()
wikipedia.output('Loading %s...' % pagename)
try:
+ # The same as .get() but it loads also the editRestriction var, that's what we
+ # need to understand if the page is protected or not.
(text, useless, editRestriction) = page._getEditPage()
except wikipedia.NoPage:
wikipedia.output("%s doesn't exist! Skipping..." % pagename)
@@ -98,9 +105,11 @@
wikipedia.output(u'The page is editable for all, deleting the template...')
# Only to see if the text is the same or not...
oldtext = text
+ # Deleting the template because the page doesn't need it.
for replaceToPerform in TTR:
text = re.sub(replaceToPerform, '', text)
if oldtext != text:
+ # Ok, asking if the change has to be performed and do it.
wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
wikipedia.showDiff(oldtext, text)
choice = ''
@@ -118,12 +127,15 @@
wikipedia.output(u'Edit conflict! skip!')
break
except wikipedia.ServerError:
+ # Sometimes there is this error that's quite annoying because
+ # can block the whole process for nothing.
errorCount += 1
if errorCount < 5:
wikipedia.output(u'Server Error! Wait..')
time.sleep(3)
continue
else:
+ # Prevent Infinite Loops
raise wikipedia.ServerError(u'Fifth Server Error!')
except wikipedia.SpamfilterError, e:
wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url))
@@ -135,7 +147,7 @@
wikipedia.output(u'The page is still protected. Skipping...')
break
else:
- # Break only if the errors are one after the other...
+ # Break only if the errors are one after the other
errorCount = 0
break
if __name__ == "__main__":