Pywikipedia-l December 2007

pywikipedia-l@lists.wikimedia.org

26 participants
320 discussions

SVN: [4656] trunk/pywikipedia/catlib.py
by rotem＠svn.wikimedia.org 09 Dec '07

09 Dec '07

Revision: 4656 Author: rotem Date: 2007-12-09 13:28:18 +0000 (Sun, 09 Dec 2007) Log Message: ----------- (patch 1843787) catlib _getContentsAndSupercats performance issue (changed patch to avoid possible problems in caching) Modified Paths: -------------- trunk/pywikipedia/catlib.py Modified: trunk/pywikipedia/catlib.py =================================================================== --- trunk/pywikipedia/catlib.py 2007-12-09 13:25:52 UTC (rev 4655) +++ trunk/pywikipedia/catlib.py 2007-12-09 13:28:18 UTC (rev 4656) @@ -83,7 +83,7 @@ return '[[%s]]' % titleWithSortKey def _getContentsAndSupercats(self, recurse=False, purge=False, - startFrom=None): + startFrom=None, cache=[]): """ Cache results of _parseCategory for a second call. @@ -107,36 +107,46 @@ newrecurse = recurse if self.completelyCached: for article in self.articleCache: - yield ARTICLE, article + if article not in cache: + cache.append(article) + yield ARTICLE, article for subcat in self.subcatCache: - yield SUBCATEGORY, subcat - if recurse: - # contents of subcategory are cached by calling - # this method recursively; therefore, do not cache - # them again - for item in subcat._getContentsAndSupercats(newrecurse, + if subcat not in cache: + cache.append(subcat) + yield SUBCATEGORY, subcat + if recurse: + # contents of subcategory are cached by calling + # this method recursively; therefore, do not cache + # them again + for item in subcat._getContentsAndSupercats(newrecurse, purge): - if item[0] != SUPERCATEGORY: - yield item + if item[0] != SUPERCATEGORY: + yield item for supercat in self.supercatCache: yield SUPERCATEGORY, supercat else: for tag, page in self._parseCategory(purge, startFrom): if tag == ARTICLE: self.articleCache.append(page) + if not page in cache: + cache.append(page) + yield ARTICLE, page elif tag == SUBCATEGORY: self.subcatCache.append(page) - if recurse: - # contents of subcategory are cached by calling - # this method recursively; therefore, do not cache - # them again - for item in page._getContentsAndSupercats(newrecurse, + if not page in cache: + cache.append(page) + yield SUBCATEGORY, page + if recurse: + # contents of subcategory are cached by calling + # this method recursively; therefore, do not cache + # them again + for item in page._getContentsAndSupercats(newrecurse, purge): - if item[0] != SUPERCATEGORY: - yield item + if item[0] != SUPERCATEGORY: + yield item elif tag == SUPERCATEGORY: self.supercatCache.append(page) - yield tag, page + yield SUPERCATEGORY, page if not startFrom: self.completelyCached = True

1 0

SVN: [4655] trunk/pywikipedia/wikipedia.py
by filnik＠svn.wikimedia.org 09 Dec '07

09 Dec '07

Revision: 4655 Author: filnik Date: 2007-12-09 13:25:52 +0000 (Sun, 09 Dec 2007) Log Message: ----------- Bugfix (if repeat is not used it returns only one image) Modified Paths: -------------- trunk/pywikipedia/wikipedia.py Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2007-12-09 13:23:16 UTC (rev 4654) +++ trunk/pywikipedia/wikipedia.py 2007-12-09 13:25:52 UTC (rev 4655) @@ -4227,21 +4227,22 @@ re.UNICODE) pos = 0 seen = list() - ext_list = list() - for m in regexp.finditer(html): - new = m.group('new') - im = m.group('image') - ext = m.group('ext') - # This prevent pages with strange characters. They will be loaded without problem. - image = "%s.%s" % (im, ext) - if new != '': - output(u"Skipping %s because it has been deleted." % image) + ext_list = list() + while True: + for m in regexp.finditer(html): + new = m.group('new') + im = m.group('image') + ext = m.group('ext') + # This prevent pages with strange characters. They will be loaded without problem. + image = "%s.%s" % (im, ext) + if new != '': + output(u"Skipping %s because it has been deleted." % image) + if image not in seen: + seen.append(image) if image not in seen: seen.append(image) - if image not in seen: - seen.append(image) - page = Page(self, 'Image:%s' % image) - yield page + page = Page(self, 'Image:%s' % image) + yield page if not repeat: output(u"\t\t>> All images checked. <<") break

1 0

SVN: [4654] trunk/pywikipedia/pagegenerators.py
by filnik＠svn.wikimedia.org 09 Dec '07

09 Dec '07

Revision: 4654 Author: filnik Date: 2007-12-09 13:23:16 +0000 (Sun, 09 Dec 2007) Log Message: ----------- Adding newImages() in the GenFactory class Modified Paths: -------------- trunk/pywikipedia/pagegenerators.py Modified: trunk/pywikipedia/pagegenerators.py =================================================================== --- trunk/pywikipedia/pagegenerators.py 2007-12-09 13:04:42 UTC (rev 4653) +++ trunk/pywikipedia/pagegenerators.py 2007-12-09 13:23:16 UTC (rev 4654) @@ -772,6 +772,12 @@ if namespace: prefix = prefix[colon+1:] gen = PrefixingPageGenerator(prefix = prefix, namespace = namespace) + elif arg.startswith('-newimages'): + if len(arg) == 10: + limit = wikipedia.input(u'How many images do you want to check?') + else: + limit = arg[11:] + gen = newImages(limit, wikipedia.getSite()) elif arg.startswith('-new'): if len(arg) >=5: gen = NewpagesPageGenerator(number = int(arg[5:]))

1 0

SVN: [4653] trunk/pywikipedia/wikipedia.py
by filnik＠svn.wikimedia.org 09 Dec '07

09 Dec '07

Revision: 4653 Author: filnik Date: 2007-12-09 13:04:42 +0000 (Sun, 09 Dec 2007) Log Message: ----------- Bugfix (wikipedia.output() is only output() in the wikipedia.py file...) Modified Paths: -------------- trunk/pywikipedia/wikipedia.py Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2007-12-09 12:28:55 UTC (rev 4652) +++ trunk/pywikipedia/wikipedia.py 2007-12-09 13:04:42 UTC (rev 4653) @@ -4235,7 +4235,7 @@ # This prevent pages with strange characters. They will be loaded without problem. image = "%s.%s" % (im, ext) if new != '': - wikipedia.output(u"Skipping %s because it has been deleted." % image) + output(u"Skipping %s because it has been deleted." % image) if image not in seen: seen.append(image) if image not in seen: @@ -4243,7 +4243,7 @@ page = Page(self, 'Image:%s' % image) yield page if not repeat: - wikipedia.output(u"\t\t>> All images checked. <<") + output(u"\t\t>> All images checked. <<") break def uncategorizedimages(self, number = 10, repeat = False):

1 0

SVN: [4652] trunk/pywikipedia/blockpageschecker.py
by filnik＠svn.wikimedia.org 09 Dec '07

09 Dec '07

Revision: 4652 Author: filnik Date: 2007-12-09 12:28:55 +0000 (Sun, 09 Dec 2007) Log Message: ----------- Adding documentation and a check-block for people that don't set their preferences Modified Paths: -------------- trunk/pywikipedia/blockpageschecker.py Modified: trunk/pywikipedia/blockpageschecker.py =================================================================== --- trunk/pywikipedia/blockpageschecker.py 2007-12-09 12:10:26 UTC (rev 4651) +++ trunk/pywikipedia/blockpageschecker.py 2007-12-09 12:28:55 UTC (rev 4652) @@ -12,8 +12,13 @@ Note: This script uses also genfactory, you can use those generator as default. -Example of how to use the script: +--- Warning! --- +You have to edit this script in order to add your preferences otherwise the script won't work! +If you have problems, ask on botwiki ( http://botwiki.sno.cc ) or on IRC (#pywikipediabot) + +--- Example of how to use the script --- + python blockpageschecker.py -always python blockpageschecker.py -cat:Geography -always @@ -29,9 +34,13 @@ # import re -import wikipedia, catlib, pagegenerators +import wikipedia, catlib, pagegenerators, config -# Use only regex! +####################################################### +#--------------------- PREFERENCES -------------------# +################### -- Edit below! -- ################# + +# Use only regex! 
- Regex to delete the template templateToRemove = { 'en':[r'\{\{(?:[Tt]emplate:|)[Pp]p-protected\}\}', r'{\{([Tt]emplate:|)[Pp]p-dispute\}\}', r'{\{(?:[Tt]emplate:|)[Pp]p-template\}\}', r'{\{([Tt]emplate:|)[Pp]p-usertalk\}\}'], @@ -41,21 +50,31 @@ ], 'it':[r'{\{(?:[Tt]emplate:|)[Aa]vvisobloccoparziale(?:|[ _]scad\|(.*?))\}\}', r'{\{(?:[Tt]emplate:|)[Aa]vvisoblocco(?:|[ _]scad\|(?:.*?))\}\}'], } +# Category where the bot will check categoryToCheck = { 'en':[u'Category:Protected'], 'fr':[u'Category:Page semi-protégée', u'Category:Page protégée'], 'it':[u'Categoria:Pagine semiprotette', u'Categoria:Voci_protette'], } - +# Comment used when the Bot edits comment = { 'en':u'Bot: Deleting out-dated template', 'fr':u'Robot : Retrait du bandeau protection/semi-protection d\'une page qui ne l\'es plus', 'it':u'Bot: Tolgo template di avviso blocco scaduto', } +# Check list to block the users that haven't set their preferences +project_inserted = ['en', 'fr', 'it'] +####################################################### +#------------------ END PREFERENCES ------------------# +################## -- Edit above! -- ################## + def main(): # Loading the comments - global templateToRemove; global categoryToCheck; global comment + global templateToRemove; global categoryToCheck; global comment; global project_inserted + if config.mylang not in project_inserted: + wikipedia.output(u"Your project is not supported by this script. You have to edit the script and add it!") + wikipedia.stopme() # always, define a generator to understand if the user sets one, defining what's genFactory always = False; generator = False; genFactory = pagegenerators.GeneratorFactory() # To prevent Infinite loops

1 0

SVN: [4651] trunk/pywikipedia/pagegenerators.py
by filnik＠svn.wikimedia.org 09 Dec '07

09 Dec '07

Revision: 4651 Author: filnik Date: 2007-12-09 12:10:26 +0000 (Sun, 09 Dec 2007) Log Message: ----------- Adding a new generator, newImages() Modified Paths: -------------- trunk/pywikipedia/pagegenerators.py Modified: trunk/pywikipedia/pagegenerators.py =================================================================== --- trunk/pywikipedia/pagegenerators.py 2007-12-09 12:09:34 UTC (rev 4650) +++ trunk/pywikipedia/pagegenerators.py 2007-12-09 12:10:26 UTC (rev 4651) @@ -195,6 +195,12 @@ for page in site.uncategorizedimages(number=number, repeat=repeat): yield page +def newImages(limit = 50, site = None, repeat = False): + if site is None: + site = wikipedia.getSite() + for page in site.newImages(limit, repeat=repeat): + yield page + def UnCategorizedPageGenerator(number = 100, repeat = False, site = None): if site is None: site = wikipedia.getSite()

1 0

SVN: [4650] trunk/pywikipedia/wikipedia.py
by filnik＠svn.wikimedia.org 09 Dec '07

09 Dec '07

Revision: 4650 Author: filnik Date: 2007-12-09 12:09:34 +0000 (Sun, 09 Dec 2007) Log Message: ----------- Adding a new generator, newImages() Modified Paths: -------------- trunk/pywikipedia/wikipedia.py Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2007-12-09 11:49:32 UTC (rev 4649) +++ trunk/pywikipedia/wikipedia.py 2007-12-09 12:09:34 UTC (rev 4650) @@ -3468,6 +3468,7 @@ search(query): query results from Special:Search allpages(): Special:Allpages newpages(): Special:Newpages + newImages(): Special:Log&type=upload longpages(): Special:Longpages shortpages(): Special:Shortpages categories(): Special:Categories (yields Category objects) @@ -4214,6 +4215,37 @@ if not repeat: break + def newImages(self, limit = 50, repeat = False): + """Yield ImagePages from Special:Log&type=upload""" + # Url of the new images + url = "/w/index.php?title=Special:Log&type=upload&user=&page=&pattern=&limit=%d&offset=0" % int(limit) + # Get the HTML text + html = self.getUrl(url) + image_namespace = self.image_namespace() + regexp = re.compile( + r'(?P<new>class=\"new\" |)title=\"%s:(?P<image>.*?)\.(?P<ext>\w\w\w|jpeg)\">.*?</a>\".*?<span class=\"comment\">' % image_namespace, + re.UNICODE) + pos = 0 + seen = list() + ext_list = list() + for m in regexp.finditer(html): + new = m.group('new') + im = m.group('image') + ext = m.group('ext') + # This prevent pages with strange characters. They will be loaded without problem. + image = "%s.%s" % (im, ext) + if new != '': + wikipedia.output(u"Skipping %s because it has been deleted." % image) + if image not in seen: + seen.append(image) + if image not in seen: + seen.append(image) + page = Page(self, 'Image:%s' % image) + yield page + if not repeat: + wikipedia.output(u"\t\t>> All images checked. <<") + break + def uncategorizedimages(self, number = 10, repeat = False): """Yield ImagePages from Special:Uncategorizedimages.""" seen = set()

1 0

[ pywikipediabot-Bugs-1843759 ] [patch] image.py doesn't work
by SourceForge.net 09 Dec '07

09 Dec '07

Bugs item #1843759, was opened at 2007-12-04 02:38 Message generated for change (Comment added) made by rotemliss You can respond by visiting: https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1843759&group_… Please note that this message will contain a full copy of the comment thread, including the initial issue submission, for this request, not just the latest update. Category: other Group: None Status: Open Resolution: None Priority: 5 Private: No Submitted By: Pietro Battiston (toobaz) Assigned to: Nobody/Anonymous (nobody) Summary: [patch] image.py doesn't work Initial Comment: The following command: python image.py pippo.png gives the following output: Checked for running processes. 1 processes currently running, including the current process. 'Page' object has no attribute 'usingPages' The problem is that oldImagePage is an instance of wikipedia.Page instead of wikipedia.ImagePage. I attach a very simple patch that fixes it. Then, everything works. ---------------------------------------------------------------------- Comment By: Rotem Liss (rotemliss) Date: 2007-12-09 13:52 Message: Logged In: YES user_id=1327030 Originator: NO Fixed in r4649. ---------------------------------------------------------------------- You can respond by visiting: https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1843759&group_…

1 0

SVN: [4649] trunk/pywikipedia/image.py
by rotem＠svn.wikimedia.org 09 Dec '07

09 Dec '07

Revision: 4649 Author: rotem Date: 2007-12-09 11:49:32 +0000 (Sun, 09 Dec 2007) Log Message: ----------- (bug 1843759) Using ImagePage rather than Page in image.py Modified Paths: -------------- trunk/pywikipedia/image.py Modified: trunk/pywikipedia/image.py =================================================================== --- trunk/pywikipedia/image.py 2007-12-09 11:41:22 UTC (rev 4648) +++ trunk/pywikipedia/image.py 2007-12-09 11:49:32 UTC (rev 4649) @@ -155,7 +155,7 @@ mysite = wikipedia.getSite() ns = mysite.image_namespace() - oldImagePage = wikipedia.Page(mysite, ns + ':' + oldImage) + oldImagePage = wikipedia.ImagePage(mysite, ns + ':' + oldImage) gen = pagegenerators.FileLinksGenerator(oldImagePage) preloadingGen = pagegenerators.PreloadingGenerator(gen)

1 0

SVN: [4648] trunk/pywikipedia/blockpageschecker.py
by filnik＠svn.wikimedia.org 09 Dec '07

09 Dec '07

Revision: 4648 Author: filnik Date: 2007-12-09 11:41:22 +0000 (Sun, 09 Dec 2007) Log Message: ----------- Adding comments in the code and adding two example of how to use the script Modified Paths: -------------- trunk/pywikipedia/blockpageschecker.py Modified: trunk/pywikipedia/blockpageschecker.py =================================================================== --- trunk/pywikipedia/blockpageschecker.py 2007-12-09 11:40:15 UTC (rev 4647) +++ trunk/pywikipedia/blockpageschecker.py 2007-12-09 11:41:22 UTC (rev 4648) @@ -10,8 +10,14 @@ -always Doesn't ask every time if the bot should make the change or not, do it always. -page Work only on one page -Note: This script uses also genfactory, you can use these generator as default. +Note: This script uses also genfactory, you can use those generator as default. +Example of how to use the script: + +python blockpageschecker.py -always + +python blockpageschecker.py -cat:Geography -always + """ # # (C) Wikihermit, 2007 @@ -26,7 +32,6 @@ import wikipedia, catlib, pagegenerators # Use only regex! -#fr regexes added by Darkoneko 09 oct 07, THEY ARE UNTESTED at the moment, please check ! templateToRemove = { 'en':[r'\{\{(?:[Tt]emplate:|)[Pp]p-protected\}\}', r'{\{([Tt]emplate:|)[Pp]p-dispute\}\}', r'{\{(?:[Tt]emplate:|)[Pp]p-template\}\}', r'{\{([Tt]emplate:|)[Pp]p-usertalk\}\}'], @@ -49,12 +54,11 @@ } def main(): - global templateToRemove - global categoryToCheck - global comment - always = False - generator = False - genFactory = pagegenerators.GeneratorFactory() + # Loading the comments + global templateToRemove; global categoryToCheck; global comment + # always, define a generator to understand if the user sets one, defining what's genFactory + always = False; generator = False; genFactory = pagegenerators.GeneratorFactory() + # To prevent Infinite loops errorCount = 0 # Loading the default options. 
for arg in wikipedia.handleArgs(): @@ -73,16 +77,19 @@ TTR = wikipedia.translate(site, templateToRemove) category = wikipedia.translate(site, categoryToCheck) commentUsed = wikipedia.translate(site, comment) - # Define the category if not generator: + # Define the category if no other generator has been setted for CAT in category: cat = catlib.Category(site, CAT) # Define the generator generator = pagegenerators.CategorizedPageGenerator(cat) + # Main Loop for page in generator: pagename = page.title() wikipedia.output('Loading %s...' % pagename) try: + # The same as .get() but it loads also the editRestriction var, that's what we + # need to understand if the page is protected or not. (text, useless, editRestriction) = page._getEditPage() except wikipedia.NoPage: wikipedia.output("%s doesn't exist! Skipping..." % pagename) @@ -98,9 +105,11 @@ wikipedia.output(u'The page is editable for all, deleting the template...') # Only to see if the text is the same or not... oldtext = text + # Deleting the template because the page doesn't need it. for replaceToPerform in TTR: text = re.sub(replaceToPerform, '', text) if oldtext != text: + # Ok, asking if the change has to be performed and do it. wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title()) wikipedia.showDiff(oldtext, text) choice = '' @@ -118,12 +127,15 @@ wikipedia.output(u'Edit conflict! skip!') break except wikipedia.ServerError: + # Sometimes there is this error that's quite annoying because + # can block the whole process for nothing. errorCount += 1 if errorCount < 5: wikipedia.output(u'Server Error! Wait..') time.sleep(3) continue else: + # Prevent Infinite Loops raise wikipedia.ServerError(u'Fifth Server Error!') except wikipedia.SpamfilterError, e: wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url)) @@ -135,7 +147,7 @@ wikipedia.output(u'The page is still protected. Skipping...') break else: - # Break only if the errors are one after the other... + # Break only if the errors are one after the other errorCount = 0 break if __name__ == "__main__":

1 0

← Newer
1
...
23
24
25
26
27
28
29
...
32
Older →

Jump to page:

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

2008

2007

Pywikipedia-l December 2007