Revision: 6779
Author: cosoleto
Date: 2009-04-30 15:27:26 +0000 (Thu, 30 Apr 2009)
Log Message:
-----------
Changed the default number of pages returned by RandomPageGenerator and RandomRedirectPageGenerator. 10 pages seems a fairer default than 100, since these generators do not use Special:Export or the API to fetch data: each page costs a separate HTTP request.
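A minimal usage sketch (not part of the commit; assumes the trunk modules and the codebase's Python 2 idioms):

    import wikipedia, pagegenerators

    # With the new default, this loop fetches Special:Random ten times,
    # one HTTP request per page.
    for page in pagegenerators.RandomPageGenerator():
        wikipedia.output(page.title())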
Modified Paths:
--------------
trunk/pywikipedia/pagegenerators.py
Modified: trunk/pywikipedia/pagegenerators.py
===================================================================
--- trunk/pywikipedia/pagegenerators.py 2009-04-30 14:55:19 UTC (rev 6778)
+++ trunk/pywikipedia/pagegenerators.py 2009-04-30 15:27:26 UTC (rev 6779)
@@ -125,12 +125,12 @@
-random Work on random pages returned by [[Special:Random]].
Can also be given as "-random:n" where n is the number
- of pages to be returned, else 100 pages are returned.
+ of pages to be returned, else 10 pages are returned.
-randomredirect Work on random redirect target pages returned by
[[Special:Randomredirect]]. Can also be given as
"-randomredirect:n" where n is the number of pages to be
- returned, else 100 pages are returned.
+ returned, else 10 pages are returned.
-gorandom Specifies that the robot should start at the random pages
returned by [[Special:Random]].
@@ -393,13 +393,13 @@
for page in linkingPage.linkedPages():
yield page
-def RandomPageGenerator(number = 100, site = None):
+def RandomPageGenerator(number = 10, site = None):
if site is None:
site = wikipedia.getSite()
for i in range(number):
yield site.randompage()
-def RandomRedirectPageGenerator(number = 100, site = None):
+def RandomRedirectPageGenerator(number = 10, site = None):
if site is None:
site = wikipedia.getSite()
for i in range(number):
Revision: 6778
Author: cosoleto
Date: 2009-04-30 14:55:19 +0000 (Thu, 30 Apr 2009)
Log Message:
-----------
Rewrote site.randompages() and site.randomredirectpages(), renamed them to randompage() and randomredirectpage(), and changed the related generators accordingly. The functions now return a single page each, since these Site methods should mirror MediaWiki's 'Special:' pages. They return as soon as possible instead of sleeping one second on top of the download time (I could not reproduce that problem using a very fast line). Deleted useless parts and duplicated code (repeat, set...).
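A caller-side sketch of the new contract (illustrative only): each call fetches the special page once and returns a single Page, and the old duplicate filtering is gone.

    import wikipedia

    site = wikipedia.getSite()
    for i in range(3):
        # One request per call; no one-second sleep and no 'seen' set,
        # so the same page can come back twice.
        page = site.randompage()
        if page is not None:   # None if wgPageName could not be parsed
            wikipedia.output(page.title())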
Modified Paths:
--------------
trunk/pywikipedia/pagegenerators.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/pagegenerators.py
===================================================================
--- trunk/pywikipedia/pagegenerators.py 2009-04-30 13:48:48 UTC (rev 6777)
+++ trunk/pywikipedia/pagegenerators.py 2009-04-30 14:55:19 UTC (rev 6778)
@@ -393,17 +393,17 @@
for page in linkingPage.linkedPages():
yield page
-def RandomPageGenerator(number = 100, repeat = False, site = None):
+def RandomPageGenerator(number = 100, site = None):
if site is None:
site = wikipedia.getSite()
- for page in site.randompages(number=number, repeat=repeat):
- yield page
+ for i in range(number):
+ yield site.randompage()
-def RandomRedirectPageGenerator(number = 100, repeat = False, site = None):
+def RandomRedirectPageGenerator(number = 100, site = None):
if site is None:
site = wikipedia.getSite()
- for page in site.randomredirectpages(number=number, repeat=repeat):
- yield page
+ for i in range(number):
+ yield site.randomredirectpage()
def TextfilePageGenerator(filename=None, site=None):
'''
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2009-04-30 13:48:48 UTC (rev 6777)
+++ trunk/pywikipedia/wikipedia.py 2009-04-30 14:55:19 UTC (rev 6778)
@@ -4270,8 +4270,8 @@
ImagePage objects)
unusedcategories(): Special:Unusedcategories (yields Category)
unusedfiles(): Special:Unusedimages (yields ImagePage)
- randompages: Special:Random
- randomredirectpages: Special:Random
+ randompage: Special:Random
+ randomredirectpage: Special:RandomRedirect
withoutinterwiki: Special:Withoutinterwiki
linksearch: Special:Linksearch
@@ -5434,56 +5434,20 @@
if not repeat:
break
- def randompages(self, number=1, repeat=False, randomredirect=False):
- """Yield random pages via Special:Random, or Special:RandomRedirect."""
- seen = set()
- if randomredirect:
- path = self.randomredirect_address()
- else:
- path = self.random_address()
- entryR = re.compile('var wgPageName = "(?P<title>.+?)";')
- while True:
- for ignored in range(number):
- # MediaWiki advances its random pages only every second.
- time.sleep(1)
- html = self.getUrl(path)
- # output(u' html=%s' % (html))
- m = entryR.search(html)
- if m is not None:
- title = m.group('title')
- # output(u' title=%s' % ( title ))
- if title not in seen:
- seen.add(title)
- yield Page(self, title)
- if not repeat:
- break
+ def randompage(self):
+ """Yield random page via Special:Random"""
+ html = self.getUrl(self.random_address())
+ m = re.search('var wgPageName = "(?P<title>.+?)";', html)
+ if m is not None:
+ return Page(self, m.group('title'))
- def randomredirectpages(self, number=1, repeat=False, randomredirect=True):
- """Yield random pages via Special:Random, or Special:RandomRedirect."""
- seen = set()
- if randomredirect:
- path = self.randomredirect_address()
- else:
- path = self.random_address()
- entryR = re.compile('var wgPageName = "(?P<title>.+?)";')
- while True:
- for ignored in range(number):
- # MediaWiki advances its random pages only every second.
- time.sleep(1)
- html = self.getUrl(path)
- # output(u' html=%s' % (html))
- m = entryR.search(html)
- if m is not None:
- title = m.group('title')
- # output(u' title=%s' % ( title ))
- if title not in seen:
- seen.add(title)
- page = Page(self, title)
- yield page
- if not repeat:
- break
+ def randomredirectpage(self):
+ """Yield random redirect page via Special:RandomRedirect."""
+ html = self.getUrl(self.randomredirect_address())
+ m = re.search('var wgPageName = "(?P<title>.+?)";', html)
+ if m is not None:
+ return Page(self, m.group('title'))
-
def allpages(self, start='!', namespace=None, includeredirects=True,
throttle=True):
"""
Revision: 6777
Author: nicdumz
Date: 2009-04-30 13:48:48 +0000 (Thu, 30 Apr 2009)
Log Message:
-----------
Being bold: languages_by_size should contain all the site codes.
This fixes bug [ 2783407 ], "-limittwo option does not work correctly": building the candidate list as an unordered set discarded the by-size ordering, so the "top available" wiki was picked arbitrarily.
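A sketch of the failure mode (hypothetical codes, not from the commit): languages_by_size is ordered by article count, largest first, and a set discards that order.

    languages_by_size = ['en', 'de', 'fr']             # ordered by size
    langToCheck = set(languages_by_size) | set(['nds'])
    for siteCode in langToCheck:
        print siteCode                                 # arbitrary order

Iterating lclSite.family.languages_by_size directly keeps the ordering, so the first matching foreign site really is the largest available one.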
Modified Paths:
--------------
trunk/pywikipedia/interwiki.py
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2009-04-30 12:54:17 UTC (rev 6776)
+++ trunk/pywikipedia/interwiki.py 2009-04-30 13:48:48 UTC (rev 6777)
@@ -211,7 +211,7 @@
-localonly only work on the local wiki, not on other wikis in the family
I have a login at. (note: without ending colon)
- -limittwo only update two pages - one in the local wiki (if loged-in),
+ -limittwo only update two pages - one in the local wiki (if logged-in)
and one in the top available one.
For example, if the local page has links to de and fr,
this option will make sure that only local and de: (larger)
@@ -1352,14 +1352,7 @@
lclSiteDone = False
frgnSiteDone = False
- # XXX Do we really need to make an union here?
- # we should have sorted(languages_by_size) = sorted(langs) ?!
- langBySize = set(lclSite.family.languages_by_size)
- allLangs = set(lclSite.family.langs)
-
- langToCheck = (langBySize | allLangs).difference(lclSite.family.obsolete)
-
- for siteCode in langToCheck:
+ for siteCode in lclSite.family.languages_by_size:
site = wikipedia.getSite(code = siteCode)
if (not lclSiteDone and site == lclSite) or (not frgnSiteDone and site != lclSite and site in new):
if site == lclSite:
Revision: 6776
Author: cosoleto
Date: 2009-04-30 12:54:17 +0000 (Thu, 30 Apr 2009)
Log Message:
-----------
_GetAll.run: avoid searching the entire XML data when only the final bytes need to be checked. Not explicitly suggested by me in bug #2771272.
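A minimal sketch of the check (simplified names, not from the commit): a complete export ends with the closing </mediawiki> tag, so inspecting the last few bytes is enough and avoids scanning the whole response.

    def looks_complete(data):
        # "</mediawiki>" is 12 characters; a 20-byte window leaves room
        # for trailing whitespace or newlines.
        return "</mediawiki>" in data[-20:]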
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2009-04-30 12:27:23 UTC (rev 6775)
+++ trunk/pywikipedia/wikipedia.py 2009-04-30 12:54:17 UTC (rev 6776)
@@ -2974,7 +2974,7 @@
else:
if "<title>Wiki does not exist</title>" in data:
raise NoSuchSite(u'Wiki %s does not exist yet' % self.site)
- elif "</mediawiki>" not in data:
+ elif "</mediawiki>" not in data[-20:]:
# HTML error Page got thrown because of an internal
# error when fetching a revision.
output(u'Remote site has a problem, it probably ' \
Revision: 6773
Author: cosoleto
Date: 2009-04-30 12:08:54 +0000 (Thu, 30 Apr 2009)
Log Message:
-----------
Fixed an incorrect reference to CaptchaError: within wikipedia.py the exception is a module-level name, so qualifying it as wikipedia.CaptchaError would raise a NameError instead of the intended exception.
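A sketch of the failure mode (hypothetical minimal module mirroring wikipedia.py's layout; the module does not import itself):

    class CaptchaError(Exception):
        pass

    def getCaptchaAnswer():
        # Wrong: inside this module the name 'wikipedia' is not bound, so
        # the next line would raise NameError instead of CaptchaError.
        # raise wikipedia.CaptchaError('captcha-id')
        raise CaptchaError('captcha-id')   # use the local name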
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2009-04-30 11:04:13 UTC (rev 6772)
+++ trunk/pywikipedia/wikipedia.py 2009-04-30 12:08:54 UTC (rev 6773)
@@ -4564,13 +4564,13 @@
answer = input('What is the answer to the captcha "%s" ?' % match.group('question'))
else:
if not config.solve_captcha:
- raise wikipedia.CaptchaError(id)
+ raise CaptchaError(id)
url = self.protocol() + '://' + self.hostname() + self.captcha_image_address(id)
answer = ui.askForCaptcha(url)
return {'id':id, 'answer':answer}
Recaptcha = re.compile('<script type="text/javascript" src="http://api\.recaptcha\.net/[^"]*"></script>')
if Recaptcha.search(data):
- raise wikipedia.CaptchaError('We have been prompted for a ReCaptcha, but pywikipedia does not yet support ReCaptchas')
+ raise CaptchaError('We have been prompted for a ReCaptcha, but pywikipedia does not yet support ReCaptchas')
return None
def postForm(self, address, predata, sysop=False, cookies = None):