Revision: 8024
Author: xqt
Date: 2010-03-18 14:27:48 +0000 (Thu, 18 Mar 2010)
Log Message:
-----------
Do not yield a page in logpages() if result['query']['logevents'][index].has_key('actionhidden')
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2010-03-18 07:16:02 UTC (rev 8023)
+++ trunk/pywikipedia/wikipedia.py 2010-03-18 14:27:48 UTC (rev 8024)
@@ -4027,8 +4027,8 @@
# TODO: why isn't this a Site method?
pages = list(pages) # if pages is an iterator, we need to make it a list
output(u'Getting %d pages from %s' % (len(pages), site), newline=False)
- #if site.has_api():
- # output(u' via API', newline=False)
+ if site.has_api() and debug:
+ output(u' via API', newline=False)
output(u'...')
limit = config.special_page_limit / 4 # default is 500/4, but It might have good point for server.
if len(pages) > limit:
@@ -4935,7 +4935,7 @@
if config.retry_on_fail:
retry_attempt += 1
if retry_attempt > config.maxretries:
- raise ServerError()
+ raise MaxTriesExceededError()
output(u"""WARNING: Could not open '%s'.\nMaybe the server is down. Retrying in %i minutes..."""
% (url, retry_idle_time))
time.sleep(retry_idle_time * 60)
@@ -5703,7 +5703,8 @@
output('%s' % result)
raise Error
for c in result['query']['logevents']:
- if not namespace or c['ns'] in namespace:
+ if (not namespace or c['ns'] in namespace) and \
+ not c.has_key('actionhidden'):
yield (Page(self, c['title'], defaultNamespace=c['ns']),
c['user'],
parsetime2stamp(c['timestamp']),
Revision: 8023
Author: xqt
Date: 2010-03-18 07:16:02 +0000 (Thu, 18 Mar 2010)
Log Message:
-----------
insert debug information for #2972249
Modified Paths:
--------------
trunk/pywikipedia/weblinkchecker.py
Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================
--- trunk/pywikipedia/weblinkchecker.py 2010-03-18 06:46:50 UTC (rev 8022)
+++ trunk/pywikipedia/weblinkchecker.py 2010-03-18 07:16:02 UTC (rev 8023)
@@ -420,7 +420,11 @@
if isinstance(error, basestring):
msg = error
else:
- msg = error[1]
+ try:
+ msg = error[1]
+ except IndexError:
+ print u'### DEBUG information for #2972249'
+ raise IndexError, error
# TODO: decode msg. On Linux, it's encoded in UTF-8.
# How is it encoded in Windows? Or can we somehow just
# get the English message?
Revision: 8022
Author: xqt
Date: 2010-03-18 06:46:50 +0000 (Thu, 18 Mar 2010)
Log Message:
-----------
patch #2972270 for bug #2970428 (not stripping | following link). Thanks masti.
Modified Paths:
--------------
branches/rewrite/pywikibot/textlib.py
trunk/pywikipedia/pywikibot/textlib.py
Modified: branches/rewrite/pywikibot/textlib.py
===================================================================
--- branches/rewrite/pywikibot/textlib.py 2010-03-17 17:55:22 UTC (rev 8021)
+++ branches/rewrite/pywikibot/textlib.py 2010-03-18 06:46:50 UTC (rev 8022)
@@ -7,7 +7,7 @@
"""
#
-# (C) Pywikipedia bot team, 2008
+# (C) Pywikipedia bot team, 2008-2010
#
# Distributed under the terms of the MIT license.
#
@@ -92,7 +92,8 @@
# this matches internal wikilinks, but also interwiki, categories, and
# images.
'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
- 'interwiki': re.compile(r'(?i)\[\[(%s)\s?:[^\]]*\]\][\s]*'
+ # also finds links to foreign sites with preleading ":"
+ 'interwiki': re.compile(r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*'
% '|'.join(site.validLanguageLinks()
+ site.family.obsolete.keys())
),
@@ -678,7 +679,7 @@
# Note: While allowing parenthesis inside URLs, MediaWiki will regard
# right parenthesis at the end of the URL as not part of that URL.
# The same applies to dot, comma, colon and some other characters.
- notAtEnd = '\]\s\)\.:;,<>"'
+ notAtEnd = '\]\s\)\.:;,<>"\|'
# So characters inside the URL can be anything except whitespace,
# closing squared brackets, quotation marks, greater than and less
# than, and the last character also can't be parenthesis or another
Modified: trunk/pywikipedia/pywikibot/textlib.py
===================================================================
--- trunk/pywikipedia/pywikibot/textlib.py 2010-03-17 17:55:22 UTC (rev 8021)
+++ trunk/pywikipedia/pywikibot/textlib.py 2010-03-18 06:46:50 UTC (rev 8022)
@@ -680,7 +680,7 @@
# Note: While allowing parenthesis inside URLs, MediaWiki will regard
# right parenthesis at the end of the URL as not part of that URL.
# The same applies to dot, comma, colon and some other characters.
- notAtEnd = '\]\s\)\.:;,<>"'
+ notAtEnd = '\]\s\)\.:;,<>"\|'
# So characters inside the URL can be anything except whitespace,
# closing squared brackets, quotation marks, greater than and less
# than, and the last character also can't be parenthesis or another
Revision: 8017
Author: xqt
Date: 2010-03-17 06:42:37 +0000 (Wed, 17 Mar 2010)
Log Message:
-----------
revert r8011: sometimes _GetAll() via API does not find any revisions
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2010-03-16 20:01:58 UTC (rev 8016)
+++ trunk/pywikipedia/wikipedia.py 2010-03-17 06:42:37 UTC (rev 8017)
@@ -3663,8 +3663,12 @@
def run(self):
if self.pages:
+ doAPI = None
+ # API Implemented Check
+ # doAPI = self.site.has_api()
+ # Sometimes query does not contains revisions
- if self.site.has_api():
+ if doAPI:
while True:
try:
data = self.getDataApi()
@@ -4021,8 +4025,8 @@
# TODO: why isn't this a Site method?
pages = list(pages) # if pages is an iterator, we need to make it a list
output(u'Getting %d pages from %s' % (len(pages), site), newline=False)
- if site.has_api():
- output(u' via API', newline=False)
+ #if site.has_api():
+ # output(u' via API', newline=False)
output(u'...')
limit = config.special_page_limit / 4 # default is 500/4, but It might have good point for server.
Revision: 8016
Author: valhallasw
Date: 2010-03-16 20:01:58 +0000 (Tue, 16 Mar 2010)
Log Message:
-----------
And one for the rewrite branch
Added Paths:
-----------
branches/rewrite/README
Added: branches/rewrite/README
===================================================================
--- branches/rewrite/README (rev 0)
+++ branches/rewrite/README 2010-03-16 20:01:58 UTC (rev 8016)
@@ -0,0 +1 @@
+This is the rewrite of the Python Wikipedia Robot Framework. It features several improvements, such as full API usage and a pythonic package layout.