Revision: 5014
Author: russblau
Date: 2008-02-12 22:20:38 +0000 (Tue, 12 Feb 2008)
Log Message:
-----------
Refactor page-put logic for better error detection; if any of the "DEBUG" messages appear during use, please post a bug report!
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2008-02-12 19:17:56 UTC (rev 5013)
+++ trunk/pywikipedia/wikipedia.py 2008-02-12 22:20:38 UTC (rev 5014)
@@ -1278,105 +1278,114 @@
# I'm not sure what to check in this case, so I just assume
# things went ok. Very naive, I agree.
data = u''
- else:
- try:
- response, data = self.site().postForm(address, predata, sysop)
- except httplib.BadStatusLine, line:
- raise PageNotSaved('Bad status line: %s' % line.line)
- except ServerError:
- output(u''.join(traceback.format_exception(*sys.exc_info())))
- output(u'Got a server error when putting; will retry in %i minutes.' % retry_delay)
- time.sleep(60 * retry_delay)
- retry_delay *= 2
- if retry_delay > 30:
- retry_delay = 30
- continue
- if data != u'':
- # Saving unsuccessful. Possible reasons:
- # server lag, edit conflict or invalid edit token.
- # A second text area means that an edit conflict has occurred.
- if response.status == 503 \
- and 'x-database-lag' in response.msg.keys():
- # server lag; Mediawiki recommends waiting 5 seconds and
- # retrying
- if verbose:
- output(data, newline=False)
- output(u"Pausing 5 seconds due to database server lag.")
- time.sleep(5)
- continue
- if data.find( "<title>Wikimedia Error</title>") > -1:
- output(
- u"Wikimedia has technical problems; will retry in %i minutes."
- % retry_delay)
- time.sleep(60 * retry_delay)
- retry_delay *= 2
- if retry_delay > 30:
- retry_delay = 30
- continue
- if 'id=\'wpTextbox2\' name="wpTextbox2"' in data:
- raise EditConflict(u'An edit conflict has occurred.')
- elif self.site().has_mediawiki_message("spamprotectiontitle")\
- and self.site().mediawiki_message('spamprotectiontitle') in data:
- try:
- reasonR = re.compile(re.escape(self.site().mediawiki_message('spamprotectionmatch')).replace('\$1', '(?P<url>[^<]*)'))
- url = reasonR.search(data).group('url')
- except:
- # Some wikis have modified the spamprotectionmatch
- # template in a way that the above regex doesn't work,
- # e.g. on he.wikipedia the template includes a
- # wikilink, and on fr.wikipedia there is bold text.
- # This is a workaround for this: it takes the region
- # which should contain the spamfilter report and the
- # URL. It then searches for a plaintext URL.
- relevant = data[data.find('<!-- start content -->')+22:data.find('<!-- end content -->')].strip()
- # Throw away all the other links etc.
- relevant = re.sub('<.*?>', '', relevant)
- relevant = relevant.replace('&#58;', ':')
- # MediaWiki only spam-checks HTTP links, and only the
- # domain name part of the URL.
- m = re.search('http://[\w\-\.]+', relevant)
- if m:
- url = m.group()
- else:
- # Can't extract the exact URL. Let the user search.
- url = relevant
- raise SpamfilterError(url)
- elif '<label for=\'wpRecreate\'' in data:
- # Make sure your system clock is correct if this error occurs
- # without any reason!
- raise EditConflict(u'Someone deleted the page.')
- elif self.site().has_mediawiki_message("viewsource")\
- and self.site().mediawiki_message('viewsource') in data:
- # The page is locked. This should have already been
- # detected when getting the page, but there are some
- # reasons why this didn't work, e.g. the page might be
- # locked via a cascade lock.
- try:
- # Page is restricted - try using the sysop account, unless we're using one already
- if not sysop:
- self.site().forceLogin(sysop = True)
- output(u'Page is locked, retrying using sysop account.')
- return self._putPage(text, comment, watchArticle,
- minorEdit, newPage, token=None,
- gettoken=True, sysop=True)
- except NoUsername:
- raise LockedPage()
- elif not newTokenRetrieved and "<textarea" in data:
- # We might have been using an outdated token
- output(u"Changing page has failed. Retrying.")
- return self._putPage(text = text, comment = comment,
- watchArticle = watchArticle, minorEdit = minorEdit, newPage = newPage,
- token = None, gettoken = True, sysop = sysop)
- else:
- # Something went wrong, and we don't know what. Show the
- # HTML code that hopefully includes some error message.
- output(data)
- return response.status, response.reason, data
- if self.site().hostname() in config.authenticate.keys():
# No idea how to get the info now.
return None
- else:
+ try:
+ response, data = self.site().postForm(address, predata, sysop)
+ except httplib.BadStatusLine, line:
+ raise PageNotSaved('Bad status line: %s' % line.line)
+ except ServerError:
+ output(u''.join(traceback.format_exception(*sys.exc_info())))
+ output(
+ u'Got a server error when putting; will retry in %i minute%s.'
+ % (retry_delay, retry_delay != 1 and "s" or ""))
+ time.sleep(60 * retry_delay)
+ retry_delay *= 2
+ if retry_delay > 30:
+ retry_delay = 30
+ continue
+ if response.status == 503 \
+ and 'x-database-lag' in response.msg.keys():
+ # server lag; Mediawiki recommends waiting 5 seconds and
+ # retrying
+ if verbose:
+ output(data, newline=False)
+ output(u"Pausing 5 seconds due to database server lag.")
+ time.sleep(5)
+ continue
+ # A second text area means that an edit conflict has occurred.
+ if 'id=\'wpTextbox2\' name="wpTextbox2"' in data:
+ raise EditConflict(u'An edit conflict has occurred.')
+ if self.site().has_mediawiki_message("spamprotectiontitle")\
+ and self.site().mediawiki_message('spamprotectiontitle') in data:
+ try:
+ reasonR = re.compile(re.escape(self.site().mediawiki_message('spamprotectionmatch')).replace('\$1', '(?P<url>[^<]*)'))
+ url = reasonR.search(data).group('url')
+ except:
+ # Some wikis have modified the spamprotectionmatch
+ # template in a way that the above regex doesn't work,
+ # e.g. on he.wikipedia the template includes a
+ # wikilink, and on fr.wikipedia there is bold text.
+ # This is a workaround for this: it takes the region
+ # which should contain the spamfilter report and the
+ # URL. It then searches for a plaintext URL.
+ relevant = data[data.find('<!-- start content -->')+22:data.find('<!-- end content -->')].strip()
+ # Throw away all the other links etc.
+ relevant = re.sub('<.*?>', '', relevant)
+ relevant = relevant.replace('&#58;', ':')
+ # MediaWiki only spam-checks HTTP links, and only the
+ # domain name part of the URL.
+ m = re.search('http://[\w\-\.]+', relevant)
+ if m:
+ url = m.group()
+ else:
+ # Can't extract the exact URL. Let the user search.
+ url = relevant
+ raise SpamfilterError(url)
+ if '<label for=\'wpRecreate\'' in data:
+ # Make sure your system clock is correct if this error occurs
+ # without any reason!
+ raise EditConflict(u'Someone deleted the page.')
+ if self.site().has_mediawiki_message("viewsource")\
+ and self.site().mediawiki_message('viewsource') in data:
+ # The page is locked. This should have already been
+ # detected when getting the page, but there are some
+ # reasons why this didn't work, e.g. the page might be
+ # locked via a cascade lock.
+ try:
+ # Page is restricted - try using the sysop account, unless we're using one already
+ if not sysop:
+ self.site().forceLogin(sysop = True)
+ output(u'Page is locked, retrying using sysop account.')
+ return self._putPage(text, comment, watchArticle,
+ minorEdit, newPage, token=None,
+ gettoken=True, sysop=True)
+ except NoUsername:
+ raise LockedPage()
+ if not newTokenRetrieved and "<textarea" in data:
+ # We might have been using an outdated token
+ output(u"Changing page has failed. Retrying.")
+ return self._putPage(text = text, comment = comment,
+ watchArticle = watchArticle, minorEdit = minorEdit, newPage = newPage,
+ token = None, gettoken = True, sysop = sysop)
+ if response.status != 302:
+ # normal response to a page save is 302
+ # anything else is abnormal (and is flagged with DEBUG)
+ output(
+ u"Abnormal response %i from server; will try again in %i minute%s."
+ % (response.status, retry_delay,
+ retry_delay != 1 and "s" or ""))
+ time.sleep(60 * retry_delay)
+ retry_delay *= 2
+ if retry_delay > 30:
+ retry_delay = 30
+ continue
+ if data.find("<title>Wikimedia Error</title>") > -1:
+ output(
+ u"DEBUG:Wikimedia has technical problems; will retry in %i minute%s."
+ % (retry_delay, retry_delay != 1 and "s" or ""))
+ time.sleep(60 * retry_delay)
+ retry_delay *= 2
+ if retry_delay > 30:
+ retry_delay = 30
+ continue
+ if data != u"":
+ # Something went wrong, and we don't know what. Show the
+ # HTML code that hopefully includes some error message.
+ output(u"DEBUG:Unexpected response from wiki server.")
+ output(data)
return response.status, response.reason, data
+ return response.status, response.reason, data
def canBeEdited(self):
"""Return bool indicating whether this page can be edited.
Bugs item #1891797, was opened at 2008-02-12 10:45
Message generated for change (Settings changed) made by rotemliss
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1891797&group_…
Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: None
Group: None
>Status: Closed
>Resolution: Fixed
Priority: 5
Private: No
Submitted By: Alex S.H. Lin (lin4h)
Assigned to: Nobody/Anonymous (nobody)
Summary: watchlist.py cannot parse with -all parameter
Initial Comment:
>watchlist -all
.....
Traceback (most recent call last):
  File "D:\My Documents\SOURCE\mwbot\pywikipedia\watchlist.py", line 109, in <module>
    main()
  File "D:\My Documents\SOURCE\mwbot\pywikipedia\watchlist.py", line 98, in main
    refresh_all()
  File "D:\My Documents\SOURCE\mwbot\pywikipedia\watchlist.py", line 87, in refresh_all
    family, lang = match.group(1).split('-')
ValueError: too many values to unpack
I think this bug only occurs when the language code contains a "-" character (e.g. be-x-old).
----------------------------------------------------------------------
Comment By: Rotem Liss (rotemliss)
Date: 2008-02-12 17:23
Message:
Logged In: YES
user_id=1327030
Originator: NO
Fixed in r5012.
----------------------------------------------------------------------
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1891797&group_…
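The unpacking above fails because split('-') cuts a hyphenated language code into extra fields. The exact change made in r5012 is not shown in this digest, but limiting the split to the first hyphen (assuming the cached name puts the family before the language code) is the kind of fix that resolves it:

# Hypothetical cache entry in "family-lang" form, where the language
# code itself may contain hyphens (e.g. be-x-old).
entry = "wikipedia-be-x-old"

# family, lang = entry.split('-')    # ValueError: too many values to unpack
family, lang = entry.split('-', 1)   # split on the first hyphen only
assert (family, lang) == ("wikipedia", "be-x-old")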
Feature Requests item #1891916, was opened at 2008-02-12 13:08
Message generated for change (Tracker Item Submitted) made by Item Submitter
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603141&aid=1891916&group_…
Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: None
Group: None
Status: Open
Priority: 5
Private: No
Submitted By: Nightshadow28 (nightshadow28)
Assigned to: Nobody/Anonymous (nobody)
Summary: Blockpageschecker's default value of -move option
Initial Comment:
A project that uses "moving protection" always has to run with the -move option; if the bot operator forgets it, the bot will misbehave and make incorrect edits. Therefore, would you make the default value of the option depend on the site?
For example, by code as below:
---- (1)
# Check list to block the users that haven't set their preferences
project_inserted = ['en', 'fr', 'it', 'ja', 'pt', 'zh']
# (patch) Check list to change default value of -move ----------------- Start
project_moving_prot = [u'wikipedia:ja']
# (patch) ------------------------------------------------------------- End
---- (2)
# Load the right site
site = wikipedia.getSite()
# (patch) default option value of -move ---------------------------------- Start
if unicode(site) in project_moving_prot:
    wikipedia.output('This site needs to be checked for move protection, so the bot will check it anyway.')
    moveBlockCheck = True
# (patch) --------------------------------------------------------- End
Sincerely yours, Nightshadow28.
----------------------------------------------------------------------
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603141&aid=1891916&group_…
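The patch above amounts to a per-site override of the -move default. A self-contained sketch of the same idea is below; the list name and flag follow the submitter's naming, and none of this is the shipped blockpageschecker.py code.

# -*- coding: utf-8 -*-
# Sites whose policy relies on move protection, so the -move check
# should default to on there (the single entry mirrors the patch above).
project_moving_prot = [u'wikipedia:ja']

def default_move_check(site_key, move_block_check):
    # Return the effective -move setting for a given site; print stands
    # in for wikipedia.output() to keep the sketch free of framework
    # imports.
    if site_key in project_moving_prot:
        print(u'This site uses move protection; the bot will check it '
              u'regardless of the -move option.')
        return True
    return move_block_check

# e.g. on ja.wikipedia the check is forced on even without -move:
assert default_move_check(u'wikipedia:ja', False) is True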
Support Requests item #1813173, was opened at 2007-10-14 14:22
Message generated for change (Comment added) made by wikipedian
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603139&aid=1813173&group_…
Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: None
Group: None
>Status: Closed
Priority: 5
Private: No
Submitted By: André Malafaya Baptista (malafaya)
Assigned to: Nobody/Anonymous (nobody)
Summary: 'Title ... not found in list. Expected one of ...'
Initial Comment:
I'm constantly getting this message while processing [[en:Democratic Republic of the Congo]]:
---
Getting 1 pages from wikipedia:ka...
BUG>> title kongos demokratiuli respublika ([[ka:kongos demokratiuli respublika]]) not found in list
Expected one of: [[ka:kongos demokratiuli respublika?]]
---
The ka article (title shown transliterated, in yellow) exists without the '?'. Why does the bot expect the page title to have that '?' at the end?
----------------------------------------------------------------------
>Comment By: Daniel Herding (wikipedian)
Date: 2008-02-12 02:42
Message:
Logged In: YES
user_id=880694
Originator: NO
Left-to-right and right-to-left markers in page titles are now removed in
the Page constructor and thus ignored by the bot framework, so this bug is
fixed.
----------------------------------------------------------------------
Comment By: André Malafaya Baptista (malafaya)
Date: 2007-10-24 23:03
Message:
Logged In: YES
user_id=1037345
Originator: YES
Is there any way of working around this? Maybe by using the article name
returned by Special:Export instead of the name given by the interwiki
(while being aware of redirects)? This problem is spreading, because one
bad new interwiki is enough for the bot to propagate the mistake to all
Wikipedias, and then it is harder to correct.
----------------------------------------------------------------------
Comment By: André Malafaya Baptista (malafaya)
Date: 2007-10-24 01:14
Message:
Logged In: YES
user_id=1037345
Originator: YES
It seems interwiki.py itself is adding those invisible characters.
Take a look at:
http://en.wikipedia.org/w/index.php?title=Williamsburg%2C_Colorado&diff=166…
There you can see a first bot add (by Rei-bot) which adds a pt interwiki
with invisible character.
The next change by MalafayaBot happens after I manually correct one of the
interwikis by removing the invisible character.
----------------------------------------------------------------------
Comment By: Daniel Herding (wikipedian)
Date: 2007-10-15 00:48
Message:
Logged In: YES
user_id=880694
Originator: NO
This happens with titles that include invisible left-to-right or
right-to-left control characters. These are omitted in Special:Export or
something like that; I have forgotten exactly what happens.
----------------------------------------------------------------------
Comment By: André Malafaya Baptista (malafaya)
Date: 2007-10-14 15:05
Message:
Logged In: YES
user_id=1037345
Originator: YES
I think I got to a conclusion:
there seemed to be an invisible character in the ka interwiki in all
Wikipedias. After deleting and retyping the interwiki in English Wikipedia
(http://en.wikipedia.org/w/index.php?title=Democratic_Republic_of_the_Congo&…),
the bot now says there are 2 interwikis while processing: one plain title
and the other one with the yellow '?'.
The problem is that apparently when the bot fetches the page 'kongos
demokratiuli respublika?' it actually works and fetches the page 'kongos
demokratiuli respublika' (as if it were an implicit redirect).
Another problem is that when the bot tries to replace the bad interwiki
(the one with '?') with the correct one (without it), MediaWiki detects
'no changes' and just ignores the page update. So the incorrect interwiki
will still prevail until the bot makes a major update to the page.
----------------------------------------------------------------------
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603139&aid=1813173&group_…
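The fix described above removes invisible directional marks when a Page object is constructed. A minimal sketch of that normalization, under the assumption that the characters involved are U+200E (LEFT-TO-RIGHT MARK) and U+200F (RIGHT-TO-LEFT MARK); normalize_title is an illustrative name, not necessarily the code that went into the Page constructor.

# -*- coding: utf-8 -*-

def normalize_title(title):
    # Strip U+200E and U+200F: they render as nothing, so two titles can
    # look identical yet compare unequal -- most likely the stray '?'
    # reported above is such a mark shown by an unaware terminal.
    return title.replace(u'\u200e', u'').replace(u'\u200f', u'')

assert normalize_title(u'Democratic Republic of the Congo\u200e') == \
       u'Democratic Republic of the Congo'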