Bugs item #1672346, was opened at 2007-03-02 03:33
Message generated for change (Comment added) made by nobody
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1672346&group_…
Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: None
Group: None
Status: Open
Resolution: None
Priority: 5
Private: No
Submitted By: Nobody/Anonymous (nobody)
Assigned to: Nobody/Anonymous (nobody)
Summary: uncaught socket.error exception
Initial Comment:
I got an uncaught socket error in one of my scripts today:
Traceback (most recent call last):
File "./replace-link.py", line 256, in ?
page.put(changedict[title], summ)
File "wikipedia.py", line 981, in put
return self.putPage(newtext, comment, watchArticle, minorEdit, newPage, self.site().getToken(sysop = sysop), sysop = sysop)
File "wikipedia.py", line 1049, in putPage
response, data = self.site().postForm(address, predata, sysop = sysop)
File "wikipedia.py", line 2673, in postForm
return self.postData(address, data, sysop = sysop)
File "wikipedia.py", line 2696, in postData
conn.endheaders()
File "/usr/lib/python2.4/httplib.py", line 798, in endheaders
self._send_output()
File "/usr/lib/python2.4/httplib.py", line 679, in _send_output
self.send(msg)
File "/usr/lib/python2.4/httplib.py", line 646, in send
self.connect()
File "/usr/lib/python2.4/httplib.py", line 630, in connect
raise socket.error, msg
socket.error: (110, 'Connection timed out')
Since all that socket juggling is private to wikipedia.py, I suggest that this exception be caught and re-raised as a wikipedia.Error.
----------------------------------------------------------------------
Comment By: Nobody/Anonymous (nobody)
Date: 2007-08-23 07:44
Message:
Logged In: NO
As of SVN revision 4096, socket errors from wikipedia.py:3174-3176 are not
caught (the comment just before these lines even makes note of this
fact).
I suggest
response = conn.getresponse()
data = response.read().decode(self.encoding())
conn.close()
be changed to
try:
response = conn.getresponse()
data = response.read().decode(self.encoding())
conn.close()
except socket.error, errmsg:
raise ServerError(errmsg)
It's really annoying to have a longer bot job that is aware of
wikipedia.Error crash because of an intermittent "Connection reset by
peer".
----------------------------------------------------------------------
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1672346&group_…
Revision: 4096
Author: valhallasw
Date: 2007-08-23 14:17:57 +0000 (Thu, 23 Aug 2007)
Log Message:
-----------
Page.put() now obeys {{bots}} and {{nobots}} if not asked to ignore them (using the force=True parameter).
Using _flush(), get_throttle is now dropped at the end of the run.
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-08-23 13:09:56 UTC (rev 4095)
+++ trunk/pywikipedia/wikipedia.py 2007-08-23 14:17:57 UTC (rev 4096)
@@ -1,4 +1,4 @@
-## -*- coding: utf-8 -*-
+## -*- coding: utf-8 -*-
"""
Library to get and put pages on a MediaWiki.
@@ -1030,13 +1030,13 @@
yield Page(site, fileLink)
def put_async(self, newtext,
- comment=None, watchArticle=None, minorEdit=True):
+ comment=None, watchArticle=None, minorEdit=True, force=False):
"""Asynchronous version of put (takes the same arguments), which
places pages on a queue to be saved by a daemon thread.
"""
- page_put_queue.put((self, newtext, comment, watchArticle, minorEdit))
+ page_put_queue.put((self, newtext, comment, watchArticle, minorEdit, force))
- def put(self, newtext, comment=None, watchArticle = None, minorEdit = True):
+ def put(self, newtext, comment=None, watchArticle = None, minorEdit = True, force=False):
"""Replace the new page with the contents of the first argument.
The second argument is a string that is to be used as the
summary for the modification
@@ -1051,6 +1051,11 @@
#except NoPage:
# pass
+ # Determine if we are allowed to edit
+ if not force:
+ if not self.botMayEdit():
+ raise LockedPage(u'Not allowed to edit %s because of a restricting template' % self.aslink())
+
# If there is an unchecked edit restriction, we need to load the page
if self._editrestriction:
output(u'Page %s is semi-protected. Getting edit page to find out if we are allowed to edit.' % self.aslink())
@@ -4641,13 +4646,13 @@
Daemon that takes pages from the queue and tries to save them on the wiki.
'''
while True:
- page, newtext, comment, watchArticle, minorEdit = page_put_queue.get()
+ page, newtext, comment, watchArticle, minorEdit, force = page_put_queue.get()
if page is None:
# needed for compatibility with Python 2.3 and 2.4
# in 2.5, we could use the Queue's task_done() and join() methods
return
try:
- page.put(newtext, comment, watchArticle, minorEdit)
+ page.put(newtext, comment, watchArticle, minorEdit, force)
except SpamfilterError, ex:
output(u"Saving page [[%s]] prevented by spam filter: %s"
% (page.title(), ex.url))
@@ -4683,7 +4688,7 @@
remaining = datetime.timedelta(seconds=(page_put_queue.qsize()+1) * config.put_throttle)
output('Waiting for %i pages to be put. Estimated time remaining: %s' % (page_put_queue.qsize()+1, remaining))
- page_put_queue.put((None, None, None, None, None))
+ page_put_queue.put((None, None, None, None, None, None))
while(_putthread.isAlive()):
try:
@@ -4694,6 +4699,7 @@
['yes', 'no'], ['y', 'N'], 'N')
if answer in ['y', 'Y']:
return
+ get_throttle.drop()
import atexit
atexit.register(_flush)
Revision: 4092
Author: cosoleto
Date: 2007-08-23 08:30:49 +0000 (Thu, 23 Aug 2007)
Log Message:
-----------
economize_query() to skip sequence of numbers or wide comma separated lists.
Modified Paths:
--------------
trunk/pywikipedia/config.py
trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2007-08-22 21:52:43 UTC (rev 4091)
+++ trunk/pywikipedia/config.py 2007-08-23 08:30:49 UTC (rev 4092)
@@ -344,6 +344,14 @@
# Append length of URL to script result
copyright_show_length = True
+# By default the script tries to identify and skip text that contains a wide
+# comma separated list or only numbers. But sometimes that might be the
+# only unmodified part of a slightly edited and not otherwise reported
+# copyright violation. You can disable this feature to try to increase
+# accuracy.
+
+copyright_economize_query = True
+
############## FURTHER SETTINGS ##############
# The bot can make some additional changes to each page it edits, e.g. fix
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py 2007-08-22 21:52:43 UTC (rev 4091)
+++ trunk/pywikipedia/copyright.py 2007-08-23 08:30:49 UTC (rev 4092)
@@ -75,10 +75,15 @@
__version__='$Id$'
-# Try to skip quoted text
+# Try to skip quoted text.
exclude_quote = True
-# No checks if the page is a disambiguation page
+# If the ratio between query length and number of commas is greater than or
+# equal to 'comma_ratio', then the script identifies a comma separated list
+# and doesn't send data to the search engine.
+comma_ratio = 5
+
+# No checks if the page is a disambiguation page.
skip_disambig = True
appdir = "copyright/"
@@ -323,6 +328,27 @@
f.close()
#
+# Ignore text that contains a comma separated list, only numbers,
+# punctuation...
+
+def economize_query(text):
+ # Comma separated list
+ if text.count(', ') > 4:
+ l = len(text)
+ c = text.count(', ')
+ r = 100 * c / l
+
+ if r >= comma_ratio:
+ return True
+
+ # write_log("%d/%d/%d: %s\n" % (l,c,r,text), "copyright/skip" + str(r) + ".txt")
+
+ # Numbers
+ if re.search('[^0-9\'*/,. +?:;-]{5}', text):
+ return False
+ return True
+
+#
# Set regex used in cleanwikicode() to remove [[Image:]] tags
# and regex used in check_in_source() to reject pages with
# 'Wikipedia'.
@@ -442,6 +468,11 @@
line = cleanwikicode(line)
for search_words in mysplit(line, 31, " "):
if len(search_words) > 120:
+ if config.copyright_economize_query:
+ if economize_query(search_words):
+ wikipedia.output('SKIP TEXT: ' + search_words)
+ consecutive = False
+ continue
n_query += 1
#wikipedia.output(search_words)
if config.copyright_max_query_for_page and n_query > config.copyright_max_query_for_page:
Revision: 4091
Author: cosoleto
Date: 2007-08-22 21:52:43 +0000 (Wed, 22 Aug 2007)
Log Message:
-----------
Patch to disable search engine/sleep/stop/ignore if the search engine refuses your query because you exceed your daily quota.
Modified Paths:
--------------
trunk/pywikipedia/config.py
trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2007-08-22 20:52:26 UTC (rev 4090)
+++ trunk/pywikipedia/config.py 2007-08-22 21:52:43 UTC (rev 4091)
@@ -288,10 +288,10 @@
############## SEARCH ENGINE SETTINGS ##############
-# Some scripts allow querying Google via the Google Web API. To use this feature, you must
-# install the pyGoogle module from http://pygoogle.sf.net/ and have a Google
-# Web API license key. Note that
-# Google doesn't give out license keys anymore.
+# Some scripts allow querying Google via the Google Web API. To use this feature,
+# you must install the pyGoogle module from http://pygoogle.sf.net/ and have a
+# Google Web API license key. Note that Google doesn't give out license keys
+# anymore.
google_key = ''
# Some scripts allow using the Yahoo! Search Web Services. To use this feature,
@@ -326,6 +326,18 @@
# Number of attempts on connection error.
copyright_connection_tries = 10
+# Behavior if an exceeded error occurs.
+#
+# Possibilities:
+#
+# 0 = None
+# 1 = Disable search engine
+# 2 = Sleep (default)
+# 3 = Stop
+
+copyright_exceeded_in_queries = 2
+copyright_exceeded_in_queries_sleep_hours = 6
+
# Append last modified date of URL to script result
copyright_show_date = True
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py 2007-08-22 20:52:26 UTC (rev 4090)
+++ trunk/pywikipedia/copyright.py 2007-08-22 21:52:43 UTC (rev 4091)
@@ -659,6 +659,20 @@
url.append((add_item, engine, comment))
return
+def exceeded_in_queries(engine):
+ """Behavior if an exceeded error occurs."""
+
+ # Disable search engine
+ if config.copyright_exceeded_in_queries == 1:
+ exec('config.copyright_' + engine + ' = False')
+ # Sleeping
+ if config.copyright_exceeded_in_queries == 2:
+ print "Got a queries exceeded error. Sleeping for %d hours..." % (config.copyright_exceeded_in_queries_sleep_hours)
+ time.sleep(config.copyright_exceeded_in_queries_sleep_hours * 60 * 60)
+ # Stop execution
+ if config.copyright_exceeded_in_queries == 3:
+ raise 'Got a queries exceeded error.'
+
def get_results(query, numresults = 10):
url = list()
query = re.sub("[()\"<>]", "", query)
@@ -677,9 +691,13 @@
except KeyboardInterrupt:
raise
except Exception, err:
- #SOAP.faultType: <Fault SOAP-ENV:Server: Exception from service object:
- # Daily limit of 1000 queries exceeded for key xxx>
print "Got an error ->", err
+ #
+ # SOAP.faultType: <Fault SOAP-ENV:Server: Exception from service object:
+ # Daily limit of 1000 queries exceeded for key ***>
+ #
+ if 'Daily limit' in str(err):
+ exceeded_in_queries('google')
if search_request_retry:
search_request_retry -= 1
if config.copyright_yahoo:
@@ -696,6 +714,8 @@
search_request_retry = 0
except Exception, err:
print "Got an error ->", err
+ if 'limit exceeded' in str(err):
+ exceeded_in_queries('yahoo')
if search_request_retry:
search_request_retry -= 1
if config.copyright_msn:
Revision: 4090
Author: cosoleto
Date: 2007-08-22 20:52:26 +0000 (Wed, 22 Aug 2007)
Log Message:
-----------
Added support for Live Search
Modified Paths:
--------------
trunk/pywikipedia/config.py
trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2007-08-22 20:35:48 UTC (rev 4089)
+++ trunk/pywikipedia/config.py 2007-08-22 20:52:26 UTC (rev 4090)
@@ -288,13 +288,10 @@
############## SEARCH ENGINE SETTINGS ##############
-# Some scripts allow querying Google either via the Google Web API, or by
-# just parsing the HTML from the Google website.
-# To use the Google Web API, you must install the pyGoogle module from
-# http://pygoogle.sf.net/ and have a Google Web API license key. Note that
+# Some scripts allow querying Google via the Google Web API. To use this feature, you must
+# install the pyGoogle module from http://pygoogle.sf.net/ and have a Google
+# Web API license key. Note that
# Google doesn't give out license keys anymore.
-# If you don't enter a google license key in your user config file, the scripts
-# will just parse the raw HTML code from the website.
google_key = ''
# Some scripts allow using the Yahoo! Search Web Services. To use this feature,
@@ -302,17 +299,23 @@
# and get a Yahoo AppID from http://developer.yahoo.com
yahoo_appid = ''
+# To use Windows Live Search web service you must get an AppID from
+# http://search.msn.com/developer
+msn_appid = ''
+
############## COPYRIGHT SETTINGS ##############
# Enable/disable search engine in copyright.py script
copyright_google = True
copyright_yahoo = True
+copyright_msn = False
# Perform a deep check, loading URLs to search if 'Wikipedia' is present.
# This may be useful to improve number of correct results. If you haven't
# a fast connection, you might want to keep them disabled.
copyright_check_in_source_google = False
copyright_check_in_source_yahoo = False
+copyright_check_in_source_msn = False
# Limit number of queries for page.
copyright_max_query_for_page = 25
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py 2007-08-22 20:35:48 UTC (rev 4089)
+++ trunk/pywikipedia/copyright.py 2007-08-22 20:52:26 UTC (rev 4090)
@@ -596,7 +596,8 @@
def add_in_urllist(url, add_item, engine):
if (engine == 'google' and config.copyright_check_in_source_google) or \
- (engine == 'yahoo' and config.copyright_check_in_source_yahoo):
+ (engine == 'yahoo' and config.copyright_check_in_source_yahoo) or \
+ (engine == 'msn' and config.copyright_check_in_source_msn):
check_in_source = True
else:
check_in_source = False
@@ -697,32 +698,40 @@
print "Got an error ->", err
if search_request_retry:
search_request_retry -= 1
- #if search_in_msn:
- # ## max_query_len = 150?
- # from __SOAPpy import WSDL
- # print " msn query..."
- # wsdl_url = 'http://soap.search.msn.com/webservices.asmx?wsdl'
- # server = WSDL.Proxy(wsdl_url)
- # params = {'AppID': config.msn_appid, 'Query': '-Wikipedia "' + query + '"', 'CultureInfo': 'en-US', 'SafeSearch': 'Off', 'Requests': {
- # 'SourceRequest':{'Source': 'Web', 'Offset': 0, 'Count': 10, 'ResultFields': 'All',}}}
- #
- # search_request_retry = config.copyright_connection_tries
- # results = ''
- # while search_request_retry:
- # try:
- # server_results = server.Search(Request = params)
- # search_request_retry = 0
- # if server_results.Responses[0].Results:
- # results = server_results.Responses[0].Results[0]
- # except Exception, err:
- # print "Got an error ->", err
- # search_request_retry -= 1
- # for entry in results:
- # try:
- # add_in_urllist(url, entry.Url, 'msn')
- # except AttributeError:
- # print "attrib ERROR"
+ if config.copyright_msn:
+ #max_query_len = 150?
+ from SOAPpy import WSDL
+ print " Live query..."
+ try:
+ server = WSDL.Proxy('http://soap.search.msn.com/webservices.asmx?wsdl')
+ except:
+ print "Live Search Error"
+ raise
+ params = {'AppID': config.msn_appid, 'Query': '-Wikipedia "' + query + '"', 'CultureInfo': 'en-US', 'SafeSearch': 'Off', 'Requests': {
+ 'SourceRequest':{'Source': 'Web', 'Offset': 0, 'Count': 10, 'ResultFields': 'All',}}}
+
+ search_request_retry = config.copyright_connection_tries
+ results = ''
+ while search_request_retry:
+ try:
+ server_results = server.Search(Request = params)
+ search_request_retry = 0
+ if server_results.Responses[0].Results:
+ results = server_results.Responses[0].Results[0]
+ except Exception, err:
+ print "Got an error ->", err
+ if search_request_retry:
+ search_request_retry -= 1
+
+ if results:
+ # list or instance?
+ if type(results) == type([]):
+ for entry in results:
+ add_in_urllist(url, entry.Url, 'msn')
+ else:
+ add_in_urllist(url, results.Url, 'msn')
+
offset = 0
for i in range(len(url)):
if check_list(url[i + offset][0], excl_list, verbose = True):