Bugs item #1672346, was opened at 2007-03-02 03:33
Message generated for change (Comment added) made by nobody
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1672346&group_…
Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: None
Group: None
Status: Open
Resolution: None
Priority: 5
Private: No
Submitted By: Nobody/Anonymous (nobody)
Assigned to: Nobody/Anonymous (nobody)
Summary: uncaught socket.error exception
Initial Comment:
I got an uncaught socket error in one of my scripts today:
Traceback (most recent call last):
File "./replace-link.py", line 256, in ?
page.put(changedict[title], summ)
File "wikipedia.py", line 981, in put
return self.putPage(newtext, comment, watchArticle, minorEdit, newPage, self.site().getToken(sysop = sysop), sysop = sysop)
File "wikipedia.py", line 1049, in putPage
response, data = self.site().postForm(address, predata, sysop = sysop)
File "wikipedia.py", line 2673, in postForm
return self.postData(address, data, sysop = sysop)
File "wikipedia.py", line 2696, in postData
conn.endheaders()
File "/usr/lib/python2.4/httplib.py", line 798, in endheaders
self._send_output()
File "/usr/lib/python2.4/httplib.py", line 679, in _send_output
self.send(msg)
File "/usr/lib/python2.4/httplib.py", line 646, in send
self.connect()
File "/usr/lib/python2.4/httplib.py", line 630, in connect
raise socket.error, msg
socket.error: (110, 'Connection timed out')
Since all that socket juggling is private to wikipedia.py, I suggest that this exception be caught and re-raised as a wikipedia.Error.
----------------------------------------------------------------------
Comment By: Nobody/Anonymous (nobody)
Date: 2007-08-23 07:44
Message:
Logged In: NO
As of SVN revision 4096, socket errors from wikipedia.py:3174-3176 are not
caught (the comment just before these lines even makes note of this
fact).
I suggest
response = conn.getresponse()
data = response.read().decode(self.encoding())
conn.close()
be changed to
try:
response = conn.getresponse()
data = response.read().decode(self.encoding())
conn.close()
except socket.error, errmsg:
raise ServerError(errmsg)
It's really annoying to have a longer bot job that is aware of
wikipedia.Error crash because of an intermittent "Connection reset by
peer".
----------------------------------------------------------------------
You can respond by visiting:
https://sourceforge.net/tracker/?func=detail&atid=603138&aid=1672346&group_…
Revision: 4096
Author: valhallasw
Date: 2007-08-23 14:17:57 +0000 (Thu, 23 Aug 2007)
Log Message:
-----------
Page.put() now obeys {{bots}} and {{nobots}} if not asked to ignore them (using the force=True parameter).
Using _flush(), get_throttle is now dropped at the end of the run.
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-08-23 13:09:56 UTC (rev 4095)
+++ trunk/pywikipedia/wikipedia.py 2007-08-23 14:17:57 UTC (rev 4096)
@@ -1,4 +1,4 @@
-## -*- coding: utf-8 -*-
+## -*- coding: utf-8 -*-
"""
Library to get and put pages on a MediaWiki.
@@ -1030,13 +1030,13 @@
yield Page(site, fileLink)
def put_async(self, newtext,
- comment=None, watchArticle=None, minorEdit=True):
+ comment=None, watchArticle=None, minorEdit=True, force=False):
"""Asynchronous version of put (takes the same arguments), which
places pages on a queue to be saved by a daemon thread.
"""
- page_put_queue.put((self, newtext, comment, watchArticle, minorEdit))
+ page_put_queue.put((self, newtext, comment, watchArticle, minorEdit, force))
- def put(self, newtext, comment=None, watchArticle = None, minorEdit = True):
+ def put(self, newtext, comment=None, watchArticle = None, minorEdit = True, force=False):
"""Replace the new page with the contents of the first argument.
The second argument is a string that is to be used as the
summary for the modification
@@ -1051,6 +1051,11 @@
#except NoPage:
# pass
+ # Determine if we are allowed to edit
+ if not force:
+ if not self.botMayEdit():
+ raise LockedPage(u'Not allowed to edit %s because of a restricting template' % self.aslink())
+
# If there is an unchecked edit restriction, we need to load the page
if self._editrestriction:
output(u'Page %s is semi-protected. Getting edit page to find out if we are allowed to edit.' % self.aslink())
@@ -4641,13 +4646,13 @@
Daemon that takes pages from the queue and tries to save them on the wiki.
'''
while True:
- page, newtext, comment, watchArticle, minorEdit = page_put_queue.get()
+ page, newtext, comment, watchArticle, minorEdit, force = page_put_queue.get()
if page is None:
# needed for compatibility with Python 2.3 and 2.4
# in 2.5, we could use the Queue's task_done() and join() methods
return
try:
- page.put(newtext, comment, watchArticle, minorEdit)
+ page.put(newtext, comment, watchArticle, minorEdit, force)
except SpamfilterError, ex:
output(u"Saving page [[%s]] prevented by spam filter: %s"
% (page.title(), ex.url))
@@ -4683,7 +4688,7 @@
remaining = datetime.timedelta(seconds=(page_put_queue.qsize()+1) * config.put_throttle)
output('Waiting for %i pages to be put. Estimated time remaining: %s' % (page_put_queue.qsize()+1, remaining))
- page_put_queue.put((None, None, None, None, None))
+ page_put_queue.put((None, None, None, None, None, None))
while(_putthread.isAlive()):
try:
@@ -4694,6 +4699,7 @@
['yes', 'no'], ['y', 'N'], 'N')
if answer in ['y', 'Y']:
return
+ get_throttle.drop()
import atexit
atexit.register(_flush)
Revision: 4092
Author: cosoleto
Date: 2007-08-23 08:30:49 +0000 (Thu, 23 Aug 2007)
Log Message:
-----------
economize_query() to skip sequence of numbers or wide comma separated lists.
Modified Paths:
--------------
trunk/pywikipedia/config.py
trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2007-08-22 21:52:43 UTC (rev 4091)
+++ trunk/pywikipedia/config.py 2007-08-23 08:30:49 UTC (rev 4092)
@@ -344,6 +344,14 @@
# Append length of URL to script result
copyright_show_length = True
+# By default the script tries to identify and skip text that contains a wide
+# comma separated list or only numbers. But sometimes that might be the
+# only unmodified part of a slightly edited and not otherwise reported
+# copyright violation. You can disable this feature to try to increase
+# accuracy.
+
+copyright_economize_query = True
+
############## FURTHER SETTINGS ##############
# The bot can make some additional changes to each page it edits, e.g. fix
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py 2007-08-22 21:52:43 UTC (rev 4091)
+++ trunk/pywikipedia/copyright.py 2007-08-23 08:30:49 UTC (rev 4092)
@@ -75,10 +75,15 @@
__version__='$Id$'
-# Try to skip quoted text
+# Try to skip quoted text.
exclude_quote = True
-# No checks if the page is a disambiguation page
+# If the ratio between query length and number of commas is greater than or
+# equal to 'comma_ratio', then the script identifies a comma separated list
+# and doesn't send data to the search engine.
+comma_ratio = 5
+
+# No checks if the page is a disambiguation page.
skip_disambig = True
appdir = "copyright/"
@@ -323,6 +328,27 @@
f.close()
#
+# Ignore text that contains a comma separated list, only numbers,
+# punctuation...
+
+def economize_query(text):
+ # Comma separated list
+ if text.count(', ') > 4:
+ l = len(text)
+ c = text.count(', ')
+ r = 100 * c / l
+
+ if r >= comma_ratio:
+ return True
+
+ # write_log("%d/%d/%d: %s\n" % (l,c,r,text), "copyright/skip" + str(r) + ".txt")
+
+ # Numbers
+ if re.search('[^0-9\'*/,. +?:;-]{5}', text):
+ return False
+ return True
+
+#
# Set regex used in cleanwikicode() to remove [[Image:]] tags
# and regex used in check_in_source() to reject pages with
# 'Wikipedia'.
@@ -442,6 +468,11 @@
line = cleanwikicode(line)
for search_words in mysplit(line, 31, " "):
if len(search_words) > 120:
+ if config.copyright_economize_query:
+ if economize_query(search_words):
+ wikipedia.output('SKIP TEXT: ' + search_words)
+ consecutive = False
+ continue
n_query += 1
#wikipedia.output(search_words)
if config.copyright_max_query_for_page and n_query > config.copyright_max_query_for_page:
Revision: 4091
Author: cosoleto
Date: 2007-08-22 21:52:43 +0000 (Wed, 22 Aug 2007)
Log Message:
-----------
Patch to disable search engine/sleep/stop/ignore if the search engine refuses your query because you exceed your daily quota.
Modified Paths:
--------------
trunk/pywikipedia/config.py
trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2007-08-22 20:52:26 UTC (rev 4090)
+++ trunk/pywikipedia/config.py 2007-08-22 21:52:43 UTC (rev 4091)
@@ -288,10 +288,10 @@
############## SEARCH ENGINE SETTINGS ##############
-# Some scripts allow querying Google via the Google Web API. To use this feature, you must
-# install the pyGoogle module from http://pygoogle.sf.net/ and have a Google
-# Web API license key. Note that
-# Google doesn't give out license keys anymore.
+# Some scripts allow querying Google via the Google Web API. To use this feature,
+# you must install the pyGoogle module from http://pygoogle.sf.net/ and have a
+# Google Web API license key. Note that Google doesn't give out license keys
+# anymore.
google_key = ''
# Some scripts allow using the Yahoo! Search Web Services. To use this feature,
@@ -326,6 +326,18 @@
# Number of attempts on connection error.
copyright_connection_tries = 10
+# Behavior if an exceeded error occurs.
+#
+# Possibilities:
+#
+# 0 = None
+# 1 = Disable search engine
+# 2 = Sleep (default)
+# 3 = Stop
+
+copyright_exceeded_in_queries = 2
+copyright_exceeded_in_queries_sleep_hours = 6
+
# Append last modified date of URL to script result
copyright_show_date = True
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py 2007-08-22 20:52:26 UTC (rev 4090)
+++ trunk/pywikipedia/copyright.py 2007-08-22 21:52:43 UTC (rev 4091)
@@ -659,6 +659,20 @@
url.append((add_item, engine, comment))
return
+def exceeded_in_queries(engine):
+ """Behavior if an exceeded error occurs."""
+
+ # Disable search engine
+ if config.copyright_exceeded_in_queries == 1:
+ exec('config.copyright_' + engine + ' = False')
+ # Sleeping
+ if config.copyright_exceeded_in_queries == 2:
+ print "Got a queries exceeded error. Sleeping for %d hours..." % (config.copyright_exceeded_in_queries_sleep_hours)
+ time.sleep(config.copyright_exceeded_in_queries_sleep_hours * 60 * 60)
+ # Stop execution
+ if config.copyright_exceeded_in_queries == 3:
+ raise 'Got a queries exceeded error.'
+
def get_results(query, numresults = 10):
url = list()
query = re.sub("[()\"<>]", "", query)
@@ -677,9 +691,13 @@
except KeyboardInterrupt:
raise
except Exception, err:
- #SOAP.faultType: <Fault SOAP-ENV:Server: Exception from service object:
- # Daily limit of 1000 queries exceeded for key xxx>
print "Got an error ->", err
+ #
+ # SOAP.faultType: <Fault SOAP-ENV:Server: Exception from service object:
+ # Daily limit of 1000 queries exceeded for key ***>
+ #
+ if 'Daily limit' in str(err):
+ exceeded_in_queries('google')
if search_request_retry:
search_request_retry -= 1
if config.copyright_yahoo:
@@ -696,6 +714,8 @@
search_request_retry = 0
except Exception, err:
print "Got an error ->", err
+ if 'limit exceeded' in str(err):
+ exceeded_in_queries('yahoo')
if search_request_retry:
search_request_retry -= 1
if config.copyright_msn:
Revision: 4090
Author: cosoleto
Date: 2007-08-22 20:52:26 +0000 (Wed, 22 Aug 2007)
Log Message:
-----------
Added support for Live Search
Modified Paths:
--------------
trunk/pywikipedia/config.py
trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2007-08-22 20:35:48 UTC (rev 4089)
+++ trunk/pywikipedia/config.py 2007-08-22 20:52:26 UTC (rev 4090)
@@ -288,13 +288,10 @@
############## SEARCH ENGINE SETTINGS ##############
-# Some scripts allow querying Google either via the Google Web API, or by
-# just parsing the HTML from the Google website.
-# To use the Google Web API, you must install the pyGoogle module from
-# http://pygoogle.sf.net/ and have a Google Web API license key. Note that
+# Some scripts allow querying Google via the Google Web API. To use this feature, you must
+# install the pyGoogle module from http://pygoogle.sf.net/ and have a Google
+# Web API license key. Note that
# Google doesn't give out license keys anymore.
-# If you don't enter a google license key in your user config file, the scripts
-# will just parse the raw HTML code from the website.
google_key = ''
# Some scripts allow using the Yahoo! Search Web Services. To use this feature,
@@ -302,17 +299,23 @@
# and get a Yahoo AppID from http://developer.yahoo.com
yahoo_appid = ''
+# To use Windows Live Search web service you must get an AppID from
+# http://search.msn.com/developer
+msn_appid = ''
+
############## COPYRIGHT SETTINGS ##############
# Enable/disable search engine in copyright.py script
copyright_google = True
copyright_yahoo = True
+copyright_msn = False
# Perform a deep check, loading URLs to search if 'Wikipedia' is present.
# This may be useful to improve number of correct results. If you haven't
# a fast connection, you might want to keep them disabled.
copyright_check_in_source_google = False
copyright_check_in_source_yahoo = False
+copyright_check_in_source_msn = False
# Limit number of queries for page.
copyright_max_query_for_page = 25
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py 2007-08-22 20:35:48 UTC (rev 4089)
+++ trunk/pywikipedia/copyright.py 2007-08-22 20:52:26 UTC (rev 4090)
@@ -596,7 +596,8 @@
def add_in_urllist(url, add_item, engine):
if (engine == 'google' and config.copyright_check_in_source_google) or \
- (engine == 'yahoo' and config.copyright_check_in_source_yahoo):
+ (engine == 'yahoo' and config.copyright_check_in_source_yahoo) or \
+ (engine == 'msn' and config.copyright_check_in_source_msn):
check_in_source = True
else:
check_in_source = False
@@ -697,32 +698,40 @@
print "Got an error ->", err
if search_request_retry:
search_request_retry -= 1
- #if search_in_msn:
- # ## max_query_len = 150?
- # from __SOAPpy import WSDL
- # print " msn query..."
- # wsdl_url = 'http://soap.search.msn.com/webservices.asmx?wsdl'
- # server = WSDL.Proxy(wsdl_url)
- # params = {'AppID': config.msn_appid, 'Query': '-Wikipedia "' + query + '"', 'CultureInfo': 'en-US', 'SafeSearch': 'Off', 'Requests': {
- # 'SourceRequest':{'Source': 'Web', 'Offset': 0, 'Count': 10, 'ResultFields': 'All',}}}
- #
- # search_request_retry = config.copyright_connection_tries
- # results = ''
- # while search_request_retry:
- # try:
- # server_results = server.Search(Request = params)
- # search_request_retry = 0
- # if server_results.Responses[0].Results:
- # results = server_results.Responses[0].Results[0]
- # except Exception, err:
- # print "Got an error ->", err
- # search_request_retry -= 1
- # for entry in results:
- # try:
- # add_in_urllist(url, entry.Url, 'msn')
- # except AttributeError:
- # print "attrib ERROR"
+ if config.copyright_msn:
+ #max_query_len = 150?
+ from SOAPpy import WSDL
+ print " Live query..."
+ try:
+ server = WSDL.Proxy('http://soap.search.msn.com/webservices.asmx?wsdl')
+ except:
+ print "Live Search Error"
+ raise
+ params = {'AppID': config.msn_appid, 'Query': '-Wikipedia "' + query + '"', 'CultureInfo': 'en-US', 'SafeSearch': 'Off', 'Requests': {
+ 'SourceRequest':{'Source': 'Web', 'Offset': 0, 'Count': 10, 'ResultFields': 'All',}}}
+
+ search_request_retry = config.copyright_connection_tries
+ results = ''
+ while search_request_retry:
+ try:
+ server_results = server.Search(Request = params)
+ search_request_retry = 0
+ if server_results.Responses[0].Results:
+ results = server_results.Responses[0].Results[0]
+ except Exception, err:
+ print "Got an error ->", err
+ if search_request_retry:
+ search_request_retry -= 1
+
+ if results:
+ # list or instance?
+ if type(results) == type([]):
+ for entry in results:
+ add_in_urllist(url, entry.Url, 'msn')
+ else:
+ add_in_urllist(url, results.Url, 'msn')
+
offset = 0
for i in range(len(url)):
if check_list(url[i + offset][0], excl_list, verbose = True):