Revision: 4140
Author: cydeweys
Date: 2007-08-30 01:34:57 +0000 (Thu, 30 Aug 2007)
Log Message:
-----------
Committing a quick fix to catch the NoPage exceptions raised when working on a red category page. Red categories can still contain pages, so we should not abort.
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-08-29 22:11:40 UTC (rev 4139)
+++ trunk/pywikipedia/wikipedia.py 2007-08-30 01:34:57 UTC (rev 4140)
@@ -1331,7 +1331,11 @@
The return value is a list of Category objects, one for each of the
category links in the page text.
"""
- return getCategoryLinks(self.get(nofollow_redirects=nofollow_redirects), self.site())
+ try:
+ category_links_to_return = getCategoryLinks(self.get(nofollow_redirects=nofollow_redirects), self.site())
+ except NoPage:
+ category_links_to_return = []
+ return category_links_to_return
def __cmp__(self, other):
"""Pseudo method to be able to use equality and inequality tests on
Revision: 4139
Author: siebrand
Date: 2007-08-29 22:11:40 +0000 (Wed, 29 Aug 2007)
Log Message:
-----------
Also revert changes to the delinker. Works again.
Modified Paths:
--------------
trunk/pywikipedia/delinker.py
trunk/pywikipedia/delinker.txt
Modified: trunk/pywikipedia/delinker.py
===================================================================
--- trunk/pywikipedia/delinker.py 2007-08-29 22:06:09 UTC (rev 4138)
+++ trunk/pywikipedia/delinker.py 2007-08-29 22:11:40 UTC (rev 4139)
@@ -396,7 +396,7 @@
http_callback = wait_callback, no_db = True)
- def check_usage(self, image, timestamp, admin, reason, replacement, namespace = None):
+ def check_usage(self, image, timestamp, admin, reason, replacement):
""" Check whether this image needs to be delinked. """
# Check whether the image still is deleted on Commons.
@@ -420,7 +420,7 @@
if self.CommonsDelinker.config['global']:
- usage = self.CheckUsage.get_usage(image, namespace = namespace)
+ usage = self.CheckUsage.get_usage(image)
usage_domains = {}
count = 0
@@ -434,15 +434,12 @@
#FIX!
usage_domains = {(self.site.lang, self.site.family.name):
list(self.CheckUsage.get_usage_live(self.site,
- image, namespace = namespace))}
+ image))}
count = len(usage_domains[(self.site.lang, self.site.family.name)])
output(u'%s %s used on %s pages' % (self, image, count))
if count:
- if count > self.CommonsDelinker.config.get('template_threshold', sys.maxint):
- output('%s Only delinking %s from template namespace' % (self, image))
- return check_usage(image, timestamp, admin, reason, replacement, 10)
# Pass the usage to the Delinker pool along with other arguments
self.CommonsDelinker.Delinkers.append((image, usage_domains,
timestamp, admin, reason, replacement))
Modified: trunk/pywikipedia/delinker.txt
===================================================================
--- trunk/pywikipedia/delinker.txt 2007-08-29 22:06:09 UTC (rev 4138)
+++ trunk/pywikipedia/delinker.txt 2007-08-29 22:11:40 UTC (rev 4139)
@@ -95,8 +95,6 @@
* ''exclude_string = "no-delink"'': If this string is included in the deletion
summary, the file is not delinked.
* ''summary_cache = 3600'': Time before on-wiki settings are updated.
-* ''template_threshold = 1000'': If more an image is used more often than this
- value, only delink from the template namespace.
=== Replacer settings ===
Those variables only need to be set if the replacer is enabled.
Revision: 4137
Author: btongminh
Date: 2007-08-29 20:01:51 +0000 (Wed, 29 Aug 2007)
Log Message:
-----------
New config setting: template_threshold.
Modified Paths:
--------------
trunk/pywikipedia/delinker.py
trunk/pywikipedia/delinker.txt
Modified: trunk/pywikipedia/delinker.py
===================================================================
--- trunk/pywikipedia/delinker.py 2007-08-29 19:56:26 UTC (rev 4136)
+++ trunk/pywikipedia/delinker.py 2007-08-29 20:01:51 UTC (rev 4137)
@@ -396,7 +396,7 @@
http_callback = wait_callback, no_db = True)
- def check_usage(self, image, timestamp, admin, reason, replacement):
+ def check_usage(self, image, timestamp, admin, reason, replacement, namespace = None):
""" Check whether this image needs to be delinked. """
# Check whether the image still is deleted on Commons.
@@ -420,7 +420,7 @@
if self.CommonsDelinker.config['global']:
- usage = self.CheckUsage.get_usage(image)
+ usage = self.CheckUsage.get_usage(image, namespace = namespace)
usage_domains = {}
count = 0
@@ -434,12 +434,15 @@
#FIX!
usage_domains = {(self.site.lang, self.site.family.name):
list(self.CheckUsage.get_usage_live(self.site,
- image))}
+ image, namespace = namespace))}
count = len(usage_domains[(self.site.lang, self.site.family.name)])
output(u'%s %s used on %s pages' % (self, image, count))
if count:
+ if count > self.CommonsDelinker.config.get('template_threshold', sys.maxint):
+ output('%s Only delinking %s from template namespace' % (self, image))
+ return check_usage(image, timestamp, admin, reason, replacement, 10)
# Pass the usage to the Delinker pool along with other arguments
self.CommonsDelinker.Delinkers.append((image, usage_domains,
timestamp, admin, reason, replacement))
Modified: trunk/pywikipedia/delinker.txt
===================================================================
--- trunk/pywikipedia/delinker.txt 2007-08-29 19:56:26 UTC (rev 4136)
+++ trunk/pywikipedia/delinker.txt 2007-08-29 20:01:51 UTC (rev 4137)
@@ -95,6 +95,8 @@
* ''exclude_string = "no-delink"'': If this string is included in the deletion
summary, the file is not delinked.
* ''summary_cache = 3600'': Time before on-wiki settings are updated.
+* ''template_threshold = 1000'': If more an image is used more often than this
+ value, only delink from the template namespace.
=== Replacer settings ===
Those variables only need to be set if the replacer is enabled.
Revision: 4135
Author: btongminh
Date: 2007-08-29 19:46:50 +0000 (Wed, 29 Aug 2007)
Log Message:
-----------
Add persistent_http as a config setting.
Modified Paths:
--------------
trunk/pywikipedia/config.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2007-08-29 19:43:26 UTC (rev 4134)
+++ trunk/pywikipedia/config.py 2007-08-29 19:46:50 UTC (rev 4135)
@@ -359,6 +359,12 @@
copyright_economize_query = True
+############## HTTP SETTINGS ##############
+# Use a persistent http connection. An http connection has to be established
+# only once per site object, making stuff a whole lot faster. Do NOT EVER
+# use this if you share Site objects across threads without proper locking.
+persistent_http = False
+
############## FURTHER SETTINGS ##############
# The bot can make some additional changes to each page it edits, e.g. fix
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-08-29 19:43:26 UTC (rev 4134)
+++ trunk/pywikipedia/wikipedia.py 2007-08-29 19:46:50 UTC (rev 4135)
@@ -3131,6 +3131,7 @@
if not language[0].upper() + language[1:] in self.namespaces():
self._validlanguages.append(language)
+ if persistent_http is None: persistent_http = config.persistent_http
self.persistent_http = persistent_http and self.protocol() in ('http', 'https')
if persistent_http:
if self.protocol() == 'http':
Revision: 4134
Author: btongminh
Date: 2007-08-29 19:43:26 +0000 (Wed, 29 Aug 2007)
Log Message:
-----------
wikipedia.getSite now incorporates all of its arguments into its cache keys; Add support for persistent connections. An http connection has to be established only once, making stuff a whole lot faster. Do NOT EVER use this if you share Site objects across threads without proper locking. In fact you should never ever share objects between threads without locking.
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-08-29 18:00:33 UTC (rev 4133)
+++ trunk/pywikipedia/wikipedia.py 2007-08-29 19:43:26 UTC (rev 4134)
@@ -3090,13 +3090,18 @@
return myfamily.Family()
class Site(object):
- def __init__(self, code, fam=None, user=None):
- """Constructor takes three arguments:
+ def __init__(self, code, fam=None, user=None, persistent_http = None):
+ """Constructor takes four arguments:
code language code for Site
fam Wikimedia family (optional: defaults to configured).
Can either be a string or a Family object.
- user User to use (optional: defaults to configured)"""
+ user User to use (optional: defaults to configured)
+ persistent_http Use a persistent http connection. An http connection
+ has to be established only once, making stuff a whole lot
+ faster. Do NOT EVER use this if you share Site objects
+ across threads without proper locking.
+ """
self.lang = code.lower()
if isinstance(fam, basestring) or fam is None:
@@ -3125,6 +3130,15 @@
for language in self.languages():
if not language[0].upper() + language[1:] in self.namespaces():
self._validlanguages.append(language)
+
+ self.persistent_http = persistent_http and self.protocol() in ('http', 'https')
+ if persistent_http:
+ if self.protocol() == 'http':
+ self.conn = httplib.HTTPConnection(self.hostname())
+ elif self.protocol() == 'https':
+ self.conn = httplib.HTTPSConnection(self.hostname())
+
+
self.sandboxpage = Page(self,self.family.sandboxpage(code))
def urlEncode(self, query):
@@ -3170,28 +3184,41 @@
# TODO: add the authenticate stuff here
- # Encode all of this into a HTTP request
- if self.protocol() == 'http':
- conn = httplib.HTTPConnection(self.hostname())
- elif self.protocol() == 'https':
- conn = httplib.HTTPSConnection(self.hostname())
- # otherwise, it will crash, as other protocols are not supported
-
+ if self.persistent_http:
+ conn = self.conn
+ else:
+ # Encode all of this into a HTTP request
+ if self.protocol() == 'http':
+ conn = httplib.HTTPConnection(self.hostname())
+ elif self.protocol() == 'https':
+ conn = httplib.HTTPSConnection(self.hostname())
+ # otherwise, it will crash, as other protocols are not supported
+
conn.putrequest('POST', address)
conn.putheader('Content-Length', str(len(data)))
conn.putheader('Content-type', contentType)
conn.putheader('User-agent', useragent)
if useCookie and self.cookies(sysop = sysop):
conn.putheader('Cookie', self.cookies(sysop = sysop))
+ if self.persistent_http:
+ conn.putheader('Connection', 'Keep-Alive')
conn.endheaders()
conn.send(data)
# Prepare the return values
# Note that this can raise network exceptions which are not
# caught here.
- response = conn.getresponse()
+ try:
+ response = conn.getresponse()
+ except httplib.BadStatusLine:
+ # Blub.
+ conn.close()
+ conn.connect()
+ return self.postData(address, data, contentType, sysop, useCookie)
data = response.read().decode(self.encoding())
- conn.close()
+ response.close()
+ if not self.persistent_http:
+ conn.close()
return response, data
def forceLogin(self, sysop = False):
@@ -3278,59 +3305,92 @@
Returns the HTML text of the page converted to unicode.
"""
- if self.hostname() in config.authenticate.keys():
- uo = authenticateURLopener
- else:
- uo = MyURLopener()
- if self.cookies(sysop = sysop):
- uo.addheader('Cookie', self.cookies(sysop = sysop))
+ if self.persistent_http and not data:
+ self.conn.putrequest('GET', path)
+ self.conn.putheader('User-agent', useragent)
+ self.conn.putheader('Cookie', self.cookies(sysop = sysop))
+ self.conn.putheader('Connection', 'Keep-Alive')
if compress:
- uo.addheader('Accept-encoding', 'gzip')
-
- url = '%s://%s%s' % (self.protocol(), self.hostname(), path)
- data = self.urlEncode(data)
-
- # Try to retrieve the page until it was successfully loaded (just in
- # case the server is down or overloaded).
- # Wait for retry_idle_time minutes (growing!) between retries.
- retry_idle_time = 1
- retrieved = False
- while not retrieved:
+ self.conn.putheader('Accept-encoding', 'gzip')
+ self.conn.endheaders()
+
+ # Prepare the return values
+ # Note that this can raise network exceptions which are not
+ # caught here.
try:
- if self.hostname() in config.authenticate.keys():
- if compress:
- request = urllib2.Request(url, data)
- request.add_header('Accept-encoding', 'gzip')
- opener = urllib2.build_opener()
- f = opener.open(request)
+ response = self.conn.getresponse()
+ except httplib.BadStatusLine:
+ # Blub.
+ self.conn.close()
+ self.conn.connect()
+ return self.getUrl(path, retry, sysop, data, compress)
+
+ text = response.read()
+ contentType = response.getheader('Content-Type')
+ contentEncoding = response.getheader('Content-Encoding')
+ else:
+ if self.hostname() in config.authenticate.keys():
+ uo = authenticateURLopener
+ else:
+ uo = MyURLopener()
+ if self.cookies(sysop = sysop):
+ uo.addheader('Cookie', self.cookies(sysop = sysop))
+ if compress:
+ uo.addheader('Accept-encoding', 'gzip')
+
+ url = '%s://%s%s' % (self.protocol(), self.hostname(), path)
+ data = self.urlEncode(data)
+
+ # Try to retrieve the page until it was successfully loaded (just in
+ # case the server is down or overloaded).
+ # Wait for retry_idle_time minutes (growing!) between retries.
+ retry_idle_time = 1
+ retrieved = False
+ while not retrieved:
+ try:
+ if self.hostname() in config.authenticate.keys():
+ if compress:
+ request = urllib2.Request(url, data)
+ request.add_header('Accept-encoding', 'gzip')
+ opener = urllib2.build_opener()
+ f = opener.open(request)
+ else:
+ f = urllib2.urlopen(url, data)
else:
- f = urllib2.urlopen(url, data)
- else:
- f = uo.open(url, data)
- retrieved = True
- except KeyboardInterrupt:
- raise
- except Exception, e:
- if retry:
- # We assume that the server is down. Wait some time, then try again.
- output(u"%s" % e)
- output(u"WARNING: Could not open '%s://%s%s'. Maybe the server or your connection is down. Retrying in %i minutes..." % (self.protocol(), self.hostname(), path, retry_idle_time))
- time.sleep(retry_idle_time * 60)
- # Next time wait longer, but not longer than half an hour
- retry_idle_time *= 2
- if retry_idle_time > 30:
- retry_idle_time = 30
- else:
+ f = uo.open(url, data)
+ retrieved = True
+ except KeyboardInterrupt:
raise
- text = f.read()
- if compress and f.headers.get('Content-Encoding') == 'gzip':
- import StringIO, gzip
- compressedstream = StringIO.StringIO(text)
+ except Exception, e:
+ if retry:
+ # We assume that the server is down. Wait some time, then try again.
+ output(u"%s" % e)
+ output(u"WARNING: Could not open '%s://%s%s'. Maybe the server or your connection is down. Retrying in %i minutes..." % (self.protocol(), self.hostname(), path, retry_idle_time))
+ time.sleep(retry_idle_time * 60)
+ # Next time wait longer, but not longer than half an hour
+ retry_idle_time *= 2
+ if retry_idle_time > 30:
+ retry_idle_time = 30
+ else:
+ raise
+ text = f.read()
+
+ # Find charset in the content-type meta tag
+ contentType = f.info()['Content-Type']
+ contentEncoding = f.headers.get('Content-Encoding')
+
+ if compress and contentEncoding == 'gzip':
+ # Use cStringIO if available
+ # TODO: rewrite gzip.py such that it supports unseekable fileobjects.
+ try:
+ from cStringIO import StringIO
+ except ImportError:
+ from StringIO import StringIO
+ import gzip
+ compressedstream = StringIO(text)
gzipper = gzip.GzipFile(fileobj=compressedstream)
text = gzipper.read()
-
- # Find charset in the content-type meta tag
- contentType = f.info()['Content-Type']
+
R = re.compile('charset=([^\'\";]+)')
m = R.search(contentType)
if m:
@@ -4158,14 +4218,14 @@
_sites = {}
_namespaceCache = {}
-def getSite(code = None, fam = None, user=None):
+def getSite(code = None, fam = None, user=None, persistent_http=None):
if code == None:
code = default_code
if fam == None:
fam = default_family
- key = '%s:%s'%(fam,code)
+ key = '%s:%s:%s:%s'%(fam,code,user,persistent_http)
if not _sites.has_key(key):
- _sites[key] = Site(code=code, fam=fam, user=user)
+ _sites[key] = Site(code=code, fam=fam, user=user, persistent_http=persistent_http)
return _sites[key]
def setSite(site):
Revision: 4131
Author: btongminh
Date: 2007-08-29 17:20:31 +0000 (Wed, 29 Aug 2007)
Log Message:
-----------
Only start the put queue if there is actually work to do.
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-08-29 15:14:32 UTC (rev 4130)
+++ trunk/pywikipedia/wikipedia.py 2007-08-29 17:20:31 UTC (rev 4131)
@@ -1030,6 +1030,14 @@
"""Asynchronous version of put (takes the same arguments), which
places pages on a queue to be saved by a daemon thread.
"""
+ try:
+ page_put_queue.mutex.acquire()
+ try:
+ _putthread.start()
+ except AssertionError:
+ pass
+ finally:
+ page_put_queue.mutex.release()
page_put_queue.put((self, newtext, comment, watchArticle, minorEdit, force))
def put(self, newtext, comment=None, watchArticle = None, minorEdit = True, force=False):
@@ -4678,7 +4686,8 @@
# identification for debugging purposes
_putthread.setName('Put-Thread')
_putthread.setDaemon(True)
-_putthread.start()
+## Don't start the queue if it is not necessary.
+#_putthread.start()
def stopme():
"""This should be run when a bot does not interact with the Wiki, or