[Pywikipedia-l] SVN: [4887] trunk/pywikipedia
cosoleto at svn.wikimedia.org
cosoleto at svn.wikimedia.org
Mon Jan 14 17:19:26 UTC 2008
Revision: 4887
Author: cosoleto
Date: 2008-01-14 17:19:26 +0000 (Mon, 14 Jan 2008)
Log Message:
-----------
wikipedia.py: Bots eat up a lot of bandwidth, which is not free. Added compression support in site.postData() (yes! used by GetAll code, until now only getUrl() function was supporting gzip encoding...).
wikipedia.py: Disabled compression in getUrl() when used with the authentication stuff; it doesn't work for me.
wikipedia.py: The code that prints a blacklisted URL can fail with a recent MediaWiki release, which replaces the ':' character with '&#58;'. Fixed.
config.py: "python config.py" no longer prints module and function references, only variables. Fixed an imprecision in the copyright settings text.
fixes.py: Added support for user-fixes.py. Commented out the 'correct-ar' fix set because it isn't fully clear what it's supposed to do, or whether it is safe or correct (included for more ancient Arabic text). Since it's now possible to use user-fixes.py, if no objections or explanations are given, this part of the code might be removed.
generate_user_files.py: A primitive script that creates the user-config.py and user-fixes.py files.
catall.py: code cleanup (please pay more attention when adding copyright statements... :-/)
Modified Paths:
--------------
trunk/pywikipedia/catall.py
trunk/pywikipedia/config.py
trunk/pywikipedia/fixes.py
trunk/pywikipedia/wikipedia.py
Added Paths:
-----------
trunk/pywikipedia/generate_user_files.py
Modified: trunk/pywikipedia/catall.py
===================================================================
--- trunk/pywikipedia/catall.py 2008-01-14 14:49:18 UTC (rev 4886)
+++ trunk/pywikipedia/catall.py 2008-01-14 17:19:26 UTC (rev 4887)
@@ -10,16 +10,19 @@
"""
#
# (C) Rob W.W. Hooft, Andre Engels, 2004
-# (C) Filnik, 2008
#
# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id: catall.py,v 1.5 2008/01/12 12:49:25 filnik Exp$'
#
+__version__ = '$Id$'
+#
import wikipedia, sys
-msg = {
+# This is a purely interactive robot. We set the delays lower.
+wikipedia.get_throttle.setDelay(5)
+wikipedia.put_throttle.setDelay(10)
+
+msg={
'en':u'Bot: Changing categories',
'he':u'Bot: משנה קטגוריות',
'fr':u'Bot: Change categories',
@@ -36,36 +39,35 @@
}
def choosecats(pagetext):
- chosen = []
- flag = False
- length = 1000
- textToPrint = """Give the new categories, one per line.
-Empty line: if the first, don't change. Otherwise: Ready.
--: I made a mistake, let me start over.
-?: Give the text of the page with GUI.
-??: Give the text of the page in console.
-xx: if the first, remove all categories and add no new.
-q: quit."""
- wikipedia.output(textToPrint)
+ chosen=[]
+ flag=False
+ length=1000
+ print ("Give the new categories, one per line.")
+ print ("Empty line: if the first, don't change. Otherwise: Ready.")
+ print ("-: I made a mistake, let me start over.")
+ print ("?: Give the text of the page with GUI.")
+ print ("??: Give the text of the page in console.")
+ print ("xx: if the first, remove all categories and add no new.")
+ print ("q: quit.")
while flag == False:
- choice = wikipedia.input(u"\nSo, what do you want to do?")
- if choice == "":
- flag = True
- elif choice == "-":
- chosen = choosecats(pagetext)
- flag = True
- elif choice == "?":
+ choice=wikipedia.input(u"?")
+ if choice=="":
+ flag=True
+ elif choice=="-":
+ chosen=choosecats(pagetext)
+ flag=True
+ elif choice=="?":
import editarticle
editor = editarticle.TextEditor()
newtext = editor.edit(pagetext)
- elif choice == "??":
+ elif choice =="??":
wikipedia.output(pagetext[0:length])
length = length+500
- elif choice== "xx" and chosen == []:
+ elif choice=="xx" and chosen==[]:
chosen = None
- flag = True
- elif choice == "q":
- wikipedia.output("quit...")
+ flag=True
+ elif choice=="q":
+ print "quit..."
sys.exit()
else:
chosen.append(choice)
@@ -74,39 +76,37 @@
def make_categories(page, list, site = None):
if site is None:
site = wikipedia.getSite()
- pllist = []
+ pllist=[]
for p in list:
- cattitle = "%s:%s" % (site.category_namespace(), p)
+ cattitle="%s:%s" % (site.category_namespace(), p)
pllist.append(wikipedia.Page(site,cattitle))
page.put(wikipedia.replaceCategoryLinks(page.get(), pllist), comment = wikipedia.translate(site.lang, msg))
-try:
- # This is a purely interactive robot. We set the delays lower.
- wikipedia.get_throttle.setDelay(5)
- wikipedia.put_throttle.setDelay(10)
- docorrections=True
- start = []
+docorrections=True
+start=[]
- for arg in wikipedia.handleArgs():
- if arg == '-onlynew':
- docorrections=False
- else:
- start.append(arg)
-
- if start == []:
- start = 'A'
+for arg in wikipedia.handleArgs():
+ if arg == '-onlynew':
+ docorrections=False
else:
- start = ' '.join(start)
+ start.append(arg)
- mysite = wikipedia.getSite()
+if start == []:
+ start='A'
+else:
+ start=' '.join(start)
+
+mysite = wikipedia.getSite()
+
+try:
for p in mysite.allpages(start = start):
try:
- text = p.get()
- cats = p.categories()
+ text=p.get()
+ cats=p.categories()
if cats == []:
wikipedia.output(u"========== %s ==========" % p.title())
- wikipedia.output("No categories")
- wikipedia.output("----------------------------------------")
+ print "No categories"
+ print "----------------------------------------"
newcats=choosecats(text)
if newcats != [] and newcats is not None:
make_categories(p, newcats, mysite)
@@ -115,14 +115,13 @@
wikipedia.output(u"========== %s ==========" % p.title())
for c in cats:
wikipedia.output(c.title())
- wikipedia.output("----------------------------------------"
+ print "----------------------------------------"
newcats=choosecats(text)
if newcats == None:
make_categories(p, [], mysite)
elif newcats != []:
make_categories(p, newcats, mysite)
except wikipedia.IsRedirectPage:
- wikipedia.output(u'%s is a redirect, skip...' % p.title())
- continue
+ wikipedia.output(u'%s is a redirect' % p.title())
finally:
wikipedia.stopme()
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2008-01-14 14:49:18 UTC (rev 4886)
+++ trunk/pywikipedia/config.py 2008-01-14 17:19:26 UTC (rev 4887)
@@ -357,13 +357,13 @@
# comma separated list or only numbers. But sometimes that might be the
# only part unmodified of a slightly edited and not otherwise reported
# copyright violation. You can disable this feature to try to increase
-# accuracy.
+# number of results.
copyright_economize_query = True
############## HTTP SETTINGS ##############
-# Use a persistent http connection. An http connection has to be established
-# only once per site object, making stuff a whole lot faster. Do NOT EVER
+# Use a persistent http connection. An http connection has to be established
+# only once per site object, making stuff a whole lot faster. Do NOT EVER
# use this if you share Site objects across threads without proper locking.
persistent_http = False
@@ -459,7 +459,7 @@
a '/' to the path if you want it to be a directory path.
from holger at trillke.net 2002/03/18
-
+
"""
from os import makedirs
from os.path import normpath, dirname, exists, abspath
@@ -474,7 +474,7 @@
Argument(s) are zero or more directory names, optionally followed by a
data file name. The return path is offset to config.base_dir. Any
directories in the path that do not already exist are created.
-
+
"""
import os
return makepath(os.path.join(base_dir, *filename))
@@ -490,6 +490,7 @@
# When called as main program, list all configuration variables
#
if __name__=="__main__":
+ import types
_all=1
for _arg in __sys.argv[1:]:
if _arg=="modified":
@@ -500,8 +501,9 @@
_k.sort()
for _name in _k:
if _name[0]!='_':
- if _all or _glv[_name]!=globals()[_name]:
- print _name,"=",repr(globals()[_name])
+ if not type(globals()[_name]) in [types.FunctionType, types.ModuleType]:
+ if _all or _glv[_name]!=globals()[_name]:
+ print _name,"=",repr(globals()[_name])
# cleanup all locally-defined variables
Modified: trunk/pywikipedia/fixes.py
===================================================================
--- trunk/pywikipedia/fixes.py 2008-01-14 14:49:18 UTC (rev 4886)
+++ trunk/pywikipedia/fixes.py 2008-01-14 17:19:26 UTC (rev 4887)
@@ -390,9 +390,12 @@
}
},
- #Corrections for Arabic Wikipedia
- #And any Arabic wiki.
- #python replace.py -always -start:! -fix:correct-ar
+ #Corrections for Arabic Wikipedia and any Arabic wiki.
+
+ # It isn't fully clear what it's supposed to do, if is safe or correct (included for more
+ # ancient arabic text). As now it's possible use user-fixes.py and if no objections or
+ # explanations is given, this part of code might be removed.
+
'correct-ar': {
'regex': False,
'msg': {
@@ -491,3 +494,13 @@
]
},
}
+
+#
+# Load the user fixes file.
+
+import config
+
+try:
+ execfile(config.datafilepath(config.base_dir, "user-fixes.py"))
+except IOError:
+ pass
Added: trunk/pywikipedia/generate_user_files.py
===================================================================
--- trunk/pywikipedia/generate_user_files.py (rev 0)
+++ trunk/pywikipedia/generate_user_files.py 2008-01-14 17:19:26 UTC (rev 4887)
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+""" Script to create user files (user-config.py, user-fixes.py) """
+__version__ = '$Id$'
+
+import os, sys, codecs, re
+
+base_dir = ''
+console_encoding = sys.stdout.encoding
+
+if console_encoding == None or sys.platform == 'cygwin':
+ console_encoding = "iso-8859-1"
+
+def listchoice(clist = [], message = None, default = None):
+
+ if not message:
+ message = "Select"
+
+ if default:
+ message += " (default: %s)" % default
+
+ message += ": "
+
+ n = 0
+
+ for i in clist:
+ n += 1
+ print ("%d: %s" % (n, i))
+
+ while True:
+ choice = raw_input(message)
+
+ if choice == '' and default:
+ return default
+
+ try:
+ return clist[int(choice) - 1]
+ except:
+ print "Invalid response"
+ return response
+
+def file_exists(filename):
+ if os.path.exists(filename):
+ print "'%s' already exists." % filename
+ return True
+ return False
+
+def create_user_config():
+ _fnc = os.path.join(base_dir, "user-config.py")
+ if not file_exists(_fnc):
+ know_families = re.findall(r'(.+)_family.py\b', '\n'.join(os.listdir(os.path.join(base_dir, "families"))))
+ fam = listchoice(know_families, "Select family of sites we are working on", default = 'wikipedia')
+ mylang = raw_input("The language code of the site we're working on (default: 'en'): ") or 'en'
+ username = raw_input("Username (%s %s): " % (mylang, fam)) or 'UnnamedBot'
+ username = unicode(username, console_encoding)
+
+ #
+ # I don't like this solution. Temporary for me.
+ f = codecs.open("config.py", "r", "utf-8") ; cpy = f.read() ; f.close()
+
+ res = re.findall("^(############## (?:LOGFILE|"
+ "INTERWIKI|"
+ "SOLVE_DISAMBIGUATION|"
+ "IMAGE RELATED|"
+ "TABLE CONVERSION BOT|"
+ "WEBLINK CHECKER|"
+ "DATABASE|"
+ "SEARCH ENGINE|"
+ "COPYRIGHT|"
+ "FURTHER) SETTINGS .*?)^(?=#####|# =====)", cpy, re.MULTILINE | re.DOTALL)
+ config_text = u'\n'.join(res)
+
+ f = codecs.open(_fnc, "w", "utf-8")
+ f.write(u"""# -*- coding: utf-8 -*-
+
+# This is an automatically generated file. You can find more configuration parameters in 'config.py' file.
+
+# The family of sites we are working on. wikipedia.py will import
+# families/xxx_family.py so if you want to change this variable,
+# you need to write such a file.
+family = '%s'
+
+# The language code of the site we're working on.
+mylang = '%s'
+
+# The dictionary usernames should contain a username for each site where you
+# have a bot account.
+usernames['%s']['%s'] = u'%s'
+
+
+%s""" % (fam, mylang, fam, mylang, username, config_text))
+ f.close()
+ print "'%s' written." % _fnc
+
+def create_user_fixes():
+ _fnf = os.path.join(base_dir, "user-fixes.py")
+ if not file_exists(_fnf):
+ f = codecs.open(_fnf, "w", "utf-8")
+ f.write(ur"""# -*- coding: utf-8 -*-
+
+#
+# This is only an example. Don't use it.
+#
+
+fixes['example'] = {
+ 'regex': True,
+ 'msg': {
+ '_default':u'no summary specified',
+ },
+ 'replacements': [
+ (ur'\bword\b', u'two words'),
+ ]
+}
+
+""")
+ f.close()
+ print "'%s' written." % _fnf
+
+
+create_user_config()
+create_user_fixes()
\ No newline at end of file
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2008-01-14 14:49:18 UTC (rev 4886)
+++ trunk/pywikipedia/wikipedia.py 2008-01-14 17:19:26 UTC (rev 4887)
@@ -1318,6 +1318,7 @@
relevant = data[data.find('<!-- start content -->')+22:data.find('<!-- end content -->')].strip()
# Throw away all the other links etc.
relevant = re.sub('<.*?>', '', relevant)
+ relevant = relevant.replace(':', ':')
# MediaWiki only spam-checks HTTP links, and only the
# domain name part of the URL.
m = re.search('http://[\w\-\.]+', relevant)
@@ -2600,7 +2601,7 @@
"""
# TODO: why isn't this a Site method?
- pages = list(pages) # if pages is an iterator, we need to
+ pages = list(pages) # if pages is an iterator, we need to
output(u'Getting %d pages from %s...' % (len(pages), site))
_GetAll(site, pages, throttle, force).run()
@@ -3838,15 +3839,15 @@
def postForm(self, address, predata, sysop=False, useCookie=True):
"""Post http form data to the given address at this site.
-
+
address is the absolute path without hostname.
predata is a dict or any iterable that can be converted to a dict,
containing keys and values for the http form.
-
+
Return a (response, data) tuple, where response is the HTTP
response object and data is a Unicode string containing the
body of the response.
-
+
"""
data = self.urlEncode(predata)
try:
@@ -3857,12 +3858,12 @@
def postData(self, address, data,
contentType='application/x-www-form-urlencoded',
- sysop=False, useCookie=True):
+ sysop=False, useCookie=True, compress=True):
"""Post encoded data to the given http address at this site.
-
+
address is the absolute path without hostname.
data is an ASCII string that has been URL-encoded.
-
+
Returns a (response, data) tuple where response is the HTTP
response object and data is a Unicode string containing the
body of the response.
@@ -3888,6 +3889,8 @@
conn.putheader('Cookie', self.cookies(sysop = sysop))
if False: #self.persistent_http:
conn.putheader('Connection', 'Keep-Alive')
+ if compress:
+ conn.putheader('Accept-encoding', 'gzip')
conn.endheaders()
conn.send(data)
@@ -3901,8 +3904,15 @@
conn.close()
conn.connect()
return self.postData(address, data, contentType, sysop, useCookie)
- data = response.read().decode(self.encoding())
+
+ data = response.read()
+
+ if compress and response.getheader('Content-Encoding') == 'gzip':
+ data = decompress_gzip(data)
+
+ data = data.decode(self.encoding())
response.close()
+
if True: #not self.persistent_http:
conn.close()
return response, data
@@ -3943,7 +3953,7 @@
text = response.read()
headers = dict(response.getheaders())
-
+
else:
if self.hostname() in config.authenticate.keys():
uo = authenticateURLopener
@@ -3965,7 +3975,7 @@
while not retrieved:
try:
if self.hostname() in config.authenticate.keys():
- if compress:
+ if False: # compress:
request = urllib2.Request(url, data)
request.add_header('Accept-encoding', 'gzip')
opener = urllib2.build_opener()
@@ -3994,12 +4004,12 @@
else:
raise
text = f.read()
-
+
headers = f.info()
-
+
contentType = headers.get('content-type', '')
contentEncoding = headers.get('content-encoding', '')
-
+
# Ensure that all sent data is received
if int(headers.get('content-length', '0')) != len(text) and 'content-length' in headers:
output(u'Warning! len(text) does not match content-length: %s != %s' % \
@@ -4009,16 +4019,7 @@
return self.getUrl(path, retry, sysop, data, compress)
if compress and contentEncoding == 'gzip':
- # Use cStringIO if available
- # TODO: rewrite gzip.py such that it supports unseekable fileobjects.
- try:
- from cStringIO import StringIO
- except ImportError:
- from StringIO import StringIO
- import gzip
- compressedstream = StringIO(text)
- gzipper = gzip.GzipFile(fileobj=compressedstream)
- text = gzipper.read()
+ text = decompress_gzip(text)
R = re.compile('charset=([^\'\";]+)')
m = R.search(contentType)
@@ -5489,7 +5490,7 @@
text can contain special sequences to create colored output. These
consist of the escape character \03 and the color name in curly braces,
e. g. \03{lightpurple}. \03{default} resets the color.
-
+
"""
output_lock.acquire()
try:
@@ -5583,7 +5584,7 @@
-dir:PATH Read the bot's configuration data from directory given by
PATH, instead of from the default directory.
-
+
-lang:xx Set the language of the wiki you want to work on, overriding
the configuration in user-config.py. xx should be the
language code.
@@ -5593,7 +5594,7 @@
This will override the configuration in user-config.py.
-daemonize:xyz Immediately returns control to the terminal and redirects
- stdout and stderr to xyz (only use for bots that require
+ stdout and stderr to xyz (only use for bots that require
no input from stdin).
-help Shows this help text.
@@ -5677,7 +5678,7 @@
"""Wait for the page-putter to flush its queue.
Called automatically upon exiting from Python.
-
+
"""
if page_put_queue.qsize() > 0:
import datetime
@@ -5725,6 +5726,20 @@
get_throttle = Throttle(config.minthrottle,config.maxthrottle)
put_throttle = Throttle(config.put_throttle,config.put_throttle,False)
+def decompress_gzip(data):
+ # Use cStringIO if available
+ # TODO: rewrite gzip.py such that it supports unseekable fileobjects.
+ if data:
+ try:
+ from cStringIO import StringIO
+ except ImportError:
+ from StringIO import StringIO
+ import gzip
+ try:
+ data = gzip.GzipFile(fileobj = StringIO(data)).read()
+ except IOError:
+ raise
+ return data
class MyURLopener(urllib.FancyURLopener):
version="PythonWikipediaBot/1.0"
More information about the Pywikipedia-l
mailing list