Revision: 4887 Author: cosoleto Date: 2008-01-14 17:19:26 +0000 (Mon, 14 Jan 2008)
Log Message: ----------- wikipedia.py: Bots eat up a lot of bandwidth, which is not free. Added compression support in site.postData() (yes! used by the GetAll code; until now only the getUrl() function supported gzip encoding...). wikipedia.py: Disabled compression in getUrl when used with the authentication code; it doesn't work for me. wikipedia.py: The code that prints a blacklisted URL can fail due to a recent MediaWiki release, which replaces the ':' char with '&#58;'. Fixed. config.py: "python config.py" no longer prints module and function references, only variables. Fixed an imprecision in the copyright settings text. fixes.py: Added support for user-fixes.py. Commented out the 'correct-ar' fix set because it isn't fully clear what it's supposed to do, or whether it is safe or correct (included for more ancient Arabic text). Since it's now possible to use user-fixes.py, this part of the code might be removed if no objections or explanations are given. generate_user_files.py: A primitive script that creates the user-config.py and user-fixes.py files. catall.py: code cleanup (please pay more attention when adding copyright statements... :-/)
Modified Paths: -------------- trunk/pywikipedia/catall.py trunk/pywikipedia/config.py trunk/pywikipedia/fixes.py trunk/pywikipedia/wikipedia.py
Added Paths: ----------- trunk/pywikipedia/generate_user_files.py
Modified: trunk/pywikipedia/catall.py =================================================================== --- trunk/pywikipedia/catall.py 2008-01-14 14:49:18 UTC (rev 4886) +++ trunk/pywikipedia/catall.py 2008-01-14 17:19:26 UTC (rev 4887) @@ -10,16 +10,19 @@ """ # # (C) Rob W.W. Hooft, Andre Engels, 2004 -# (C) Filnik, 2008 # # Distributed under the terms of the MIT license. -# -__version__ = '$Id: catall.py,v 1.5 2008/01/12 12:49:25 filnik Exp$' # +__version__ = '$Id$' +#
import wikipedia, sys
-msg = { +# This is a purely interactive robot. We set the delays lower. +wikipedia.get_throttle.setDelay(5) +wikipedia.put_throttle.setDelay(10) + +msg={ 'en':u'Bot: Changing categories', 'he':u'Bot: משנה קטגוריות', 'fr':u'Bot: Change categories', @@ -36,36 +39,35 @@ }
def choosecats(pagetext): - chosen = [] - flag = False - length = 1000 - textToPrint = """Give the new categories, one per line. -Empty line: if the first, don't change. Otherwise: Ready. --: I made a mistake, let me start over. -?: Give the text of the page with GUI. -??: Give the text of the page in console. -xx: if the first, remove all categories and add no new. -q: quit.""" - wikipedia.output(textToPrint) + chosen=[] + flag=False + length=1000 + print ("Give the new categories, one per line.") + print ("Empty line: if the first, don't change. Otherwise: Ready.") + print ("-: I made a mistake, let me start over.") + print ("?: Give the text of the page with GUI.") + print ("??: Give the text of the page in console.") + print ("xx: if the first, remove all categories and add no new.") + print ("q: quit.") while flag == False: - choice = wikipedia.input(u"\nSo, what do you want to do?") - if choice == "": - flag = True - elif choice == "-": - chosen = choosecats(pagetext) - flag = True - elif choice == "?": + choice=wikipedia.input(u"?") + if choice=="": + flag=True + elif choice=="-": + chosen=choosecats(pagetext) + flag=True + elif choice=="?": import editarticle editor = editarticle.TextEditor() newtext = editor.edit(pagetext) - elif choice == "??": + elif choice =="??": wikipedia.output(pagetext[0:length]) length = length+500 - elif choice== "xx" and chosen == []: + elif choice=="xx" and chosen==[]: chosen = None - flag = True - elif choice == "q": - wikipedia.output("quit...") + flag=True + elif choice=="q": + print "quit..." 
sys.exit() else: chosen.append(choice) @@ -74,39 +76,37 @@ def make_categories(page, list, site = None): if site is None: site = wikipedia.getSite() - pllist = [] + pllist=[] for p in list: - cattitle = "%s:%s" % (site.category_namespace(), p) + cattitle="%s:%s" % (site.category_namespace(), p) pllist.append(wikipedia.Page(site,cattitle)) page.put(wikipedia.replaceCategoryLinks(page.get(), pllist), comment = wikipedia.translate(site.lang, msg))
-try: - # This is a purely interactive robot. We set the delays lower. - wikipedia.get_throttle.setDelay(5) - wikipedia.put_throttle.setDelay(10) - docorrections=True - start = [] +docorrections=True +start=[]
- for arg in wikipedia.handleArgs(): - if arg == '-onlynew': - docorrections=False - else: - start.append(arg) - - if start == []: - start = 'A' +for arg in wikipedia.handleArgs(): + if arg == '-onlynew': + docorrections=False else: - start = ' '.join(start) + start.append(arg)
- mysite = wikipedia.getSite() +if start == []: + start='A' +else: + start=' '.join(start) + +mysite = wikipedia.getSite() + +try: for p in mysite.allpages(start = start): try: - text = p.get() - cats = p.categories() + text=p.get() + cats=p.categories() if cats == []: wikipedia.output(u"========== %s ==========" % p.title()) - wikipedia.output("No categories") - wikipedia.output("----------------------------------------") + print "No categories" + print "----------------------------------------" newcats=choosecats(text) if newcats != [] and newcats is not None: make_categories(p, newcats, mysite) @@ -115,14 +115,13 @@ wikipedia.output(u"========== %s ==========" % p.title()) for c in cats: wikipedia.output(c.title()) - wikipedia.output("----------------------------------------" + print "----------------------------------------" newcats=choosecats(text) if newcats == None: make_categories(p, [], mysite) elif newcats != []: make_categories(p, newcats, mysite) except wikipedia.IsRedirectPage: - wikipedia.output(u'%s is a redirect, skip...' % p.title()) - continue + wikipedia.output(u'%s is a redirect' % p.title()) finally: wikipedia.stopme()
Modified: trunk/pywikipedia/config.py =================================================================== --- trunk/pywikipedia/config.py 2008-01-14 14:49:18 UTC (rev 4886) +++ trunk/pywikipedia/config.py 2008-01-14 17:19:26 UTC (rev 4887) @@ -357,13 +357,13 @@ # comma separated list or only numbers. But sometimes that might be the # only part unmodified of a slightly edited and not otherwise reported # copyright violation. You can disable this feature to try to increase -# accuracy. +# number of results.
copyright_economize_query = True
############## HTTP SETTINGS ############## -# Use a persistent http connection. An http connection has to be established -# only once per site object, making stuff a whole lot faster. Do NOT EVER +# Use a persistent http connection. An http connection has to be established +# only once per site object, making stuff a whole lot faster. Do NOT EVER # use this if you share Site objects across threads without proper locking. persistent_http = False
@@ -459,7 +459,7 @@ a '/' to the path if you want it to be a directory path.
from holger@trillke.net 2002/03/18 - + """ from os import makedirs from os.path import normpath, dirname, exists, abspath @@ -474,7 +474,7 @@ Argument(s) are zero or more directory names, optionally followed by a data file name. The return path is offset to config.base_dir. Any directories in the path that do not already exist are created. - + """ import os return makepath(os.path.join(base_dir, *filename)) @@ -490,6 +490,7 @@ # When called as main program, list all configuration variables # if __name__=="__main__": + import types _all=1 for _arg in __sys.argv[1:]: if _arg=="modified": @@ -500,8 +501,9 @@ _k.sort() for _name in _k: if _name[0]!='_': - if _all or _glv[_name]!=globals()[_name]: - print _name,"=",repr(globals()[_name]) + if not type(globals()[_name]) in [types.FunctionType, types.ModuleType]: + if _all or _glv[_name]!=globals()[_name]: + print _name,"=",repr(globals()[_name])
# cleanup all locally-defined variables
Modified: trunk/pywikipedia/fixes.py =================================================================== --- trunk/pywikipedia/fixes.py 2008-01-14 14:49:18 UTC (rev 4886) +++ trunk/pywikipedia/fixes.py 2008-01-14 17:19:26 UTC (rev 4887) @@ -390,9 +390,12 @@ } },
- #Corrections for Arabic Wikipedia - #And any Arabic wiki. - #python replace.py -always -start:! -fix:correct-ar + #Corrections for Arabic Wikipedia and any Arabic wiki. + + # It isn't fully clear what it's supposed to do, if is safe or correct (included for more + # ancient arabic text). As now it's possible use user-fixes.py and if no objections or + # explanations is given, this part of code might be removed. + 'correct-ar': { 'regex': False, 'msg': { @@ -491,3 +494,13 @@ ] }, } + +# +# Load the user fixes file. + +import config + +try: + execfile(config.datafilepath(config.base_dir, "user-fixes.py")) +except IOError: + pass
Added: trunk/pywikipedia/generate_user_files.py =================================================================== --- trunk/pywikipedia/generate_user_files.py (rev 0) +++ trunk/pywikipedia/generate_user_files.py 2008-01-14 17:19:26 UTC (rev 4887) @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- +""" Script to create user files (user-config.py, user-fixes.py) """ +__version__ = '$Id$' + +import os, sys, codecs, re + +base_dir = '' +console_encoding = sys.stdout.encoding + +if console_encoding == None or sys.platform == 'cygwin': + console_encoding = "iso-8859-1" + +def listchoice(clist = [], message = None, default = None): + + if not message: + message = "Select" + + if default: + message += " (default: %s)" % default + + message += ": " + + n = 0 + + for i in clist: + n += 1 + print ("%d: %s" % (n, i)) + + while True: + choice = raw_input(message) + + if choice == '' and default: + return default + + try: + return clist[int(choice) - 1] + except: + print "Invalid response" + return response + +def file_exists(filename): + if os.path.exists(filename): + print "'%s' already exists." % filename + return True + return False + +def create_user_config(): + _fnc = os.path.join(base_dir, "user-config.py") + if not file_exists(_fnc): + know_families = re.findall(r'(.+)_family.py\b', '\n'.join(os.listdir(os.path.join(base_dir, "families")))) + fam = listchoice(know_families, "Select family of sites we are working on", default = 'wikipedia') + mylang = raw_input("The language code of the site we're working on (default: 'en'): ") or 'en' + username = raw_input("Username (%s %s): " % (mylang, fam)) or 'UnnamedBot' + username = unicode(username, console_encoding) + + # + # I don't like this solution. Temporary for me. 
+ f = codecs.open("config.py", "r", "utf-8") ; cpy = f.read() ; f.close() + + res = re.findall("^(############## (?:LOGFILE|" + "INTERWIKI|" + "SOLVE_DISAMBIGUATION|" + "IMAGE RELATED|" + "TABLE CONVERSION BOT|" + "WEBLINK CHECKER|" + "DATABASE|" + "SEARCH ENGINE|" + "COPYRIGHT|" + "FURTHER) SETTINGS .*?)^(?=#####|# =====)", cpy, re.MULTILINE | re.DOTALL) + config_text = u'\n'.join(res) + + f = codecs.open(_fnc, "w", "utf-8") + f.write(u"""# -*- coding: utf-8 -*- + +# This is an automatically generated file. You can find more configuration parameters in 'config.py' file. + +# The family of sites we are working on. wikipedia.py will import +# families/xxx_family.py so if you want to change this variable, +# you need to write such a file. +family = '%s' + +# The language code of the site we're working on. +mylang = '%s' + +# The dictionary usernames should contain a username for each site where you +# have a bot account. +usernames['%s']['%s'] = u'%s' + + +%s""" % (fam, mylang, fam, mylang, username, config_text)) + f.close() + print "'%s' written." % _fnc + +def create_user_fixes(): + _fnf = os.path.join(base_dir, "user-fixes.py") + if not file_exists(_fnf): + f = codecs.open(_fnf, "w", "utf-8") + f.write(ur"""# -*- coding: utf-8 -*- + +# +# This is only an example. Don't use it. +# + +fixes['example'] = { + 'regex': True, + 'msg': { + '_default':u'no summary specified', + }, + 'replacements': [ + (ur'\bword\b', u'two words'), + ] +} + +""") + f.close() + print "'%s' written." % _fnf + + +create_user_config() +create_user_fixes() \ No newline at end of file
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2008-01-14 14:49:18 UTC (rev 4886) +++ trunk/pywikipedia/wikipedia.py 2008-01-14 17:19:26 UTC (rev 4887) @@ -1318,6 +1318,7 @@ relevant = data[data.find('<!-- start content -->')+22:data.find('<!-- end content -->')].strip() # Throw away all the other links etc. relevant = re.sub('<.*?>', '', relevant) + relevant = relevant.replace(':', ':') # MediaWiki only spam-checks HTTP links, and only the # domain name part of the URL. m = re.search('http://%5B%5Cw%5C-%5C.%5D+', relevant) @@ -2600,7 +2601,7 @@
""" # TODO: why isn't this a Site method? - pages = list(pages) # if pages is an iterator, we need to + pages = list(pages) # if pages is an iterator, we need to output(u'Getting %d pages from %s...' % (len(pages), site)) _GetAll(site, pages, throttle, force).run()
@@ -3838,15 +3839,15 @@
def postForm(self, address, predata, sysop=False, useCookie=True): """Post http form data to the given address at this site. - + address is the absolute path without hostname. predata is a dict or any iterable that can be converted to a dict, containing keys and values for the http form. - + Return a (response, data) tuple, where response is the HTTP response object and data is a Unicode string containing the body of the response. - + """ data = self.urlEncode(predata) try: @@ -3857,12 +3858,12 @@
def postData(self, address, data, contentType='application/x-www-form-urlencoded', - sysop=False, useCookie=True): + sysop=False, useCookie=True, compress=True): """Post encoded data to the given http address at this site. - + address is the absolute path without hostname. data is an ASCII string that has been URL-encoded. - + Returns a (response, data) tuple where response is the HTTP response object and data is a Unicode string containing the body of the response. @@ -3888,6 +3889,8 @@ conn.putheader('Cookie', self.cookies(sysop = sysop)) if False: #self.persistent_http: conn.putheader('Connection', 'Keep-Alive') + if compress: + conn.putheader('Accept-encoding', 'gzip') conn.endheaders() conn.send(data)
@@ -3901,8 +3904,15 @@ conn.close() conn.connect() return self.postData(address, data, contentType, sysop, useCookie) - data = response.read().decode(self.encoding()) + + data = response.read() + + if compress and response.getheader('Content-Encoding') == 'gzip': + data = decompress_gzip(data) + + data = data.decode(self.encoding()) response.close() + if True: #not self.persistent_http: conn.close() return response, data @@ -3943,7 +3953,7 @@
text = response.read() headers = dict(response.getheaders()) - + else: if self.hostname() in config.authenticate.keys(): uo = authenticateURLopener @@ -3965,7 +3975,7 @@ while not retrieved: try: if self.hostname() in config.authenticate.keys(): - if compress: + if False: # compress: request = urllib2.Request(url, data) request.add_header('Accept-encoding', 'gzip') opener = urllib2.build_opener() @@ -3994,12 +4004,12 @@ else: raise text = f.read() - + headers = f.info() - + contentType = headers.get('content-type', '') contentEncoding = headers.get('content-encoding', '') - + # Ensure that all sent data is received if int(headers.get('content-length', '0')) != len(text) and 'content-length' in headers: output(u'Warning! len(text) does not match content-length: %s != %s' % \ @@ -4009,16 +4019,7 @@ return self.getUrl(path, retry, sysop, data, compress)
if compress and contentEncoding == 'gzip': - # Use cStringIO if available - # TODO: rewrite gzip.py such that it supports unseekable fileobjects. - try: - from cStringIO import StringIO - except ImportError: - from StringIO import StringIO - import gzip - compressedstream = StringIO(text) - gzipper = gzip.GzipFile(fileobj=compressedstream) - text = gzipper.read() + text = decompress_gzip(text)
R = re.compile('charset=([^'";]+)') m = R.search(contentType) @@ -5489,7 +5490,7 @@ text can contain special sequences to create colored output. These consist of the escape character \03 and the color name in curly braces, e. g. \03{lightpurple}. \03{default} resets the color. - + """ output_lock.acquire() try: @@ -5583,7 +5584,7 @@
-dir:PATH Read the bot's configuration data from directory given by PATH, instead of from the default directory. - + -lang:xx Set the language of the wiki you want to work on, overriding the configuration in user-config.py. xx should be the language code. @@ -5593,7 +5594,7 @@ This will override the configuration in user-config.py.
-daemonize:xyz Immediately returns control to the terminal and redirects - stdout and stderr to xyz (only use for bots that require + stdout and stderr to xyz (only use for bots that require no input from stdin).
-help Shows this help text. @@ -5677,7 +5678,7 @@ """Wait for the page-putter to flush its queue.
Called automatically upon exiting from Python. - + """ if page_put_queue.qsize() > 0: import datetime @@ -5725,6 +5726,20 @@ get_throttle = Throttle(config.minthrottle,config.maxthrottle) put_throttle = Throttle(config.put_throttle,config.put_throttle,False)
+def decompress_gzip(data): + # Use cStringIO if available + # TODO: rewrite gzip.py such that it supports unseekable fileobjects. + if data: + try: + from cStringIO import StringIO + except ImportError: + from StringIO import StringIO + import gzip + try: + data = gzip.GzipFile(fileobj = StringIO(data)).read() + except IOError: + raise + return data
class MyURLopener(urllib.FancyURLopener): version="PythonWikipediaBot/1.0"
pywikipedia-l@lists.wikimedia.org