Revision: 4887 Author: cosoleto Date: 2008-01-14 17:19:26 +0000 (Mon, 14 Jan 2008)
Log Message: ----------- wikipedia.py: Bots eat up a lot of bandwidth, which is not free. Added compression support in site.postData() (yes! used by the GetAll code; until now only the getUrl() function supported gzip encoding...). wikipedia.py: Disabled compression in getUrl when used with the authentication code; it doesn't work for me. wikipedia.py: The code that prints a blacklisted URL can fail due to a recent MediaWiki release, which replaces the ':' char with '&#58;'. Fixed. config.py: "python config.py" no longer prints module and function references, only variables. Fixed an imprecision in the copyright settings text. fixes.py: Added support for user-fixes.py. Commented out the 'correct-ar' fix set because it isn't fully clear what it's supposed to do, or whether it is safe or correct (included for more ancient Arabic text). Since it's now possible to use user-fixes.py, this part of the code might be removed if no objections or explanations are given. generate_user_files.py: A primitive script that creates the user-config.py and user-fixes.py files. catall.py: code cleanup (please pay more attention when adding copyright statements... :-/)
Modified Paths: -------------- trunk/pywikipedia/catall.py trunk/pywikipedia/config.py trunk/pywikipedia/fixes.py trunk/pywikipedia/wikipedia.py
Added Paths: ----------- trunk/pywikipedia/generate_user_files.py
Modified: trunk/pywikipedia/catall.py =================================================================== --- trunk/pywikipedia/catall.py 2008-01-14 14:49:18 UTC (rev 4886) +++ trunk/pywikipedia/catall.py 2008-01-14 17:19:26 UTC (rev 4887) @@ -10,16 +10,19 @@ """ # # (C) Rob W.W. Hooft, Andre Engels, 2004 -# (C) Filnik, 2008 # # Distributed under the terms of the MIT license. -# -__version__ = '$Id: catall.py,v 1.5 2008/01/12 12:49:25 filnik Exp$' # +__version__ = '$Id$' +#
import wikipedia, sys
-msg = { +# This is a purely interactive robot. We set the delays lower. +wikipedia.get_throttle.setDelay(5) +wikipedia.put_throttle.setDelay(10) + +msg={ 'en':u'Bot: Changing categories', 'he':u'Bot: משנה קטגוריות', 'fr':u'Bot: Change categories', @@ -36,36 +39,35 @@ }
def choosecats(pagetext): - chosen = [] - flag = False - length = 1000 - textToPrint = """Give the new categories, one per line. -Empty line: if the first, don't change. Otherwise: Ready. --: I made a mistake, let me start over. -?: Give the text of the page with GUI. -??: Give the text of the page in console. -xx: if the first, remove all categories and add no new. -q: quit.""" - wikipedia.output(textToPrint) + chosen=[] + flag=False + length=1000 + print ("Give the new categories, one per line.") + print ("Empty line: if the first, don't change. Otherwise: Ready.") + print ("-: I made a mistake, let me start over.") + print ("?: Give the text of the page with GUI.") + print ("??: Give the text of the page in console.") + print ("xx: if the first, remove all categories and add no new.") + print ("q: quit.") while flag == False: - choice = wikipedia.input(u"\nSo, what do you want to do?") - if choice == "": - flag = True - elif choice == "-": - chosen = choosecats(pagetext) - flag = True - elif choice == "?": + choice=wikipedia.input(u"?") + if choice=="": + flag=True + elif choice=="-": + chosen=choosecats(pagetext) + flag=True + elif choice=="?": import editarticle editor = editarticle.TextEditor() newtext = editor.edit(pagetext) - elif choice == "??": + elif choice =="??": wikipedia.output(pagetext[0:length]) length = length+500 - elif choice== "xx" and chosen == []: + elif choice=="xx" and chosen==[]: chosen = None - flag = True - elif choice == "q": - wikipedia.output("quit...") + flag=True + elif choice=="q": + print "quit..." 
sys.exit() else: chosen.append(choice) @@ -74,39 +76,37 @@ def make_categories(page, list, site = None): if site is None: site = wikipedia.getSite() - pllist = [] + pllist=[] for p in list: - cattitle = "%s:%s" % (site.category_namespace(), p) + cattitle="%s:%s" % (site.category_namespace(), p) pllist.append(wikipedia.Page(site,cattitle)) page.put(wikipedia.replaceCategoryLinks(page.get(), pllist), comment = wikipedia.translate(site.lang, msg))
-try: - # This is a purely interactive robot. We set the delays lower. - wikipedia.get_throttle.setDelay(5) - wikipedia.put_throttle.setDelay(10) - docorrections=True - start = [] +docorrections=True +start=[]
- for arg in wikipedia.handleArgs(): - if arg == '-onlynew': - docorrections=False - else: - start.append(arg) - - if start == []: - start = 'A' +for arg in wikipedia.handleArgs(): + if arg == '-onlynew': + docorrections=False else: - start = ' '.join(start) + start.append(arg)
- mysite = wikipedia.getSite() +if start == []: + start='A' +else: + start=' '.join(start) + +mysite = wikipedia.getSite() + +try: for p in mysite.allpages(start = start): try: - text = p.get() - cats = p.categories() + text=p.get() + cats=p.categories() if cats == []: wikipedia.output(u"========== %s ==========" % p.title()) - wikipedia.output("No categories") - wikipedia.output("----------------------------------------") + print "No categories" + print "----------------------------------------" newcats=choosecats(text) if newcats != [] and newcats is not None: make_categories(p, newcats, mysite) @@ -115,14 +115,13 @@ wikipedia.output(u"========== %s ==========" % p.title()) for c in cats: wikipedia.output(c.title()) - wikipedia.output("----------------------------------------" + print "----------------------------------------" newcats=choosecats(text) if newcats == None: make_categories(p, [], mysite) elif newcats != []: make_categories(p, newcats, mysite) except wikipedia.IsRedirectPage: - wikipedia.output(u'%s is a redirect, skip...' % p.title()) - continue + wikipedia.output(u'%s is a redirect' % p.title()) finally: wikipedia.stopme()
Modified: trunk/pywikipedia/config.py =================================================================== --- trunk/pywikipedia/config.py 2008-01-14 14:49:18 UTC (rev 4886) +++ trunk/pywikipedia/config.py 2008-01-14 17:19:26 UTC (rev 4887) @@ -357,13 +357,13 @@ # comma separated list or only numbers. But sometimes that might be the # only part unmodified of a slightly edited and not otherwise reported # copyright violation. You can disable this feature to try to increase -# accuracy. +# number of results.
copyright_economize_query = True
############## HTTP SETTINGS ############## -# Use a persistent http connection. An http connection has to be established -# only once per site object, making stuff a whole lot faster. Do NOT EVER +# Use a persistent http connection. An http connection has to be established +# only once per site object, making stuff a whole lot faster. Do NOT EVER # use this if you share Site objects across threads without proper locking. persistent_http = False
@@ -459,7 +459,7 @@ a '/' to the path if you want it to be a directory path.
from holger@trillke.net 2002/03/18 - + """ from os import makedirs from os.path import normpath, dirname, exists, abspath @@ -474,7 +474,7 @@ Argument(s) are zero or more directory names, optionally followed by a data file name. The return path is offset to config.base_dir. Any directories in the path that do not already exist are created. - + """ import os return makepath(os.path.join(base_dir, *filename)) @@ -490,6 +490,7 @@ # When called as main program, list all configuration variables # if __name__=="__main__": + import types _all=1 for _arg in __sys.argv[1:]: if _arg=="modified": @@ -500,8 +501,9 @@ _k.sort() for _name in _k: if _name[0]!='_': - if _all or _glv[_name]!=globals()[_name]: - print _name,"=",repr(globals()[_name]) + if not type(globals()[_name]) in [types.FunctionType, types.ModuleType]: + if _all or _glv[_name]!=globals()[_name]: + print _name,"=",repr(globals()[_name])
# cleanup all locally-defined variables
Modified: trunk/pywikipedia/fixes.py =================================================================== --- trunk/pywikipedia/fixes.py 2008-01-14 14:49:18 UTC (rev 4886) +++ trunk/pywikipedia/fixes.py 2008-01-14 17:19:26 UTC (rev 4887) @@ -390,9 +390,12 @@ } },
- #Corrections for Arabic Wikipedia - #And any Arabic wiki. - #python replace.py -always -start:! -fix:correct-ar + #Corrections for Arabic Wikipedia and any Arabic wiki. + + # It isn't fully clear what it's supposed to do, if is safe or correct (included for more + # ancient arabic text). As now it's possible use user-fixes.py and if no objections or + # explanations is given, this part of code might be removed. + 'correct-ar': { 'regex': False, 'msg': { @@ -491,3 +494,13 @@ ] }, } + +# +# Load the user fixes file. + +import config + +try: + execfile(config.datafilepath(config.base_dir, "user-fixes.py")) +except IOError: + pass
Added: trunk/pywikipedia/generate_user_files.py =================================================================== --- trunk/pywikipedia/generate_user_files.py (rev 0) +++ trunk/pywikipedia/generate_user_files.py 2008-01-14 17:19:26 UTC (rev 4887) @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- +""" Script to create user files (user-config.py, user-fixes.py) """ +__version__ = '$Id$' + +import os, sys, codecs, re + +base_dir = '' +console_encoding = sys.stdout.encoding + +if console_encoding == None or sys.platform == 'cygwin': + console_encoding = "iso-8859-1" + +def listchoice(clist = [], message = None, default = None): + + if not message: + message = "Select" + + if default: + message += " (default: %s)" % default + + message += ": " + + n = 0 + + for i in clist: + n += 1 + print ("%d: %s" % (n, i)) + + while True: + choice = raw_input(message) + + if choice == '' and default: + return default + + try: + return clist[int(choice) - 1] + except: + print "Invalid response" + return response + +def file_exists(filename): + if os.path.exists(filename): + print "'%s' already exists." % filename + return True + return False + +def create_user_config(): + _fnc = os.path.join(base_dir, "user-config.py") + if not file_exists(_fnc): + know_families = re.findall(r'(.+)_family.py\b', '\n'.join(os.listdir(os.path.join(base_dir, "families")))) + fam = listchoice(know_families, "Select family of sites we are working on", default = 'wikipedia') + mylang = raw_input("The language code of the site we're working on (default: 'en'): ") or 'en' + username = raw_input("Username (%s %s): " % (mylang, fam)) or 'UnnamedBot' + username = unicode(username, console_encoding) + + # + # I don't like this solution. Temporary for me. 
+ f = codecs.open("config.py", "r", "utf-8") ; cpy = f.read() ; f.close() + + res = re.findall("^(############## (?:LOGFILE|" + "INTERWIKI|" + "SOLVE_DISAMBIGUATION|" + "IMAGE RELATED|" + "TABLE CONVERSION BOT|" + "WEBLINK CHECKER|" + "DATABASE|" + "SEARCH ENGINE|" + "COPYRIGHT|" + "FURTHER) SETTINGS .*?)^(?=#####|# =====)", cpy, re.MULTILINE | re.DOTALL) + config_text = u'\n'.join(res) + + f = codecs.open(_fnc, "w", "utf-8") + f.write(u"""# -*- coding: utf-8 -*- + +# This is an automatically generated file. You can find more configuration parameters in 'config.py' file. + +# The family of sites we are working on. wikipedia.py will import +# families/xxx_family.py so if you want to change this variable, +# you need to write such a file. +family = '%s' + +# The language code of the site we're working on. +mylang = '%s' + +# The dictionary usernames should contain a username for each site where you +# have a bot account. +usernames['%s']['%s'] = u'%s' + + +%s""" % (fam, mylang, fam, mylang, username, config_text)) + f.close() + print "'%s' written." % _fnc + +def create_user_fixes(): + _fnf = os.path.join(base_dir, "user-fixes.py") + if not file_exists(_fnf): + f = codecs.open(_fnf, "w", "utf-8") + f.write(ur"""# -*- coding: utf-8 -*- + +# +# This is only an example. Don't use it. +# + +fixes['example'] = { + 'regex': True, + 'msg': { + '_default':u'no summary specified', + }, + 'replacements': [ + (ur'\bword\b', u'two words'), + ] +} + +""") + f.close() + print "'%s' written." % _fnf + + +create_user_config() +create_user_fixes() \ No newline at end of file
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2008-01-14 14:49:18 UTC (rev 4886) +++ trunk/pywikipedia/wikipedia.py 2008-01-14 17:19:26 UTC (rev 4887) @@ -1318,6 +1318,7 @@ relevant = data[data.find('<!-- start content -->')+22:data.find('<!-- end content -->')].strip() # Throw away all the other links etc. relevant = re.sub('<.*?>', '', relevant) + relevant = relevant.replace(':', ':') # MediaWiki only spam-checks HTTP links, and only the # domain name part of the URL. m = re.search('http://%5B%5Cw%5C-%5C.%5D+', relevant) @@ -2600,7 +2601,7 @@
""" # TODO: why isn't this a Site method? - pages = list(pages) # if pages is an iterator, we need to + pages = list(pages) # if pages is an iterator, we need to output(u'Getting %d pages from %s...' % (len(pages), site)) _GetAll(site, pages, throttle, force).run()
@@ -3838,15 +3839,15 @@
def postForm(self, address, predata, sysop=False, useCookie=True): """Post http form data to the given address at this site. - + address is the absolute path without hostname. predata is a dict or any iterable that can be converted to a dict, containing keys and values for the http form. - + Return a (response, data) tuple, where response is the HTTP response object and data is a Unicode string containing the body of the response. - + """ data = self.urlEncode(predata) try: @@ -3857,12 +3858,12 @@
def postData(self, address, data, contentType='application/x-www-form-urlencoded', - sysop=False, useCookie=True): + sysop=False, useCookie=True, compress=True): """Post encoded data to the given http address at this site. - + address is the absolute path without hostname. data is an ASCII string that has been URL-encoded. - + Returns a (response, data) tuple where response is the HTTP response object and data is a Unicode string containing the body of the response. @@ -3888,6 +3889,8 @@ conn.putheader('Cookie', self.cookies(sysop = sysop)) if False: #self.persistent_http: conn.putheader('Connection', 'Keep-Alive') + if compress: + conn.putheader('Accept-encoding', 'gzip') conn.endheaders() conn.send(data)
@@ -3901,8 +3904,15 @@ conn.close() conn.connect() return self.postData(address, data, contentType, sysop, useCookie) - data = response.read().decode(self.encoding()) + + data = response.read() + + if compress and response.getheader('Content-Encoding') == 'gzip': + data = decompress_gzip(data) + + data = data.decode(self.encoding()) response.close() + if True: #not self.persistent_http: conn.close() return response, data @@ -3943,7 +3953,7 @@
text = response.read() headers = dict(response.getheaders()) - + else: if self.hostname() in config.authenticate.keys(): uo = authenticateURLopener @@ -3965,7 +3975,7 @@ while not retrieved: try: if self.hostname() in config.authenticate.keys(): - if compress: + if False: # compress: request = urllib2.Request(url, data) request.add_header('Accept-encoding', 'gzip') opener = urllib2.build_opener() @@ -3994,12 +4004,12 @@ else: raise text = f.read() - + headers = f.info() - + contentType = headers.get('content-type', '') contentEncoding = headers.get('content-encoding', '') - + # Ensure that all sent data is received if int(headers.get('content-length', '0')) != len(text) and 'content-length' in headers: output(u'Warning! len(text) does not match content-length: %s != %s' % \ @@ -4009,16 +4019,7 @@ return self.getUrl(path, retry, sysop, data, compress)
if compress and contentEncoding == 'gzip': - # Use cStringIO if available - # TODO: rewrite gzip.py such that it supports unseekable fileobjects. - try: - from cStringIO import StringIO - except ImportError: - from StringIO import StringIO - import gzip - compressedstream = StringIO(text) - gzipper = gzip.GzipFile(fileobj=compressedstream) - text = gzipper.read() + text = decompress_gzip(text)
R = re.compile('charset=([^'";]+)') m = R.search(contentType) @@ -5489,7 +5490,7 @@ text can contain special sequences to create colored output. These consist of the escape character \03 and the color name in curly braces, e. g. \03{lightpurple}. \03{default} resets the color. - + """ output_lock.acquire() try: @@ -5583,7 +5584,7 @@
-dir:PATH Read the bot's configuration data from directory given by PATH, instead of from the default directory. - + -lang:xx Set the language of the wiki you want to work on, overriding the configuration in user-config.py. xx should be the language code. @@ -5593,7 +5594,7 @@ This will override the configuration in user-config.py.
-daemonize:xyz Immediately returns control to the terminal and redirects - stdout and stderr to xyz (only use for bots that require + stdout and stderr to xyz (only use for bots that require no input from stdin).
-help Shows this help text. @@ -5677,7 +5678,7 @@ """Wait for the page-putter to flush its queue.
Called automatically upon exiting from Python. - + """ if page_put_queue.qsize() > 0: import datetime @@ -5725,6 +5726,20 @@ get_throttle = Throttle(config.minthrottle,config.maxthrottle) put_throttle = Throttle(config.put_throttle,config.put_throttle,False)
+def decompress_gzip(data): + # Use cStringIO if available + # TODO: rewrite gzip.py such that it supports unseekable fileobjects. + if data: + try: + from cStringIO import StringIO + except ImportError: + from StringIO import StringIO + import gzip + try: + data = gzip.GzipFile(fileobj = StringIO(data)).read() + except IOError: + raise + return data
class MyURLopener(urllib.FancyURLopener): version="PythonWikipediaBot/1.0"
pywikipedia-l@lists.wikimedia.org