Revision: 9481
Author: xqt
Date: 2011-08-29 15:55:43 +0000 (Mon, 29 Aug 2011)
Log Message:
cleanup doc string
Modified Paths:
Modified: trunk/pywikipedia/
--- trunk/pywikipedia/ 2011-08-29 15:48:48 UTC (rev 9480)
+++ trunk/pywikipedia/ 2011-08-29 15:55:43 UTC (rev 9481)
@@ -2,7 +2,7 @@
# (C) Rob W.W. Hooft, 2003
# (C) Yuri Astrakhan, 2005
-# (C) Pywikipedia bot team, 2003-2010
+# (C) Pywikipedia bot team, 2003-2011
# Distributed under the terms of the MIT license.
@@ -20,10 +20,6 @@
def translate(page, hints = None, auto = True, removebrackets = False, site = None, family = None):
- Please comment your source code! --Daniel
- Does some magic stuff. Returns a list of pages.
Goes through all entries in 'hints'. Returns a list of pages.
Entries for single page titles list those pages. Page titles for entries
Revision: 9480
Author: xqt
Date: 2011-08-29 15:48:48 +0000 (Mon, 29 Aug 2011)
Log Message:
update copyright date
Modified Paths:
Modified: trunk/pywikipedia/LICENSE
--- trunk/pywikipedia/LICENSE 2011-08-29 15:14:18 UTC (rev 9479)
+++ trunk/pywikipedia/LICENSE 2011-08-29 15:48:48 UTC (rev 9480)
@@ -1,4 +1,4 @@
-Copyright (c) 2005-2010 The PyWikipediaBot team
+Copyright (c) 2005-2011 The PyWikipediaBot team
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Revision: 9478
Author: xqt
Date: 2011-08-29 15:11:50 +0000 (Mon, 29 Aug 2011)
Log Message:
moved to archive
Added Paths:
Copied: archive/trunk/ (from rev 9461, trunk/pywikipedia/archive/
--- archive/trunk/ (rev 0)
+++ archive/trunk/ 2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,112 @@
+# -*- coding: utf-8 -*-
+Put "Picture of the day" in your desktop wallpaper from Wikimedia Commons.
+On Windows systems, you need:
+* Python 2.5
+* Pywin32 for Python 2.5
+* PIL for Python 2.5
+On Linux systems, you need:
+* Python and PIL
+from wikipedia import Site, Page, ImagePage
+from PIL import Image, ImageDraw, ImageFont
+import httplib, time, sys, os
+if sys.platform == 'win32':
+ import ctypes, win32con
+ from _winreg import *
+ import gconf
+def get_commons_image(image):
+ # Request `image` (used as the request path) with an HTTP GET and write `data`
+ # to Picture_of_the_day.bmp on win32 or Picture_of_the_day.png elsewhere.
+ # NOTE(review): the dict literal repeats the "Accept" key, so only the last
+ # entry ("image/svg") is kept and sent.
+ headers = {"Accept": "image/jpg",
+ "Accept": "image/gif",
+ "Accept": "image/png",
+ "Accept": "image/svg",
+ }
+ # NOTE(review): the host name is an empty string in this archived copy.
+ conn = httplib.HTTPConnection('')
+ conn.request("GET", image, None, headers)
+ r = conn.getresponse()
+ # NOTE(review): right-hand side is missing in this archived copy; presumably
+ # the response body -- confirm against the repository.
+ data =
+ if sys.platform == 'win32':
+ # NOTE(review): the bytes are written unchanged; no format conversion is visible here.
+ arq = open("Picture_of_the_day.bmp","wb") # convert image "on the fly" to Windows Bitmap
+ else:
+ arq = open("Picture_of_the_day.png","wb")
+ arq.write(data)
+ arq.close()
+ conn.close()
+def write_gray(filename, text, outfilename):
+ # Draw `text` in gray (120,120,120) onto an image, using the largest font size
+ # whose rendered width plus a third of its height still fits the image width.
+ # NOTE(review): the two lines below are truncated in this archived copy, and
+ # `outfilename` is not used in any visible line (a save call may be missing).
+ img ="RGB")
+ write ="RGB", (img.size[0], img.size[1]))
+ draw = ImageDraw.ImageDraw(img)
+ size = 0
+ # Grow the font size one point at a time until the text no longer fits.
+ while True:
+ size +=1
+ # NOTE(review): the try body is a plain string assignment and cannot raise
+ # IndexError, so none of the fallback handlers below can ever run.
+ try:
+ FONT = "C:\WINDOWS\Fonts\Verdana.ttf"
+ except IndexError:
+ FONT = "/usr/share/fonts/truetype/ttf-bitstream-vera/Verdana.ttf" # ubuntu
+ except IndexError:
+ FONT = "/usr/share/fonts/bitstream-vera/Vera.ttf" # fedora
+ except IndexError:
+ print "Please, report this problem to leogregianin(a)"
+ sys.exit()
+ nextfont = ImageFont.truetype(FONT, size)
+ nexttextwidth, nexttextheight = nextfont.getsize(text)
+ if nexttextwidth+nexttextheight/3 > write.size[0]: break
+ # Remember the last size that still fitted.
+ font = nextfont
+ textwidth, textheight = nexttextwidth, nexttextheight
+ draw.setfont(font)
+ # NOTE(review): the vertical offset is computed from size[0] (the width), not size[1].
+ draw.text(((write.size[0]-textwidth)/55, (write.size[0]-textheight)/55), text, fill=(120,120,120))
+def set_wallpaper():
+ if sys.platform == 'win32':
+ ctypes.windll.user32.SystemParametersInfoA(SPI_SETDESKWALLPAPER, 0, "Picture_of_the_day.bmp", 0)
+ else:
+ gconf.client_get_default().get_string('/desktop/gnome/background/picture_options', 'scaled')
+ gconf.client_get_default().get_string('/desktop/gnome/background/picture_filename', 'Picture_of_the_day.png')
+# Entry point: look up today's "Potd" template on Commons, download the image it
+# names, stamp it with write_gray() and install it as the wallpaper.
+if __name__ == '__main__':
+ commons = Site('commons', 'commons')
+ date_today = time.strftime('%Y-%m-%d', time.localtime())
+ template = 'Template:Potd/%s' % date_today
+ templatePage = Page(commons, template)
+ image_today = templatePage.get()
+ image_name = 'Image:%s'% image_today
+ imageURL = ImagePage(commons, image_name)
+ featuredImage = imageURL.fileUrl()
+ # Drop the first 27 characters of the URL (presumably scheme and host -- TODO confirm).
+ image = featuredImage[27:]
+ if sys.platform == 'win32':
+ if image.endswith('.svg'):
+ sys.exit() # Windows backgrounds don't accept svg files
+ ### Install CommonsPictureOfTheDay in registry
+ Reg = ConnectRegistry(None, HKEY_LOCAL_MACHINE)
+ Key = OpenKey(Reg, r"SOFTWARE\Microsoft\Windows\CurrentVersion\Run", 0, KEY_WRITE)
+ # enter your correct pywikipediabot path
+ # NOTE(review): the script file name at the end of this path is missing in this archived copy.
+ SetValueEx(Key,"CommonsPictureOfTheDay", 0, REG_SZ, r"C:\pywikipediabot\pywikipedia\")
+ CloseKey(Key)
+ CloseKey(Reg)
+ get_commons_image(image)
+ write_gray('Picture_of_the_day.bmp',
+ '',
+ 'Picture_of_the_day.bmp')
+ set_wallpaper()
+ else:
+ get_commons_image(image)
+ write_gray('Picture_of_the_day.png',
+ '',
+ 'Picture_of_the_day.png')
+ set_wallpaper()
Copied: archive/trunk/ (from rev 9461, trunk/pywikipedia/archive/
--- archive/trunk/ (rev 0)
+++ archive/trunk/ 2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,74 @@
+# -*- coding: iso-8859-1 -*-
+(C) 2003 Thomas R. Koll, <tomk32(a)>
+ Distributed under the terms of the MIT license.
+__version__='$Id:,v 1.3 2005/12/21 17:51:26 wikipedian Exp $'
+DEBUG = 0
+import re
+from xml.sax.handler import ContentHandler
+class WdTXMLParser(ContentHandler):
+ def __init__(self):
+ self.rTitle = re.compile ('(.*): (.*)')
+ self.rLink = re.compile ('.*[\r\n]*(http://.*)')
+ self.rCount = re.compile ('.*: (\d*)')
+ self.inItem = 0
+ self.inITitle = 0
+ self.inILink = 0
+ self.inIDescription = 0
+ self.tmp = {}
+ self.result = {}
+ def startDocument(self):
+ self.result = {}
+ self.tmp = {}
+ def endDocument(self):
+ return self.result
+ def startElement(self, name, attrs):
+ if name == 'item':
+ self.inItem = 1
+ if self.inItem == 1:
+ if name == 'title':
+ self.inTitle = 1
+ if name == 'link':
+ self.inLink = 1
+ if name == 'description':
+ self.inDescription = 1
+ def characters(self, characters):
+ if self.inItem:
+ if self.inTitle:
+ self.tmp['title'] = self.rTitle.match(characters).group(2)
+ if self.inLink:
+ self.tmp['link'] = self.rLink.match(characters).group(1)
+ if self.inDescription:
+ self.tmp['count'] = self.rCount.match(characters).group(1)
+ def endElement(self, name):
+ if name == 'item':
+ self.inItem = 0
+ self.result[self.tmp['title']] = {
+ 'link' : self.tmp['link'],
+ 'count' : self.tmp['count']
+ }
+ self.tmp = {}
+ if name == 'title':
+ self.inTitle = 0
+ if name == 'link':
+ self.inLink = 0
+ if name == 'description':
+ self.inDescription = 0
+if and and self.count:
+ self.results[self.title] = {
+ 'date' :,
+ 'link' :,
+ 'count' : self.count
+ }
Copied: archive/trunk/ (from rev 9461, trunk/pywikipedia/archive/
--- archive/trunk/ (rev 0)
+++ archive/trunk/ 2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,81 @@
+Simple bot to check whether two pages with the same name on different language
+'pedias have interwiki links to the same page on another language.
+Call the script with 3 arguments:
+ python lang1 lang2 name
+The script will either print "Yes" and return exit code 0,
+ or print "No" and return exit code 1,
+ or print "Both links are already present"
+ and return exit code 2,
+ or print "One link already present"
+ and return exit code 0.
+It may raise exceptions on pages that disappeared or whatever. This is
+a simple framework at least for the moment.
+# (C) Rob Hooft, 2005
+# Distributed under the terms of the MIT license.
+__version__='$Id:,v 1.3 2005/12/21 17:51:26 wikipedian Exp $'
+from __future__ import generators
+import sys, wikipedia
+class TwoPageGenerator:
+ # Iterable yielding two wikipedia.Page objects, one on the site for lang1 and
+ # one on the site for lang2.
+ # NOTE(review): the assignment target below and the second argument of both
+ # Page(...) calls are missing in this archived copy.
+ def __init__(self, lang1, lang2, name):
+ self.lang1 = lang1
+ self.lang2 = lang2
+ = name
+ def __iter__(self):
+ yield wikipedia.Page(wikipedia.getSite(self.lang1),
+ yield wikipedia.Page(wikipedia.getSite(self.lang2),
+class IdenticalRobot:
+ # Compares the interwiki links of the first two pages produced by `generator`
+ # and terminates the process with an exit code describing the outcome.
+ def __init__(self, generator):
+ self.generator = generator
+ def run(self):
+ # Materialise the generator; only the first two pages are used.
+ arr = []
+ for x in self.generator:
+ arr.append(x)
+ pg1 = arr[0]
+ pg2 = arr[1]
+ iw1 = pg1.interwiki()
+ iw2 = pg2.interwiki()
+ # Each page already links to the other: exit code 2.
+ if pg2 in iw1 and pg1 in iw2:
+ print "Both links are already present"
+ sys.exit(2)
+ # Only one direction is linked: exit code 0.
+ if pg2 in iw1 or pg1 in iw2:
+ print "One link already present"
+ sys.exit(0)
+ # Any interwiki target shared by both pages counts as "Yes".
+ for iw in iw1:
+ if iw in iw2:
+ print "Yes"
+ sys.exit(0)
+ print "No"
+ sys.exit(1)
+def main():
+ # Collect the command-line arguments that wikipedia.argHandler() passes through
+ # and use them as (lang1, lang2, name) for TwoPageGenerator.
+ args = []
+ for arg in sys.argv[1:]:
+ arg = wikipedia.argHandler(arg, 'are-identical')
+ if arg:
+ args.append(arg)
+ g = TwoPageGenerator(*args)
+ r = IdenticalRobot(g)
+ # NOTE(review): no call to r.run() is visible, and the lines around the
+ # main()/stopme() calls below appear to be missing in this archived copy.
+ main()
+ wikipedia.stopme()
Copied: archive/trunk/ (from rev 9461, trunk/pywikipedia/archive/
--- archive/trunk/ (rev 0)
+++ archive/trunk/ 2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+Script to correct URLs like
+( to []
+to have correct generation of links in Wikipedia
+__author__ = '(C) 2003 Thomas R. Koll, <tomk32(a)>'
+__license__ = 'Distributed under the terms of the MIT license.'
+__version__='$Id:,v 1.13 2005/12/21 17:51:26 wikipedian Exp $'
+import re, sys
+import wikipedia
+myComment = {'ar':u'بوت: URL تم إصلاحها',
+ 'en':u'Bot: URL fixed',
+ 'fa':u'ربات: URL اصلاح شد',
+ 'he':u'בוט: תוקנה כתובת URL',
+ 'pt':u'Bot: URL corrigido',
+ 'zh':u'機器人: 網址已修復',
+ }
+# Entry point: every argument not consumed by wikipedia.argHandler() is treated
+# as a page title whose text is rewritten with the regex below.
+if __name__ == "__main__":
+ try:
+ for arg in sys.argv[1:]:
+ if wikipedia.argHandler(arg, 'brackethttp'):
+ pass
+ else:
+ pl = wikipedia.Page(wikipedia.getSite(), arg)
+ text = pl.get()
+ # Rewrite "http://...)" as "[http://... ...])": group 1 is the full URL,
+ # group 2 the part after "http://", which becomes the link label.
+ newText = re.sub("(http:\/\/([^ ]*[^\] ]))\)", "[\\1 \\2])", text)
+ if newText != text:
+ wikipedia.showDiff(text, newText)
+ status, reason, data = pl.put(newText, wikipedia.translate(wikipedia.mylang,myComment))
+ print status, reason
+ else:
+ print "No bad link found"
+ except:
+ # Call stopme() before the exception propagates.
+ wikipedia.stopme()
+ raise
Copied: archive/trunk/ (from rev 9461, trunk/pywikipedia/archive/
--- archive/trunk/ (rev 0)
+++ archive/trunk/ 2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,223 @@
+# -*- coding: utf-8 -*-
+This bot is used for checking external links from Wikipedia. It checks
+all external links in groups of 480 pages, gives the error code for each
+that causes problems, and counts the number of links with and without problems.
+It accepts all general Wikipediabot arguments as well as:
+-start:xxx Check starting at 'xxx'.
+-nolog Do not log to a file, only give output to a screen.
+Anything else is assumed to be a page that is to be checked. Spaces in
+page titles have to be replaced by underscores, otherwise the bot assumes
+the parts are separate pages. If no page has been specified and also no
+-start argument has been provided, the bot acts as if -start:! had been
+specified, starting at the beginning.
+The bot returns all links that have some problem, with the errorcode
+provided by the server, or the artificial errorcode -1 if the server
+could not be reached at all. Output is sent both to the screen and the
+file check_extern.log
+# (C) Andre Engels, 2004
+# Distributed under the terms of the MIT license.
+__version__='$Id:,v 1.16 2005/12/21 17:51:26 wikipedian Exp $'
+import wikipedia, urllib, re, sys, httplib
+class URLerrorFinder(urllib.FancyURLopener):
+ # URL opener whose open_http() returns the numeric status of a GET request
+ # (or -2 when no host could be determined) instead of a file-like object.
+ version="RobHooftWikiRobot/1.0"
+ def open_http(self, url):
+ """Use HTTP protocol."""
+ if isinstance(url, str):
+ host, selector = urllib.splithost(url)
+ if host:
+ user_passwd, host = urllib.splituser(host)
+ host = urllib.unquote(host)
+ realhost = host
+ else:
+ # `url` is a (host, selector) pair here.
+ host, selector = url
+ urltype, rest = urllib.splittype(selector)
+ url = rest
+ user_passwd = None
+ if urltype.lower() != 'http':
+ realhost = None
+ else:
+ # NOTE(review): splithost, splituser and proxy_bypass are used without the
+ # `urllib.` prefix here; only the module `urllib` is imported by this
+ # script, so these bare names look unbound.
+ realhost, rest = splithost(rest)
+ if realhost:
+ user_passwd, realhost = splituser(realhost)
+ if user_passwd:
+ selector = "%s://%s%s" % (urltype, realhost, rest)
+ if proxy_bypass(realhost):
+ host = realhost
+ if not host: return -2
+ h = httplib.HTTP(host)
+ h.putrequest('GET', selector)
+ if realhost: h.putheader('Host', realhost)
+ for args in self.addheaders: h.putheader(*args)
+ h.endheaders()
+ # Only the status code is returned; message and headers are discarded.
+ errcode, errmsg, headers = h.getreply()
+ return errcode
+# Which error codes do we not consider errors?
+allowederrorcodes = [100,101,200,201,202,203,205,304]
+errname = {
+ -1:'No contact to server',
+ -2:'No host found',
+ 100:'Continue',
+ 101:'Switching Protocols',
+ 200:'OK',
+ 201:'Created',
+ 202:'Accepted',
+ 203:'Non-Authorative Information',
+ 204:'No Content',
+ 205:'Reset Content',
+ 206:'Partial Content',
+ 300:'Multiple Choices',
+ 301:'Moved Permanently',
+ 302:'Moved Temporarily',
+ 303:'See Other',
+ 304:'Not Modified',
+ 305:'Use Proxy',
+ 307:'Temporary Redirect',
+ 400:'Bad Request',
+ 401:'Unauthorized',
+ 402:'Payment Required',
+ 403:'Forbidden',
+ 404:'Not Found',
+ 405:'Method Not Allowed',
+ 406:'None Acceptable',
+ 407:'Proxy Authentication Required',
+ 408:'Request Timeout',
+ 409:'Conflict',
+ 410:'Gone',
+ 411:'Authorization Refused',
+ 412:'Precondition Failed',
+ 413:'Request Entity Too Large',
+ 414:'Request-URI Too Large',
+ 415:'Unsupported Media Type',
+ 416:'Requested Range not satisfiable',
+ 417:'Expectation Failed',
+ 500:'Internal Server Error',
+ 501:'Not Implemented',
+ 502:'Bad Gateway',
+ 503:'Service Unavailable',
+ 504:'Gateway Timeout',
+ 505:'HTTP Version not supported',
+ 8181:'Certificate Expired',
+ 12002:'Timeout',
+ 12007:'No such host',
+ 12029:'No connection',
+ 12031:'Connection Reset'
+ }
+def errorname(error):
+ # Given a numerical HTTP status (or one of the artificial negative codes),
+ # return its description from `errname`; codes not in the table map to a
+ # generic redirection text for 301-399 and to 'Unknown Error' otherwise.
+ if error in errname:
+ return errname[error]
+ elif (error > 300) and (error < 400):
+ return 'Unknown Redirection Response'
+ else:
+ return 'Unknown Error'
+# Defaults, overridden by the command-line arguments parsed below.
+start = '!'
+log = True
+todo = []
+do_all = False
+for arg in sys.argv[1:]:
+ # NOTE(review): `url` is re-assigned from sys.argv[1] on every iteration and is
+ # rebound again inside the checking loop before it is read.
+ url=sys.argv[1]
+ arg = wikipedia.argHandler(arg, 'check_extern')
+ if arg:
+ if arg.startswith('-start:'):
+ start=arg[7:]
+ do_all=True
+ elif arg=='-nolog':
+ log = False
+ else:
+ # Anything else is a page title to check.
+ mysite = wikipedia.getSite()
+ todo.append(wikipedia.Page(mysite,arg))
+# Make sure we have the final site
+mysite = wikipedia.getSite()
+if todo == []:
+ # No pages have been given; if also no start is given, we start at
+ # the beginning
+ do_all = True
+if log:
+ # Mirror everything written to stdout into check_extern.log.
+ import logger
+ sys.stdout = logger.Logger(sys.stdout, filename = 'check_extern.log')
+# Counters used by the main loop.
+cont = True
+checked = 0
+working = 0
+nonworking = 0
+totalchecked = 0
+ # Main loop: refill `todo` from allpages() when it runs low, check the external
+ # links of up to 60 pages per pass, and print a summary roughly every 500 pages.
+ # NOTE(review): the final stopme()/raise pair suggests an enclosing try/except
+ # whose header lines are missing in this archived copy.
+ while cont:
+ print
+ i = 0
+ if len(todo)<61 and do_all:
+ # Fetch at most 480 further pages starting at `start`.
+ for pl in wikipedia.allpages(start = start):
+ todo.append(pl)
+ i += 1
+ if i==480:
+ break
+ # NOTE(review): raises IndexError if `todo` is still empty at this point.
+ start = todo[len(todo)-1].title() + '_0'
+ # todo is a list of pages to do, donow are the pages we will be doing in this run.
+ if len(todo)>60:
+ # Take the first 60.
+ donow = todo[0:60]
+ todo = todo[60:]
+ else:
+ donow = todo
+ # If there was more to do, the 'if len(todo)<61' part would have extended
+ # todo beyond this size.
+ cont = False
+ try:
+ wikipedia.getall(mysite, donow)
+ except wikipedia.SaxError:
+ # Ignore this error, and get the pages the traditional way.
+ pass
+ checked +=len(donow)
+ for pl in donow:
+ # NOTE(review): the pattern is the same for every page and could be compiled once.
+ R = re.compile(r'http://[^\s}<\]]+[^\s.,:;)\?!\]}<]')
+ try:
+ for url in R.findall(pl.get()):
+ url = wikipedia.unicode2html(url,'ascii')
+ try:
+ error = URLerrorFinder().open(url)
+ except IOError:
+ # Artificial code for "server could not be reached".
+ error = -1
+ if error in allowederrorcodes:
+ working += 1
+ else:
+ nonworking += 1
+ print
+ wikipedia.output(u'Page "%s" links to:'%pl.title())
+ wikipedia.output(url)
+ wikipedia.output(u'Which gave error: %s %s'%(error,errorname(error)))
+ # If anything is wrong with the Wikipedia page, just ignore
+ except (wikipedia.NoPage,wikipedia.IsRedirectPage,wikipedia.LockedPage):
+ pass
+ if checked>499 or not cont:
+ # Move 500 from `checked` to `totalchecked`; their sum stays the real count.
+ totalchecked += 500
+ checked -= 500
+ print
+ print '======================================================================'
+ wikipedia.output(u'%s pages checked, last was [[%s]]'%(totalchecked+checked,donow[len(donow)-1]))
+ print 'In those pages there were %s correct and %s problematic external links.'%(working,nonworking)
+ wikipedia.stopme()
+ raise
Copied: archive/trunk/ (from rev 9461, trunk/pywikipedia/archive/
--- archive/trunk/ (rev 0)
+++ archive/trunk/ 2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,230 @@
+# -*- coding: utf-8 -*-
+Script to copy a table from one Wikipedia to another one, translating it
+ -type:abcd -from:xy Article_Name
+Command line options:
+-from:xy Copy the table from the Wikipedia article in language xy
+ Article must have interwiki link to xy
+-debug Show debug info, and don't send the results to the server
+-type:abcd Translates the table, using translations given below.
+ When the -type argument is not used, the bot will simply
+ copy the table as-is.
+-file:XYZ Reads article names from a file. XYZ is the name of the
+ file from which the list is taken. If XYZ is not given, the
+ user is asked for a filename.
+ Page titles should be saved one per line, without [[brackets]].
+ The -pos parameter won't work if -file is used.
+-image Copy all images within the found table to the target Wikipedia.
+ Make sure the bot is logged in before trying to upload images.
+Article_Name: Name of the article where a table should be inserted
+# (C) Daniel Herding, 2004
+# Distributed under the terms of the MIT license.
+__version__='$Id:,v 1.31 2005/12/21 17:51:26 wikipedian Exp $'
+import wikipedia, translator, lib_images
+import re, sys, string
+# Summary message
+ "ar":u"روبوت: نسخ الجدول من ",
+ "en":u"robot: copying table from ",
+ "de":u"Bot: Kopiere Tabelle von ",
+ "he":u"רובוט: מעתיק טבלה מתוך ",
+ "pt":u"Bot: Copiando tabela de ",
+ }
+# Prints text on the screen only if in -debug mode.
+# Argument text should be raw unicode.
+def print_debug(text):
+ # Output `text` only when the module-level `debug` flag is set.
+ if debug:
+ wikipedia.output(text)
+# this is a modified version of wikipedia.imagelinks(), it only looks in text, not in the whole page.
+def imagelinks(site, text):
+ # Return the targets of all [[Image:...]] links found in `text`, using the
+ # image namespace name of `site` with a case-insensitive first letter.
+ image_ns = site.image_namespace()
+ # regular expression which matches e.g. "Image" as well as "image" (for en:)
+ im = '[' + image_ns[0].upper() + image_ns[0].lower() + ']' + image_ns[1:]
+ # w1: namespace and file name up to ']' or '|'; w2: optional text after '|'
+ w1=r'('+im+':[^\]\|]*)'
+ w2=r'([^\]]*)'
+ Rlink = re.compile(r'\[\['+w1+r'(\|'+w2+r')?\]\]')
+ result = []
+ for l in Rlink.findall(text):
+ # l[0] is the first group: namespace plus file name
+ result.append(l[0])
+ return result
+# opens on a page, checks for an interwiki link, transfers and translates the first
+# table, copies all images in that table.
+def treat(to_pl, fromsite):
+ # Copy the first table of the page that `to_pl` links to on `fromsite` to the
+ # top of `to_pl`, optionally transferring images and translating the table.
+ # Relies on the module-level names mysite, msg, debug, copy_images and type.
+ try:
+ to_text = to_pl.get()
+ interwikis = to_pl.interwiki()
+ except wikipedia.IsRedirectPage:
+ print "Can't work on redirect page."
+ return
+ except wikipedia.NoPage:
+ print "Page not found."
+ return
+ from_pl = None
+ for interwiki in interwikis:
+ # NOTE(review): the left-hand side of this comparison is missing in this archived copy.
+ if == fromsite:
+ from_pl = interwiki
+ if from_pl is None:
+ print "Interwiki link to %s not found." % repr(fromsite)
+ return
+ from_text = from_pl.get()
+ wikipedia.setAction(wikipedia.translate(mysite.lang, msg) + from_pl.aslink())
+ # search start of table
+ table = get_table(from_text)
+ if not table:
+ wikipedia.output(u"No table found in %s" % (from_pl.aslink()))
+ return
+ # NOTE(review): this message is printed even when copy_images is False.
+ print_debug(u"Copying images")
+ if copy_images:
+ # extract image links from original table
+ images=imagelinks(fromsite, table)
+ for image in images:
+ # Copy the image to the current wikipedia, copy the image description page as well.
+ # Prompt the user so that he can translate the filename.
+ new_filename = lib_images.transfer_image(wikipedia.Page(fromsite, image), debug)
+ # if the upload succeeded
+ if new_filename:
+ old_image_tag = wikipedia.Page(fromsite, image).title()
+ new_image_tag = wikipedia.Page(mysite, mysite.image_namespace() + ":" + new_filename).title()
+ print_debug(u"Replacing " + old_image_tag + " with " + new_image_tag)
+ # We want to replace "Image:My pic.jpg" as well as "image:my_pic.jpg", so we need a regular expression.
+ # NOTE(review): the title is not regex-escaped, so characters such as '.' or
+ # '(' in a file name are interpreted as regex syntax.
+ old_image_tag = old_image_tag.replace(" ", "[ \_]")
+ old_image_tag = "[" + old_image_tag[0].upper() + old_image_tag[0].lower() + "]" + old_image_tag[1:]
+ #todo: regex for first letter of filename, i.e. first letter after the colon
+ # NOTE(review): the compiled pattern is unused; re.sub() below gets the string.
+ rOld_image_tag = re.compile(old_image_tag)
+ table = re.sub(old_image_tag, new_image_tag, table)
+ translated_table = translator.translate(table, type, fromsite.lang, debug, mysite.lang)
+ if not translated_table:
+ print "Could not translate table."
+ return
+ print_debug(u"\n" + translated_table)
+ # add table to top of the article, separated by a blank line
+ to_text = translated_table + "\n\n" + to_text
+ if not debug:
+ # save changes on Wikipedia
+ to_pl.put(to_text, minorEdit='0')
+# Regular expression that will match both <table and {|
+startR = re.compile(r"<table|\{\|")
+# Regular expression that will match both </table> and |}
+endR = re.compile(r"</table>|\|\}")
+# Finds the first table inside a text, including cascaded inner tables.
+def get_table(text):
+ # Return the substring of `text` from the first table start tag to the end tag
+ # that balances it (nested tables included), or None when no start tag exists.
+ # NOTE(review): the three search calls are truncated in this archived copy.
+ pos = 0
+ # find first start tag
+ first_start_tag =, text)
+ if not first_start_tag:
+ return
+ else:
+ print_debug(u"First start tag found at " + str(first_start_tag.start()))
+ pos = first_start_tag.end()
+ # number of start tags minus number of end tags
+ table_level = 1
+ remaining_text = text
+ # until an end tag has been found for each start tag:
+ while table_level != 0:
+ # continue search after the last found tag
+ remaining_text = text[pos:]
+ next_start_tag =, remaining_text, pos)
+ next_end_tag =, remaining_text, pos)
+ if not next_end_tag:
+ # NOTE(review): neither pos nor table_level changes in this branch, so a
+ # missing end tag keeps the loop running forever.
+ print_debug(u"Error: missing end tag")
+ pass
+ # if another cascaded table is opened before the current one is closed
+ elif next_start_tag and next_start_tag.start() < next_end_tag.start():
+ print_debug(u"Next start tag found at " + str(pos + next_start_tag.start()))
+ pos += next_start_tag.end()
+ table_level += 1
+ print_debug(u"Table level is " + str(table_level))
+ else:
+ print_debug(u"Next end tag found at " + str(pos + next_end_tag.start()))
+ pos += next_end_tag.end()
+ table_level -= 1
+ print_debug(u"Table level is " + str(table_level))
+ print_debug(u"Table starts at " + str(first_start_tag.start()) + " and ends at " + str(pos) +"\n")
+ print_debug(text[first_start_tag.start():pos])
+ return text[first_start_tag.start():pos]
+# Entry point: parse the command line, build the list of target pages and run
+# treat() on each of them.
+if __name__=="__main__":
+ try:
+ # if the -file argument is used, page titles are dumped in this array.
+ # otherwise it will only contain one page.
+ page_list = []
+ # if -file is not used, this temporary array is used to read the page title.
+ page_title = []
+ from_lang = ""
+ type = ""
+ debug = False
+ copy_images = False
+ # read command line parameters
+ for arg in sys.argv[1:]:
+ arg = wikipedia.argHandler(arg, 'copy_table')
+ if arg:
+ if arg.startswith("-from"):
+ # text after "-from:"
+ from_lang = arg[6:]
+ elif arg.startswith("-type:"):
+ type = arg[6:]
+ elif arg == "-debug":
+ debug = True
+ elif arg == "-image":
+ copy_images = True
+ elif arg.startswith('-file'):
+ # bare "-file" (5 characters): ask for the name interactively
+ if len(arg) == 5:
+ file = wikipedia.input(u'Please enter the list\'s filename: ')
+ else:
+ file = arg[6:]
+ # open file and read page titles out of it
+ f=open(file)
+ for line in f.readlines():
+ # NOTE(review): lines are appended with their trailing newline.
+ if line != '\n':
+ page_list.append(line)
+ f.close()
+ else:
+ page_title.append(arg)
+ # if the page name is given as a command line argument,
+ # connect the title's parts with spaces
+ if page_title != []:
+ page_title = ' '.join(page_title)
+ page_list.append(page_title)
+ mysite = wikipedia.getSite()
+ fromsite = mysite.getSite(code=from_lang)
+ for current_page_name in page_list:
+ thispl = wikipedia.Page(mysite, current_page_name)
+ treat(thispl, fromsite)
+ except:
+ wikipedia.stopme()
+ raise
+ # NOTE(review): presumably the `else:` line before this call is missing in this archived copy.
+ wikipedia.stopme()
Copied: archive/trunk/ (from rev 9461, trunk/pywikipedia/archive/
--- archive/trunk/ (rev 0)
+++ archive/trunk/ 2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,23 @@
+Script to extract all wiki page names a certain HTML file points to
+The output can be used as input to some robot that takes a list of pages as input.
+This script takes a single file name argument, the file should be a HTML file
+as captured from one of the wikipedia servers.
+# (C) Rob W.W. Hooft, 2003
+# Distributed under the terms of the MIT license.
+__version__='$Id:,v 1.9 2005/12/21 17:51:26 wikipedian Exp $'
+import sys,re
+# Print the text between every '/wiki/' and the next double quote in `text`.
+# NOTE(review): no visible line assigns `text`; the statement reading the input
+# file is presumably missing in this archived copy.
+R=re.compile('/wiki/(.*?)" *')
+for hit in R.findall(text):
+ print hit
Copied: archive/trunk/ (from rev 9461, trunk/pywikipedia/archive/
--- archive/trunk/ (rev 0)
+++ archive/trunk/ 2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+All functions of this script have been merged elsewhere; please use:
+ -fromall -count
+shizhao 2009-04-18
+This script only counts how many featured articles all wikipedias have.
+__version__ = '$Id: 6336 2009-02-08 04:14:37Z purodha $'
+# Distributed under the terms of the MIT license.
+import sys
+import wikipedia, catlib
+from featured import featured_name
+def featuredArticles(site):
+ # Count the featured articles of `site` and output the total.
+ # featured_name[site.lang] holds (method, name, extra args...); method is
+ # called as method(site, name, *args) and its result is iterated as pages.
+ # NOTE(review): the function has no return statement although callers assign
+ # its result.
+ method=featured_name[site.lang][0]
+ name=featured_name[site.lang][1]
+ args=featured_name[site.lang][2:]
+ raw=method(site, name, *args)
+ arts=[]
+ for p in raw:
+ if p.namespace()==0:
+ arts.append(p)
+ elif p.namespace()==1:
+ # NOTE(review): the first argument of Page(...) is missing in this archived copy.
+ arts.append(wikipedia.Page(, p.titleWithoutNamespace()))
+ wikipedia.output('\03{lightred}** wikipedia:%s has %i featured articles\03{default}' % (site.lang, len(arts)))
+# Entry point: report the featured-article count for every language in
+# featured_name, in sorted order, handling the home site last.
+if __name__=="__main__":
+ mysite = wikipedia.getSite()
+ fromlang = featured_name.keys()
+ fromlang.sort()
+ try:
+ for ll in fromlang:
+ fromsite = wikipedia.getSite(ll)
+ if fromsite != mysite:
+ arts = featuredArticles(fromsite)
+ arts_mysite = featuredArticles(mysite)
+ finally:
+ wikipedia.stopme()
Copied: archive/trunk/ (from rev 9461, trunk/pywikipedia/archive/
--- archive/trunk/ (rev 0)
+++ archive/trunk/ 2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,67 @@
+Script to transfer many images from one wiki to another. Your
+language (which can be changed with the -lang: argument) is the
+language to upload to. The images should be in a file as interwiki
+links (that is in the form [[en:Image:myimage.png]]); they do not
+need to be all from the same Wiki. This file can be created with
+ -lang:xx Log in to the given wikipedia language to upload to
+The first other argument is taken to be the name of the file you get
+the links from; other arguments are ignored.
+# (C) Andre Engels 2004
+# Distributed under the terms of the MIT license.
+# Modified by Gerrit Holl, 01-11-2004
+__version__='$Id:,v 1.15 2005/12/21 17:51:26 wikipedian Exp $'
+import sys
+import wikipedia, lib_images, pagegenerators
+def getfn():
+ # Return the list of file names given on the command line (arguments that
+ # wikipedia.argHandler() passes through); prompt for one if none was given.
+ fns = []
+ for arg in sys.argv[1:]:
+ arg = wikipedia.argHandler(arg, 'getimages')
+ if arg:
+ fns.append(arg)
+ if len(fns) == 0:
+ fns.append(raw_input("Please enter a filename: "))
+ return fns
+def main():
+ # For every image page listed in the given files, show its description and
+ # transfer it with lib_images.transfer_image() if the user answers "y".
+ for filename in getfn():
+ print "Handling images from %s" % filename
+ gen = pagegenerators.TextfilePageGenerator(filename)
+ for image in gen:
+ # Pages that are not images are skipped.
+ if image.isImage():
+ print "-" * 50
+ print "Image: %s" % image.title()
+ try:
+ # show the image description page's contents
+ print image.get()
+ except wikipedia.NoPage:
+ print "Description empty."
+ except wikipedia.IsRedirectPage:
+ print "Description page is redirect?!"
+ answer=wikipedia.input(u"Copy this image (y/N)?")
+ if answer.lower().startswith('y'):
+ lib_images.transfer_image(image)
+# Entry point: run main() and call wikipedia.stopme() whether or not it raised.
+if __name__ == "__main__":
+ try:
+ main()
+ except:
+ wikipedia.stopme()
+ raise
+ else:
+ wikipedia.stopme()
Copied: archive/trunk/ (from rev 9461, trunk/pywikipedia/archive/
--- archive/trunk/ (rev 0)
+++ archive/trunk/ 2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,218 @@
+# -*- coding: utf-8 -*-
+Allows access to the MediaWiki messages, that's the label texts of the MediaWiki
+software in the current language. These can be used in other bots.
+The function refresh_messages() downloads all the current messages and saves
+them to disk. It is run automatically when a bot first tries to access one of
+the messages. It can be updated manually by running this script, e.g. when
+somebody changed the current message at the wiki. The texts will also be
+reloaded automatically once a month.
+Syntax: python mediawiki_messages [-all]
+Command line options:
+ -refresh - Reloads messages for the home wiki or for the one defined via
+ the -lang and -family parameters.
+ -all - Reloads messages for all wikis where messages are already present
+ If another parameter is given, it will be interpreted as a MediaWiki key.
+ The script will then output the respective value, without refreshing..
+# (C) Daniel Herding, 2004
+# Distributed under the terms of the MIT license.
+##WIKIPEDIA.PY. It is being retained solely for compatibility in case any
+##custom-written bots rely upon it. Bot authors should replace any uses
+##of this module as follows:
+## OLD: mediawiki_messages.get(key, site)
+## NEW: site.mediawiki_message(key)
+## OLD: mediawiki_messages.has(key, site)
+## NEW: site.has_mediawiki_message(key)
+## OLD: mediawiki_messages.makepath(path)
+## NEW: wikipedia.makepath(path)
+import warnings
+"""The mediawiki_messages module is deprecated and no longer
+maintained; see the source code for new methods to replace
+calls to this module.""",
+ DeprecationWarning, stacklevel=2)
+import wikipedia
+import re, sys, pickle
+import os.path
+import time
+import codecs
+import urllib
+from BeautifulSoup import *
+__version__='$Id: 3731 2007-06-20 14:42:55Z russblau $'
+loaded = {}
+def get(key, site = None, allowreload = True):
+ # Return the MediaWiki message `key` for `site` (default: wikipedia.getSite()).
+ # Dictionaries are cached per site in `loaded`; otherwise they are read from a
+ # pickle file that is refreshed first when it is missing or older than 30 days.
+ # Raises KeyError when the key is still unknown after one refresh.
+ site = site or wikipedia.getSite()
+ if site in loaded:
+ # Use cached copy if it exists.
+ dictionary = loaded[site]
+ else:
+ # NOTE(review): the first element of the tuple is missing in this archived copy.
+ fn = 'mediawiki-messages/mediawiki-messages-%s-%s.dat' % (, site.lang)
+ try:
+ # find out how old our saved dump is (in seconds)
+ file_age = time.time() - os.path.getmtime(fn)
+ # if it's older than 1 month, reload it
+ if file_age > 30 * 24 * 60 * 60:
+ print 'Current MediaWiki message dump is one month old, reloading'
+ refresh_messages(site)
+ except OSError:
+ # no saved dump exists yet
+ refresh_messages(site)
+ f = open(fn, 'r')
+ dictionary = pickle.load(f)
+ f.close()
+ loaded[site] = dictionary
+ # Lower-case the first character of the key.
+ # NOTE(review): an empty key raises IndexError here.
+ key = key[0].lower() + key[1:]
+ if key in dictionary:
+ return dictionary[key]
+ elif allowreload:
+ # NOTE(review): refresh_messages() rewrites the file but `loaded[site]` still
+ # holds the old dictionary, so the retry below reads the stale cached copy.
+ refresh_messages(site = site)
+ return get(key, site = site, allowreload = False)
+ else:
+ raise KeyError('MediaWiki Key %s not found' % key)
+def has(key, site = None, allowreload = True):
+ # Return True if get() finds `key`, False if it raises KeyError.
+ try:
+ get(key, site, allowreload)
+ return True
+ except KeyError:
+ return False
+def makepath(path):
+ """ Create missing directories for the given path and
+ return a normalized absolute version of the path.
+ - if the given path already exists in the filesystem
+ the filesystem is not modified.
+ - otherwise makepath creates directories along the given path
+ using the dirname() of the path. You may append
+ a '/' to the path if you want it to be a directory path.
+ from holger(a) 2002/03/18
+ """
+ from os import makedirs
+ from os.path import normpath,dirname,exists,abspath
+ # Only the directory part is created; the last path component is left alone.
+ dpath = normpath(dirname(path))
+ if not exists(dpath): makedirs(dpath)
+ return normpath(abspath(path))
+def refresh_messages(site = None):
+ # Download the "all messages" special page of `site`, extract the key/value
+ # pairs from its HTML and pickle them to the per-site .dat file. Exits the
+ # process if no pair was found.
+ site = site or wikipedia.getSite()
+ # get 'all messages' special page's path
+ path = site.allmessages_address()
+ print 'Retrieving MediaWiki messages for %s' % repr(site)
+ wikipedia.put_throttle() # It actually is a get, but a heavy one.
+ allmessages = site.getUrl(path)
+ print 'Parsing MediaWiki messages'
+ soup = BeautifulSoup(allmessages,
+ convertEntities=BeautifulSoup.HTML_ENTITIES)
+ # The MediaWiki namespace in URL-encoded format, as it can contain
+ # non-ASCII characters and spaces.
+ quotedMwNs = urllib.quote(site.namespace(8).replace(' ', '_').encode(site.encoding()))
+ # The URL prefixes under which a link to a MediaWiki-namespace page may appear.
+ mw_url = site.path() + "?title=" + quotedMwNs + ":"
+ altmw_url = site.path() + "/" + quotedMwNs + ":"
+ nicemw_url = site.nice_get_address(quotedMwNs + ":")
+ shortmw_url = "/" + quotedMwNs + ":"
+ ismediawiki = lambda url:url and (url.startswith(mw_url)
+ or url.startswith(altmw_url)
+ or url.startswith(nicemw_url)
+ or url.startswith(shortmw_url))
+ # we will save the found key:value pairs here
+ dictionary = {}
+ try:
+ for keytag in soup('a', href=ismediawiki):
+ # Key strings only contain ASCII characters, so we can save them as
+ # strs
+ key = str(keytag.find(text=True))
+ keyrow = keytag.parent.parent
+ # "orig" rows carry the value in the following "new" row; "def" rows
+ # carry it in their own second cell.
+ if keyrow['class'] == "orig":
+ valrow = keyrow.findNextSibling('tr')
+ assert valrow['class'] == "new"
+ # NOTE(review): this statement is truncated in this archived copy.
+ value = unicode(
+ elif keyrow['class'] == 'def':
+ value = unicode(keyrow('td')[1].string).strip()
+ else:
+ raise AssertionError("Unknown tr class value: %s" % keyrow['class'])
+ dictionary[key] = value
+ except Exception, e:
+ # Dump the fetched page for debugging, then re-raise.
+ wikipedia.debugDump( 'MediaWiki_Msg', site, u'%s: %s while processing URL: %s' % (repr(e), str(e), unicode(path)), allmessages)
+ raise
+ # Save the dictionary to disk
+ # The file is stored in the mediawiki-messages subdir. Create if necessary.
+ if dictionary == {}:
+ wikipedia.debugDump( 'MediaWiki_Msg', site, u'Error URL: '+unicode(path), allmessages )
+ sys.exit()
+ else:
+ # NOTE(review): the first element of the tuple is missing in this archived copy.
+ f = open(makepath('mediawiki-messages/mediawiki-messages-%s-%s.dat' % (, site.lang)), 'w')
+ pickle.dump(dictionary, f)
+ f.close()
+ print "Loaded %i values from %s" % (len(dictionary.keys()), site)
+ #print dictionary['sitestatstext']
+def refresh_all_messages():
+ # Call refresh_messages() for every site that already has a message file in
+ # the 'mediawiki-messages' directory.
+ import dircache, time
+ filenames = dircache.listdir('mediawiki-messages')
+ # NOTE(review): the '.' before "dat" is unescaped and matches any character.
+ message_filenameR = re.compile('mediawiki-messages-([a-z:]+)-([a-z:]+).dat')
+ for filename in filenames:
+ match = message_filenameR.match(filename)
+ if match:
+ # NOTE(review): both right-hand sides are missing in this archived copy;
+ # presumably the two regex groups -- confirm against the repository.
+ family =
+ lang =
+ site = wikipedia.getSite(code = lang, fam = family)
+ refresh_messages(site)
+def main():
+ # Command-line front end: print one message when a key is given, otherwise
+ # refresh all sites (-all) or the current site (-refresh), else show help.
+ refresh_all = False
+ refresh = False
+ key = None
+ for arg in wikipedia.handleArgs():
+ if arg == '-all':
+ refresh_all = True
+ elif arg == '-refresh':
+ refresh = True
+ else:
+ # Any other argument is taken as a message key; the last one wins.
+ key = arg
+ # A key takes precedence over both refresh options.
+ if key:
+ wikipedia.output(get(key), toStdout = True)
+ elif refresh_all:
+ refresh_all_messages()
+ elif refresh:
+ refresh_messages(wikipedia.getSite())
+ else:
+ wikipedia.showHelp('mediawiki_messages')
+# Entry point: run main() and call wikipedia.stopme() whether or not it raised.
+if __name__ == "__main__":
+ try:
+ main()
+ except:
+ wikipedia.stopme()
+ raise
+ else:
+ wikipedia.stopme()
Copied: archive/trunk/ (from rev 9461, trunk/pywikipedia/archive/
--- archive/trunk/ (rev 0)
+++ archive/trunk/ 2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,95 @@
+This script with all its functions has been merged into another script;
+please use:
+ -count
+xqt 2009-10-30
+This script checks references to see if they are properly formatted. Right now
+it just counts the total number of transclusions of any number of given templates.
+NOTE: This script is not capable of handling the <ref></ref> syntax. It just
+handles the {{ref}} syntax, which is still used, but DEPRECATED on the English
+Syntax: python command [arguments]
+Command line options:
+-count Counts the number of times each template (passed in as an argument)
+ is transcluded.
+-namespace: Filters the search to a given namespace. If this is specified
+ multiple times it will search all given namespaces
+Counts how many times {{ref}} and {{note}} are transcluded in articles.
+ python -count ref note -namespace:0
+__version__ = '$Id$'
+import wikipedia, config
+import replace, pagegenerators
+import re, sys, string
+templates = ['ref', 'note', 'ref label', 'note label', 'reflist']
+class ReferencesRobot:
+    """Counts the transclusions of templates and prints a summary."""
+
+    def countRefs(self, templates, namespaces):
+        """
+        Count the pages transcluding each of the given templates.
+
+        templates  - template names without the namespace prefix
+        namespaces - namespaces to restrict the count to; when empty, pages
+                     from all namespaces are counted
+
+        The result is written with wikipedia.output(): two header lines
+        followed by one '<template>: <count>' line per template.
+        """
+        mysite = wikipedia.getSite()
+        prefix = mysite.template_namespace()+':'
+        report = [u'Number of transclusions per template',
+                  u'------------------------------------']
+        for name in templates:
+            templatePage = wikipedia.Page(mysite, prefix + name)
+            referrers = pagegenerators.ReferringPageGenerator(templatePage, onlyTemplateInclusion = True)
+            if namespaces:
+                referrers = pagegenerators.NamespaceFilterPageGenerator(referrers, namespaces)
+            # The generator has to be consumed to learn how many pages it yields.
+            total = 0
+            for referrer in referrers:
+                total += 1
+            report.append(u'%s: %d' % (name, total))
+        for reportLine in report:
+            wikipedia.output(reportLine)
+
+def main():
+    """
+    Command line entry point.
+
+    -count          count the transclusions of the templates given as
+                    further arguments (default: the module-level 'templates')
+    -namespace:X    restrict the count to namespace X; may be repeated.
+                    X is stored as an integer when it parses as one,
+                    otherwise as given.
+
+    Without -count the help text is shown.
+    """
+    doCount = False
+    argsList = []
+    namespaces = []
+    for arg in wikipedia.handleArgs():
+        if arg == '-count':
+            doCount = True
+        elif arg.startswith('-namespace:'):
+            try:
+                namespaces.append(int(arg[len('-namespace:'):]))
+            except ValueError:
+                namespaces.append(arg[len('-namespace:'):])
+        else:
+            argsList.append(arg)
+
+    if doCount:
+        robot = ReferencesRobot()
+        if not argsList:
+            # Work on a copy: 'reflist' may be removed below, and that must
+            # not alter the module-level default list.
+            argsList = list(templates)
+        choice = ''
+        if 'reflist' in argsList:
+            wikipedia.output(u'NOTE: it will take a long time to count "reflist".')
+            choice = wikipedia.inputChoice(u'Proceed anyway?', ['yes', 'no', 'skip'], ['y', 'n', 's'], 'y')
+            if choice == 's':
+                argsList.remove('reflist')
+        if choice != 'n':
+            robot.countRefs(argsList, namespaces)
+    else:
+        wikipedia.showHelp('refcheck')
+
+if __name__ == "__main__":
+    try:
+        main()
+    finally:
+        wikipedia.stopme()
Copied: archive/trunk/ (from rev 9461, trunk/pywikipedia/archive/
--- archive/trunk/ (rev 0)
+++ archive/trunk/ 2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,289 @@
+# -*- coding: utf-8 -*-
+Reads a cur SQL dump and offers a generator over SQLentry objects which can be
+used by other bots. Each SQLentry object represents a page.
+Can also be run directly from the command line to retrieve page lists from
+an SQL dump.
+ python -sql:filename.sql action
+Where action can be one of these:
+* find - List pages which contain a certain text
+* findr - List pages containing text matching a regular expression
+* shortpages - List pages with short contents
+* unmountedcats - List categories that don't have a supercategory
+* percentnames - List pages that contain internal links where special
+ characters are encoded as hexadecimal codes, e.g. %F6
+* baddisambiguations - Created for de.wikipedia to fix primary topic
+ disambiguations (Begriffsklärung nach Modell 2).
+# (C) Daniel Herding, 2004
+# Distributed under the terms of the MIT license.
+__version__ = '$Id:'
+from __future__ import generators
+import re, time
+import wikipedia, config
+class SQLentry(object):
+    '''
+    Represents a wiki page, read from an SQL dump.
+
+    An instance of this class will have the following attributes:
+    * self.id is the page ID (integer)
+    * self.namespace is the namespace ID (integer)
+    * self.title is the page title without namespace (unicode)
+    * self.text is the text on that page (unicode)
+    * self.comment is the last edit summary (unicode)
+    * self.userid is the last editor's ID (integer)
+    * self.username is the last editor's username (unicode)
+    * self.timestamp is the time of the last edit (time tuple)
+    * self.restrictions is True if the page is locked (boolean)
+    * self.counter is the # of page views, disabled on Wikimedia wikis (integer)
+    * self.redirect is True if the page is a redirect (boolean)
+    * self.minor is True if the last edit was marked as minor (boolean)
+    * self.new is True if the last edit was the first one (boolean)
+    * self.random is a random number used for the 'Random Page' function (float)
+    * self.touched is the date of the last cache update (string, unparsed)
+
+    See the documentation of MediaWiki's cur table for details.
+    '''
+
+    # MediaWiki escapes apostrophes, backslashes and quotes with backslashes.
+    # This matches a backslash followed by one of these three characters.
+    _quoteEscapeR = re.compile(r'\\([\\\"\'])')
+    # Same as above, but also matches the escaped line breaks \n and \r.
+    _textEscapeR = re.compile(r'\\([\\\"\'nr])')
+
+    def __init__(self, id, namespace, title, text, comment, userid, username, timestamp, restrictions, counter, redirect, minor, new, random, inversetimestamp, touched):
+        '''
+        Constructor. All parameters should be strings, as read from the SQL
+        dump. This function will convert them to formats which are more
+        appropriate for the data types.
+        '''
+        self.id = int(id)
+        self.namespace = int(namespace)
+        self.userid = int(userid)
+        # convert to a 9-dimensional time tuple, see the time module
+        self.timestamp = time.strptime(timestamp, '%Y%m%d%H%M%S')
+        # convert to boolean
+        self.restrictions = (restrictions != '')
+        self.counter = int(counter)
+        self.redirect = (redirect == '1')
+        self.minor = (minor == '1')
+        self.new = (new == '1')
+        self.random = float(random)
+        # Inversetimestamp is obsolete, so we ignore it.
+        # Basically, touched should be converted to a time tuple like
+        # timestamp. But in the nds: dump touched comes before
+        # inversetimestamp, and that would cause strptime to crash.
+        # So touched is left as it is, in the hope that this is the only
+        # case where entries are mixed up.
+        self.touched = touched
+        # Undo the backslash escaping. Page text and edit summaries can
+        # contain line breaks; titles and usernames are assumed not to.
+        self.title = self._unescape(title, False)
+        self.text = self._unescape(text, True)
+        self.comment = self._unescape(comment, True)
+        self.username = self._unescape(username, False)
+
+    def _unescape(self, value, linebreaks):
+        '''
+        Returns value with the dump's backslash escapes removed.
+
+        Escaped apostrophes, quotes and backslashes are always decoded. If
+        linebreaks is True, \\n and \\r are converted to a newline and a
+        carriage return as well. Everything is decoded in a single pass, so
+        that an escaped backslash followed by the letter n or r (e.g. in
+        '\\\\nu') stays a backslash plus letter instead of becoming a line
+        break.
+        '''
+        if not linebreaks:
+            return self._quoteEscapeR.sub(r"\1", value)
+        def decode(match):
+            char = match.group(1)
+            if char == 'n':
+                return '\n'
+            elif char == 'r':
+                return '\r'
+            else:
+                return char
+        return self._textEscapeR.sub(decode, value)
+
+    def full_title(self, underline = True):
+        '''
+        Returns the full page title in the form 'namespace:title', using the
+        localized namespace titles defined in your family file.
+        If underline is True, returns the page title with underlines instead of
+        spaces; otherwise underlines in the title are replaced by spaces.
+        If the site has no name for the namespace, only the title is returned.
+        '''
+        if not underline:
+            title = self.title.replace('_', ' ')
+        else:
+            title = self.title
+        namespace_title = wikipedia.getSite().namespace(self.namespace)
+        if namespace_title is None:
+            return title
+        else:
+            if underline:
+                namespace_title = namespace_title.replace(' ', '_')
+            return namespace_title + ':' + title
+
+    def age(self):
+        '''
+        Returns the time passed since the last edit, in relation to the current
+        system time, in seconds (floating point number).
+        '''
+        return time.time() - time.mktime(self.timestamp)
+
+class SQLdump(object):
+    '''
+    Represents one SQL dump file.
+
+    The constructor only remembers the file name and encoding. The file is
+    opened and parsed with a regular expression each time the entries()
+    generator is run, which yields one SQLentry object per page. The
+    query_* generators filter the result of entries().
+    '''
+    def __init__(self, filename, encoding):
+        self.filename = filename
+        self.encoding = encoding
+
+    def entries(self):
+        '''
+        Generator which reads one line at a time from the SQL dump file, and
+        parses it to create SQLentry objects. Stops when the end of file is
+        reached.
+        '''
+        # This regular expression will match one SQL database entry (i.e. a
+        # page), and each group represents an attribute of that entry.
+        # NOTE: We don't need re.DOTALL because newlines are escaped.
+        pageR = re.compile(r"\((\d+),"      # cur_id             (page ID number)
+                         + r"(\d+),"        # cur_namespace      (namespace number)
+                         + r"'(.*?)',"      # cur_title          (page title w/o namespace)
+                         + r"'(.*?)',"      # cur_text           (page contents)
+                         + r"'(.*?)',"      # cur_comment        (last edit's summary text)
+                         + r"(\d+),"        # cur_user           (user ID of last contributor)
+                         + r"'(.*?)',"      # cur_user_text      (user name)
+                         + r"'(\d{14})',"   # cur_timestamp      (time of last edit)
+                         + r"'(.*?)',"      # cur_restrictions   (protected pages have 'sysop' here)
+                         + r"(\d+),"        # cur_counter        (view counter, disabled on WP)
+                         + r"([01]),"       # cur_is_redirect
+                         + r"([01]),"       # cur_minor_edit
+                         + r"([01]),"       # cur_is_new
+                         + r"([\d\.]+?),"   # cur_random         (for random page function)
+                         + r"'(\d{14})',"   # inverse_timestamp  (obsolete)
+                         + r"'(\d{14})'\)") # cur_touched        (cache update timestamp)
+        print 'Reading SQL dump'
+        # Open the file, read it using the given encoding, and replace invalid
+        # characters with question marks.
+        import codecs
+        f = codecs.open(self.filename, 'r', encoding = self.encoding, errors='replace')
+        while True:
+            # Read only one (very long) line because we would risk out of memory
+            # errors if we read the entire file at once
+            line = f.readline()
+            if line == '':
+                print 'End of file.'
+                break
+            # NOTE: nothing is assigned to self.entries here. Doing so would
+            # replace this method on the instance and break every later
+            # call of entries().
+            # The 16 groups of pageR are in the order of SQLentry's
+            # constructor parameters.
+            for fields in pageR.findall(line):
+                yield SQLentry(*fields)
+        f.close()
+
+    def query_percentnames(self):
+        '''
+        yields pages that contain internal links where special characters are
+        encoded as hexadecimal codes, e.g. %F6
+        '''
+        Rpercentlink = re.compile(r'\[\[[^\]]*?%[A-F0-9][A-F0-9][^\]]*?\]\]')
+        for entry in self.entries():
+            # Interwiki links are removed before searching.
+            text = wikipedia.removeLanguageLinks(entry.text)
+            if Rpercentlink.search(text):
+                yield entry
+
+    def query_shortpages(self, minsize):
+        '''
+        yields articles (namespace 0, no redirects) whose text is shorter
+        than minsize characters
+        '''
+        for entry in self.entries():
+            if entry.namespace == 0 and not entry.redirect and len(entry.text) < minsize:
+                yield entry
+
+    def query_find(self, keyword):
+        '''
+        yields pages which contain keyword
+        '''
+        for entry in self.entries():
+            if entry.text.find(keyword) != -1:
+                yield entry
+
+    def query_findr(self, regex, namespace = None):
+        '''
+        yields pages which contain a string matching the given regular
+        expression; if namespace is given, only pages in that namespace
+        '''
+        r = re.compile(regex)
+        for entry in self.entries():
+            if r.search(entry.text) and (namespace is None or entry.namespace == namespace):
+                yield entry
+
+    def query_unmountedcats(self):
+        '''
+        yields categories (namespace 14) which don't have any supercategory,
+        i.e. whose text contains no link starting with one of the site's
+        category namespace names
+        '''
+        for entry in self.entries():
+            if entry.namespace == 14:
+                has_supercategory = False
+                for ns in wikipedia.getSite().category_namespaces():
+                    if entry.text.find('[[%s:' % ns) != -1:
+                        has_supercategory = True
+                        break
+                if not has_supercategory:
+                    yield entry
+
+def query(sqldump, action):
+    '''
+    Generator which runs the query named by action on sqldump and yields
+    the matching SQLentry objects.
+
+    action is one of 'percentnames', 'shortpages', 'find', 'findr',
+    'unmountedcats' and 'baddisambiguations' (the singular
+    'baddisambiguation' is accepted as well). 'shortpages', 'find' and
+    'findr' ask the user for their parameter. An unknown action yields
+    nothing.
+    '''
+    if action == 'percentnames':
+        for entry in sqldump.query_percentnames():
+            yield entry
+    elif action == 'shortpages':
+        minsize = int(wikipedia.input(u'Minimum size:'))
+        for entry in sqldump.query_shortpages(minsize):
+            yield entry
+    elif action == 'find':
+        keyword = wikipedia.input(u'Search for:')
+        for entry in sqldump.query_find(keyword):
+            yield entry
+    elif action == 'findr':
+        keyword = wikipedia.input(u'Search for:')
+        for entry in sqldump.query_findr(keyword):
+            yield entry
+    elif action == 'unmountedcats':
+        for entry in sqldump.query_unmountedcats():
+            yield entry
+    # The help text spells this action with a trailing 's'; accept both.
+    elif action in ('baddisambiguation', 'baddisambiguations'):
+        for entry in sqldump.entries():
+            # Articles with a parenthesized title whose text starts with an
+            # italic (but not bold) line.
+            if entry.namespace == 0 and entry.title.endswith(')') and entry.text.startswith("''") and not entry.text.startswith("'''"):
+                yield entry
+
+if __name__=="__main__":
+    wikipedia.stopme() # No need to have me on the stack, as I'm not contacting the wiki
+    import sys
+    action = None
+    filename = None
+    for arg in sys.argv[1:]:
+        arg = wikipedia.argHandler(arg, 'sqldump')
+        if not arg:
+            # argHandler gave nothing back for this argument
+            continue
+        if not arg.startswith('-sql'):
+            # every remaining argument names the action; the last one wins
+            action = arg
+        elif len(arg) == 4:
+            # plain '-sql' without a file name: ask for it
+            filename = wikipedia.input(u'Please enter the SQL dump\'s filename: ')
+        else:
+            # '-sql:<filename>'
+            filename = arg[5:]
+
+    if filename and action:
+        sqldump = SQLdump(filename, wikipedia.myencoding())
+        for entry in query(sqldump, action):
+            wikipedia.output(u'*[[%s]]' % entry.full_title())
+    else:
+        # both a dump file and an action are required; show the help text
+        wikipedia.output(__doc__, 'utf-8')
Copied: archive/trunk/ (from rev 9461, trunk/pywikipedia/archive/
--- archive/trunk/ (rev 0)
+++ archive/trunk/ 2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,61 @@
+This script with all its functions has been merged into another script;
+please use:
+ -test
+xqt 2009-10-26
+Script to test whether you are logged-in
+ -all Try to test on all sites where a username is defined in
+ -sysop test your sysop account. (Works only with -all)
+# (C) Rob W.W. Hooft, 2003
+# Distributed under the terms of the MIT license.
+import re,sys,wikipedia,config
+def show (mysite, sysop = False):
+    """
+    Print whether the bot is logged in on mysite.
+
+    If sysop is True, the sysop account is checked instead of the normal
+    one. When logged in, the message includes the value returned by
+    mysite.loggedInAs().
+    """
+    if not mysite.loggedInAs(sysop = sysop):
+        wikipedia.output(u"You are not logged in on %s." % repr(mysite))
+        return
+    wikipedia.output(u"You are logged in on %s as %s." % (repr(mysite), mysite.loggedInAs(sysop=sysop)))
+
+def main():
+    """
+    Command line entry point.
+
+    -all    report the login state on every site listed in the user-config
+            (config.usernames, or config.sysopnames together with -sysop)
+    -sysop  check the sysop account instead of the normal one
+
+    Any other argument shows the help text and ends the script. Without
+    -all only the default site is checked.
+    """
+    testall = False
+    sysop = False
+    for arg in wikipedia.handleArgs():
+        if arg == "-all":
+            testall = True
+        elif arg == "-sysop":
+            sysop = True
+        else:
+            wikipedia.showHelp()
+            return
+
+    if not testall:
+        show(wikipedia.getSite(), sysop)
+        return
+
+    # Pick the account table that matches the requested kind of account.
+    if sysop:
+        namedict = config.sysopnames
+    else:
+        namedict = config.usernames
+    for familyName in namedict.iterkeys():
+        for lang in namedict[familyName].iterkeys():
+            show(wikipedia.getSite(lang, familyName), sysop)
+
+if __name__ == "__main__":
+    try:
+        main()
+    finally:
+        wikipedia.stopme()
Copied: archive/trunk/ (from rev 9461, trunk/pywikipedia/archive/
--- archive/trunk/ (rev 0)
+++ archive/trunk/ 2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,465 @@
+# -*- coding: utf-8 -*-
+This module translates a string from one language to another, using
+translations given in a hard-coded dictionary. Various dictionaries exist for
+different types of text; e.g. type 'geography' is for tables about places and
+regions, and 'city' is for tables about cities and villages.
+For each table type, there can be three lists:
+* translations - direct replacements. Work in either direction, e.g. if
+ the bot knows that he should replace 'Location' with 'Ligging'
+ when translating from English to Dutch, he can also translate
+ it from Dutch to English.
+* regexes - regular expression replacements. These are more powerful than
+ direct replacements as they support wildcards etc., but only
+ work in one direction.
+* includes - one type can include all items from another type, e.g. when
+ translating a text of the type 'city', the bot also tries to
+ apply the translations and regexes given for type 'geography'
+ because 'city' includes 'geography'.
+# (C) Daniel Herding, 2004
+# Distributed under the terms of the MIT license.
+__version__='$Id:,v 1.21 2005/12/21 17:51:26 wikipedian Exp $'
+types = {
+ # translations for images (inside other tables)
+ "images": {
+ "translations": [
+ { "en":"[[image:", "de":"[[bild:", "nl":"[[afbeelding:", "fr":"[[image:", "af":"[[beeld:" },
+ { "en":"[[Image:", "de":"[[Bild:", "nl":"[[Afbeelding:", "fr":"[[Image:", "af":"[[Beeld:" },
+ { "en":"larger image", "de":u"Bild vergrößern", "nl":"grotere versie", "fr":u"En détail", "af":"In detail" },
+ { "en":"larger image", "de":u"Bild vergrößern", "nl":"groter", "fr":u"En détail", "af":"In detail" },
+ # usually used as link description for articles about flags, coats of arms etc.
+ { "en":"Details", "de":u"Details", "nl":"details", "fr":u"Détails", "af":"Details" },
+ ],
+ },
+ # translations for taxoboxes (for biology articles)
+ "taxo": {
+ "translations": [
+ # Background colors for table headers, with or without quotation marks (taxoboxes on de: all have quotation marks)
+ { "en":"bgcolor=pink", "de":"bgcolor=\"#ffc0c0\"", "nl":"bgcolor=#EEEEEE", "fr":"bgcolor=pink" },
+ { "en":"bgcolor=\"pink\"", "de":"bgcolor=\"#ffc0c0\"", "nl":"bgcolor=\"#EEEEEE\"", "fr":"bgcolor=\"pink\"" },
+ # second table header (below the image)
+ { "en":"[[Scientific classification]]", "de":"[[Systematik (Biologie)|Systematik]]", "nl":"[[Taxonomie|Wetenschappelijke classificatie]]", "fr":u"Classification [[systématique]]" },
+ # main taxobox content
+ { "en":"[[Domain (biology)|Domain]]:", "de":u"''[[Domäne (Biologie)|Domäne]]:''", "nl":"[[Domain (biologie)|Domain]]:", "fr":"??? (domain)" },
+ { "en":"Domain:", "de":u"''[[Domäne (Biologie)|Domäne]]:''", "nl":"[[Domain (biologie)|Domain]]:", "fr":"??? (domain)" },
+ { "en":"[[Kingdom (biology)|Kingdom]]:", "de":"''[[Reich (Biologie)|Reich]]:''", "nl":"[[Rijk (biologie)|Rijk]]:", "fr":u"[[Règne (biologie)|Règne]]:", },
+ { "en":"Kingdom:", "de":"''[[Reich (Biologie)|Reich]]:''", "nl":"[[Rijk (biologie)|Rijk]]:", "fr":u"[[Règne (biologie)|Règne]]:", },
+ { "en":"[[Division (biology)|Division]]:", "de":"''[[Abteilung (Biologie)|Abteilung]]:''", },
+ { "en":"Division:", "de":"''[[Abteilung (Biologie)|Abteilung]]:''", },
+ { "en":"[[Phylum (biology)|Phylum]]:", "de":"''[[Stamm (Biologie)|Stamm]]:''", "nl":"[[Stam (biologie)|Stam]]:", "fr":"[[Embranchement]]:", },
+ { "en":"Phylum:", "de":"''[[Stamm (Biologie)|Stamm]]:''", "nl":"[[Stam (biologie)|Stam]]:", "fr":"[[Embranchement]]:", },
+ { "en":"[[Subphylum]]:", "de":"''[[Unterstamm]]:''", "nl":"[[Substam (biologie)|Substam]]:", "fr":"[[Sous-embranchement]]:", },
+ { "en":"Phylum:", "de":"''[[Unterstamm]]:''", "nl":"[[Substam (biologie)|Substam]]:", "fr":"[[Sous-embranchement]]:", },
+ { "en":"[[Superclass (biology)|Superclass]]:", "de":u"''[[Klasse (Biologie)|Überklasse]]:''", "nl":"[[Superklasse (biologie)|Superklasse]]:", "fr":"[[Super-classe (biologie)|Super-classe]]:", },
+ { "en":"Superclass:", "de":u"''[[Klasse (Biologie)|Überklasse]]:''", "nl":"[[Superklasse (biologie)|Superklasse]]:", "fr":"[[Super-classe (biologie)|Super-classe]]:", },
+ { "en":"[[Class (biology)|Class]]:", "de":"''[[Klasse (Biologie)|Klasse]]:''", "nl":"[[Klasse (biologie)|Klasse]]:", "fr":"[[Classe (biologie)|Classe]]:", },
+ { "en":"Class:", "de":"''[[Klasse (Biologie)|Klasse]]:''", "nl":"[[Klasse (biologie)|Klasse]]:", "fr":"[[Classe (biologie)|Classe]]:", },
+ { "en":"[[Subclass]]:", "de":"''[[Klasse (Biologie)|Unterklasse]]:''", "nl":"[[Onderklasse]]:", "fr":"[[Sous-classe (biologie)|Sous-classe]]:", },
+ { "en":"Subclass:", "de":"''[[Klasse (Biologie)|Unterklasse]]:''", "nl":"[[Onderklasse]]:", "fr":"[[Sous-classe (biologie)|Sous-classe]]:", },
+ { "en":"[[Order (biology)|Superorder]]:", "de":u"''[[Ordnung (Biologie)|Überordnung]]:''", "nl":"[[Superorde]]:", },
+ { "en":"[[Order (biology)|Order]]:", "de":"''[[Ordnung (Biologie)|Ordnung]]:''", "nl":"[[Orde (biologie)|Orde]]:", "fr":"[[Ordre (biologie)|Ordre]]:" },
+ { "en":"Order:", "de":"''[[Ordnung (Biologie)|Ordnung]]:''", "nl":"[[Orde (biologie)|Orde]]:", "fr":"[[Ordre (biologie)|Ordre]]:" },
+ { "en":"[[Suborder]]:", "de":"''[[Ordnung (Biologie)|Unterordnung]]:''", "nl":"[[Infraorde (biologie)|Infraorde]]:", "fr":"[[Sous-ordre (biologie)|Sous-ordre]]:", },
+ { "en":"Suborder:", "de":"''[[Ordnung (Biologie)|Unterordnung]]:''", "nl":"[[Infraorde (biologie)|Infraorde]]:", "fr":"[[Sous-ordre (biologie)|Sous-ordre]]:", },
+ { "en":"[[Family (biology)|Family]]:", "de":"''[[Familie (Biologie)|Familie]]:''", "nl":"[[Familie (biologie)|Familie]]:", "fr":"[[Famille (biologie)|Famille]]:", },
+ { "en":"Family:", "de":"''[[Familie (Biologie)|Familie]]:''", "nl":"[[Familie (biologie)|Familie]]:", "fr":"[[Famille (biologie)|Famille]]:", },
+ { "en":"[[Subfamily (biology)|Subfamily]]:", "de":"''[[Familie (Biologie)|Unterfamilie]]:''", "nl":"[[Onderfamilie]]:", "fr":"[[Sous-famille (biologie)|Sous-famille]]:", },
+ { "en":"Subfamily:", "de":"''[[Familie (Biologie)|Unterfamilie]]:''", "nl":"[[Onderfamilie]]:", "fr":"[[Sous-famille (biologie)|Sous-famille]]:", },
+ { "en":"[[Tribe (biology)|Tribe]]:", "de":"''[[Tribus (Biologie)|Tribus]]:''", "nl":"[[Tak (biologie)|Tak]]:", "fr":"??? (Tribus)" },
+ { "en":"Tribe:", "de":"''[[Tribus (Biologie)|Tribus]]:''", "nl":"[[Tak (biologie)|Tak]]:", "fr":"??? (Tribus)" },
+ { "en":"[[Genus]]:", "de":"''[[Gattung (Biologie)|Gattung]]:''", "nl":"[[Geslacht (biologie)|Geslacht]]:", "fr":"[[Genre]]:" },
+ { "en":"Genus:", "de":"''[[Gattung (Biologie)|Gattung]]:''", "nl":"[[Geslacht (biologie)|Geslacht]]:", "fr":"[[Genre]]:" },
+ { "en":"[[Subgenus]]:", "de":"''[[Gattung (Biologie)|Untergattung]]:''", "nl":"[[Ondergeslacht]]:", "fr":"??? (Sous-genre)" },
+ { "en":"Subgenus:", "de":"''[[Gattung (Biologie)|Untergattung]]:''", "nl":"[[Ondergeslacht]]:", "fr":"??? (Sous-genre)" },
+ { "en":"[[Species]]:", "de":"''[[Art (Biologie)|Art]]:''", "nl":"[[Soort]]:", "fr":u"[[Espèce]]:" },
+ { "en":"Species:", "de":"''[[Art (Biologie)|Art]]:''", "nl":"[[Soort]]:", "fr":u"[[Espèce]]:" },
+ # table headers for subdivisions of the current group
+ { "en":"[[Class (biology)|Classes]]", "de":"[[Klasse (Biologie)|Klassen]]", "nl":"[[Klasse (biologie)|Klassen]]", },
+ { "en":"[[Order (biology)|Orders]]", "de":"[[Ordnung (Biologie)|Ordnungen]]", "nl":"[[Orde (biologie)|Orden]]", "fr":"[[Ordre (biologie)|Ordres]]" },
+ { "en":"[[Suborder]]s", "de":"[[Ordnung (Biologie)|Unterordnungen]]", "nl":"[[Infraorde (biologie)|Infraorden]]:", "fr":"[[Sous-ordre (biologie)|Sous-ordres]]", },
+ { "en":"[[Family (biology)|Families]]", "de":"[[Familie (Biologie)|Familien]]", "nl":"[[Familie (biologie)|Families]]", "fr":"[[Famille (biologie)|Familles]]", },
+ { "en":"[[Genus|Genera]]", "de":"[[Gattung (Biologie)|Gattungen]]", "nl":"[[Geslacht (biologie)|Geslachten]]", "fr":"[[Genre (biologie)|Genre]]" },
+ { "en":"[[Species]]", "de":"[[Art (Biologie)|Arten]]", "nl":"[[Soort]]en", "fr":u"??? (Espèces)" },
+ { "en":"[[Species]] (incomplete)", "de":"[[Art (Biologie)|Arten (Auswahl)]]", "nl":"[[Soort]]en (incompleet)", "fr":u"??? (Espèces (sélection))" },
+ # table headers for nl: style taxoboxes (current group is listed in a special section at the bottom)
+ { "en":"[[Order (biology)|Order]]", "de":"[[Ordnung (Biologie)|Ordnung]]", "nl":"[[Orde (biologie)|Orde]]", "fr":"[[Ordre (biologie)|Ordre]]" },
+ { "en":"[[Family (biology)|Family]]", "de":"[[Familie (Biologie)|Familie]]", "nl":"[[Familie (biologie)|Familie]]", "fr":"[[Famille (biologie)|Famille]]", },
+ { "en":"[[Genus]]", "de":"[[Gattung (Biologie)|Gattung]]", "nl":"[[Geslacht (biologie)|Geslacht]]", "fr":"[[Genre]]" },
+ { "en":"[[Species]]", "de":"[[Art (Biologie)|Art]]", "nl":"[[Soort]]", "fr":u"[[Espèce]]" },
+ ],
+ "regexes": {
+ "en": {
+ # de: doesn't have conservation status infos
+ "\{\{msg\:Status[^\}]+\}\}": {"de":"", },
+ },
+ },
+ "includes": ["images", "taxo_categories"],
+ },
+ # this should only include classes etc. which appear very often, not every species!
+ "taxo_categories": {
+ "translations": [
+ # kingdoms
+ { "en":"[[Animal]]ia", "de":"[[Tiere]] (Animalia)", "nl":"Dieren (''[[Animalia]]'')", },
+ { "en":"[[Plant]]ae", "de":"[[Pflanzen]] (Plantae)", },
+ # divisions
+ { "en":"[[flowering plant|Magnoliophyta]]", "de":u"[[Blütenpflanzen]] (Magnoliophyta)", },
+ # phylums
+ { "en":"[[Anthropod]]a", "de":u"[[Gliederfüßler]] (Anthropoda)", },
+ { "en":"[[Chordata]]", "de":"[[Chordatiere]] (Chordata)", "nl":"Chordadieren (''[[Chordata]]'')", },
+ { "en":"[[Chordate|Chordata]]", "de":"[[Chordatiere]] (Chordata)", "nl":"Chordadieren (''[[Chordata]]'')", },
+ # subphylums
+ { "en":"[[Vertebrata]]", "de":"[[Wirbeltiere]] (Vertebrata)", "nl":"Gewervelden (''[[Vertebrata]]'')", },
+ # superclasses
+ # classes
+ { "en":"[[Aves]]", "de":u"[[Vögel]] (Aves)", "nl":"Vogels (''[[Aves]]'')", },
+ { "en":"[[Insect]]a", "de":"[[Insekten]] (Insecta)", },
+ { "en":"[[Mammal]]ia", "de":u"[[Säugetiere]] (Mammalia)", "nl":"Zoogdieren (''[[Mammalia]]'')", },
+ { "en":"[[Mammalia]]", "de":u"[[Säugetiere]] (Mammalia)", "nl":"Zoogdieren (''[[Mammalia]]'')", },
+ { "en":"[[dicotyledon|Magnoliopsida]]", "de":u"Zweikeimblättrige (Magnoliopsida)", },
+ { "de":"Reptilien (Reptilia)", "nl":"Reptielen (''[[Reptilia]]'')", },
+ ],
+ "regexes": {
+ "de": {
+ # change [[Hunde]] (Canidae) to Hunde (''[[Canidae]]'') for nl:
+ # and to [[Canidae]] for en:
+ "\[\[(?P<german>[^\[]+)\]\] \((?P<latin>.+)\)": {"en":"[[\g<latin>]]", "nl":"\g<german> (\'\'[[\g<latin>]]\'\')", },
+ },
+ "nl": {
+ # change Knaagdieren (''[[Rodentia]]'') to [[Knaagdieren]] (Rodentia)
+ "(?P<dutch>[a-zA-Z ]+) \(\[\[\'\'(?P<latin>[^\[]+)\'\'\]\]\)": {"de":"[[\g<dutch>]] (\g<latin>)", },
+ "(?P<dutch>[a-zA-Z ]+) \(\'\'\[\[(?P<latin>[^\[]+)\]\]\'\'\)": {"de":"[[\g<dutch>]] (\g<latin>)", },
+ "(?P<dutch>[a-zA-Z ]+) \(\[\[\<i\>(?P<latin>[^\[]+)\<\/i\>\]\]\)": {"de":"[[\g<dutch>]] (\g<latin>)", },
+ "(?P<dutch>[a-zA-Z ]+) \(\<i\>\[\[(?P<latin>[^\[]+)\]\]\<\/i\>\)": {"de":"[[\g<dutch>]] (\g<latin>)", },
+ },
+ },
+ },
+ # plants get the same table color as animals on de:, but on en: they are green instead of pink
+ "plant": {
+ "translations": [
+ { "en":"bgcolor=lightgreen", "de":"bgcolor=\"#ffc0c0\"", },
+ { "en":"bgcolor=\"lightgreen\"", "de":"bgcolor=\"#ffc0c0\"", },
+ ],
+ "includes": ["taxo"],
+ },
+ # regular expressions for number formats
+ "numbers": {
+ "translations": [
+ # miljoen shouldn't be abbreviated on nl:
+ { "en":"mill.", "de":"Mio.", "nl":"miljoen", },
+ { "en":"bill.", "de":"Mrd." },
+ ],
+ "regexes": {
+ "fr": {
+ # fr uses or space to separate thousands, de uses dots
+ # note: this doesn't work for numbers > 1,000,000, don't know why
+ "(?P<pre>\d+)\ (?P<block>\d\d\d)": {"de":"\g<pre>.\g<block>", },
+ "(?P<pre>\d+) (?P<block>\d\d\d)": {"de":"\g<pre>.\g<block>", },
+ },
+ "en": {
+ # de uses dots to separate thousands, en uses commas
+ # de uses commas to indicate floating point numbers, en uses dots
+ # switch both - temporary placeholder required
+ "(?P<pre>\d+)\,(?P<block>\d\d\d)": {"de":"\g<pre>TEMPORARY_DOT\g<block>", },
+ "(?P<pre>\d+)\.(?P<block>\d+)": {"de":"\g<pre>,\g<block>", },
+ "TEMPORARY\_DOT": {"de":".", },
+ },
+ "de": {
+ # de uses dots to separate thousands, en uses commas
+ # de uses commas to indicate floating point numbers, en uses dots
+ # switch both - temporary placeholder required
+ "(?P<pre>\d+)\.(?P<block>\d\d\d)": {"en":"\g<pre>TEMPORARY_COMMA\g<block>", },
+ "(?P<pre>\d+)\,(?P<block>\d+)": {"en":"\g<pre>.\g<block>", },
+ "TEMPORARY\_COMMA": {"en":",", },
+ },
+ },
+ },
+ "months": {
+ "translations": [
+ { "sl":"januar", "it":"gennaio", "en":"January", "de":"Januar", "fr":"janvier", "nl":"januari", "af":"Januarie"},
+ { "sl":"februar", "it":"febbraio", "en":"February", "de":"Februar", "fr":u"février", "nl":"februari", "af":"Februarie"},
+ { "sl":"marec", "it":"marzo", "en":"March", "de":u"März", "fr":"mars", "nl":"maart", "af":"Maart"},
+ { "sl":"april", "it":"aprile", "en":"April", "de":"April", "fr":"avril", "nl":"april", "af":"April"},
+ { "sl":"maj", "it":"maggio", "en":"May", "de":"Mai", "fr":"mai", "nl":"mei", "af":"Mei"},
+ { "sl":"junij", "it":"giugno", "en":"June", "de":"Juni", "fr":"juin", "nl":"juni", "af":"Junie"},
+ { "sl":"julij", "it":"luglio", "en":"July", "de":"Juli", "fr":"juillet", "nl":"juli", "af":"Julie"},
+ { "sl":"avgust", "it":"agosto", "en":"August", "de":"August", "fr":u"août", "nl":"augustus", "af":"Augustus"},
+ { "sl":"september", "it":"settembre", "en":"September", "de":"September", "fr":"septembre", "nl":"september", "af":"September"},
+ { "sl":"oktober", "it":"ottobre", "en":"October", "de":"Oktober", "fr":"octobre", "nl":"oktober", "af":"Oktober"},
+ { "sl":"november", "it":"novembre", "en":"November", "de":"November", "fr":"novembre", "nl":"november", "af":"November"},
+ { "sl":"december", "it":"dicembre", "en":"December", "de":"Dezember", "fr":u"décembre", "nl":"december", "af":"Desember"},
+ ]
+ },
+ # conversion between number formats
+ "dates": {
+ "regexes": {
+ "de": {
+ # and format
+ "(?P<day>\d\d).(?P<month>\d\d).(?P<year>(\d\d)+)": {"nl":"\g<day>-\g<month>-\g<year>", },
+ },
+ },
+ },
+ # units of measurement etc.
+ # only for internal use
+ "units": {
+ "translations": [
+ { "en":"[[Square kilometre|km²]]", "de":"[[Quadratkilometer|km²]]", "nl":"[[Vierkante kilometer|km²]]", },
+ { "en":u"[[Square kilometre|km²]]", "de":u"[[Quadratkilometer|km²]]", "nl":u"[[Vierkante kilometer|km²]]", },
+ { "en":"as of ", "de":"Stand: ", },
+ { "en":"years", "de":"Jahre", "nl":"jaar"},
+ ]
+ },
+ # general geographical terms etc.
+ # only for internal use
+ "geography": {
+ "translations": [
+ # header
+ { "en":"Base data", "de":"Basisdaten", "nl":"Basisgegevens", "fr":"Informations", },
+ { "en":"[[Area]]:", "de":u"[[Fläche]]:", "nl":"Oppervlakte:", "fr":"[[Superficie]]:", "eo":"Areo:",},
+ { "en":"[[Population]]:", "de":"[[Einwohner]]:", "nl":"Inwoneraantal:", "fr":u"[[Population]]:", "eo":u"Logantaro:", },
+ { "en":"[[Population density]]:", "de":u"[[Bevölkerungsdichte]]:", "nl":"[[Bevolkingsdichtheid]]:", },
+ { "en":"inh./km²", "de":"Einw./km²", "nl":"inw./km²", "fr":"hab/km²", },
+ { "en":u"inh./km²", "de":u"Einw./km²", "nl":u"inw./km²", "fr":u"hab/km²", },
+ { "en":"inhabitants/km²", "de":"Einwohner/km²", "nl":"inwoners / km²", },
+ { "en":u"inhabitants/km²", "de":u"Einwohner/km²", "nl":u"inwoners / km²", },
+ { "en":"inhabitants per km²", "de":"Einwohner pro km²", "nl":"inwoners per km²", },
+ { "en":u"inhabitants per km²", "de":u"Einwohner pro km²", "nl":u"inwoners per km²", },
+ { "en":"inh.", "de":"Einw.", "nl":"inw.", "fr":"hab.", },
+ { "en":"above [[sea level]]", "de":u"ü. [[Normalnull|NN]]", "nl":"boven [[Normaal Amsterdams Peil|NAP]]", },
+ { "en":"location", "de":"Geografische Lage", "nl":"Ligging", "fr":"Localisation", },
+ # longitude, latitude
+ { "en":"' north", "de":u"' nördlicher Breite", "nl":"' NB" },
+ { "en":"' north", "de":u"' nördl. Breite", "nl":"' NB" },
+ { "en":"' north", "de":"' n. Br.", "nl":"' NB" },
+ { "en":"' east", "de":u"' östlicher Länge", "nl":"' OL" },
+ { "en":"' east", "de":u"' östl. Länge", "nl":"' OL" },
+ { "en":"' east", "de":u"' ö. L.", "nl":"' OL" },
+ { "en":"Map", "de":"Karte", "nl":"Kaart", },
+ { "en":"Coat of Arms", "de":"Wappen", "nl":"Wapen", "fr":"Blason" },
+ ],
+ "includes": ["units"],
+ },
+ "city": {
+ "translations": [
+ { "en":"[[Location]]:", "de":"[[Geografische Lage]]:", "nl":"Ligging", },
+ { "en":"[[Altitude]]:", "de":u"[[Höhe]]:", "nl":"Hoogte:", },
+ { "en":"Highest point:", "de":u"Höchster Punkt:", "nl":"Hoogste punt:",},
+ { "en":"Lowest point:", "de":"Niedrigster Punkt:", "nl":"Laagste punt:"},
+ { "en":"[[Postal code]]:", "de":"[[Postleitzahl]]:", "nl":"[[Postcode]]:", },
+ { "en":"[[Postal code]]s:", "de":"[[Postleitzahl]]en:", "nl":"[[Postcode]]s:", },
+ { "en":"[[Area code]]:", "de":"[[Telefonvorwahl|Vorwahl]]:", "nl":"[[Netnummer]]:", },
+ { "en":"[[Area code]]s:", "de":"[[Telefonvorwahl|Vorwahlen]]:", "nl":"[[Netnummer]]s:", },
+ { "en":"[[License plate]]:", "de":"[[KFZ-Kennzeichen]]:", "nl":"[[Autonummerbord]]:", },
+ { "en":"[[License plate]]:", "de":"[[Kfz-Kennzeichen]]:", "nl":"[[Autonummerbord]]:", },
+ { "en":"City structure:", "de":"Gliederung des Stadtgebiets:", "nl":"Ondergemeentelijke indeling:", },
+ # town hall snail mail address
+ { "en":"Municipality's address:", "de":"Adresse der Gemeindeverwaltung:", "nl":"Adres gemeentehuis:", },
+ # city hall snail mail address
+ { "en":"Municipality's address:", "de":"Adresse der Stadtverwaltung:", "nl":"Adres stadhuis:", },
+ { "en":"Website:", "de":"Webseite:", "nl":"Website:" },
+ { "en":"Website:", "de":"Website:", "nl":"Website:" },
+ { "en":"E-Mail adress:", "de":"[[E-Mail]]-Adresse:", "nl":"Email-adres:", },
+ { "en":"E-Mail adress:", "de":"E-Mail-Adresse:", "nl":"Email-adres:", },
+ # table header
+ { "en":"Politics", "de":"Politik", "nl":"Politiek", },
+ # female mayor
+ { "en":"[[Mayor]]:", "de":u"[[Bürgermeister]]in:", "nl":"[[Burgemeester]]:", },
+ { "en":"[[Mayor]]:", "de":u"[[Bürgermeisterin]]:", "nl":"[[Burgemeester]]:", },
+ # male mayor
+ { "en":"[[Mayor]]:", "de":u"[[Bürgermeister]]:", "nl":"[[Burgemeester]]:", },
+ { "en":"Governing [[Political party|party]]:", "de":"Regierende [[Politische Partei|Partei]]", "nl":"Regerende partij", },
+ { "en":"Governing [[Political party|parties]]:", "de":"Regierende [[Politische Partei|Parteien]]", "nl":"Regerende partijen", },
+ { "en":"Majority [[Political party|party]]:", "de":"[[Politische Partei|Mehrheitspartei]]", "nl":"Meerderheidspartij"},
+ { "en":"Debts:", "de":"Schulden:", },
+ { "en":"[[Unemployment]]:", "de":"[[Arbeitslosenquote]]:", "nl":"Werkloosheidspercentage:", },
+ { "de":u"[[Ausländeranteil]]:", "nl":"Percentage buitenlanders", },
+ { "en":"Age distribution:", "de":"Altersstruktur:", "nl":"Leeftijdsopbouw:", },
+ { "de":"Stadtteile", "nl":"wijken"},
+ { "de":"[[Stadtbezirk]]e", "nl":"deelgemeenten" },
+ { "de":"Stadtbezirke", "nl":"deelgemeenten" },
+ { "en":"Independent", "de":"Parteilos", "nl":"geen partij" },
+ { "en":"Region", "de":"[[Region]]", "nl":"Landstreek" },
+ ],
+ "includes": ["images", "geography", "numbers"],
+ },
+ # translations for cities in Germany
+ "city-de": {
+ "translations": [
+ { "en":"[[Bundesland]]:", "de":"[[Bundesland]]:", "nl":"[[Deelstaat (Duitsland)|Deelstaat]]", },
+ { "en":"[[Regierungsbezirk]]:", "de":"[[Regierungsbezirk]]:", "nl":"[[Regierungsbezirk]]:", },
+ { "en":"[[District]]:", "de":"[[Landkreis|Kreis]]:", "nl":"[[District]]", },
+ { "en":"[[District]]:", "de":"[[Landkreis]]:", "nl":"[[District]]", },
+ { "en":"district-free town", "de":"[[kreisfreie Stadt]]", "nl":"[[stadsdistrict]]", },
+ { "en":"District-free town", "de":"[[Kreisfreie Stadt]]", "nl":"[[Stadsdistrict]]", },
+ { "en":"District-free town", "de":"[[Stadtkreis]]", "nl":"[[Stadsdistrict]]", },
+ { "en":"[[Municipality key]]:", "de":"[[Amtliche Gemeindekennzahl]]:", },
+ { "en":"[[Municipality key]]:", "de":u"[[Amtlicher Gemeindeschlüssel]]:", },
+ { "en":"urban districts", "de":"[[Stadtbezirk]]e", "nl":"stadsdelen", },
+ # female first mayor, no exact translation in en:
+ { "en":"[[Mayor]]:", "de":u"[[Oberbürgermeisterin]]:", "nl":"[[Burgemeester]]:"},
+ { "en":"[[Mayor]]:", "de":u"[[Oberbürgermeister]]in:", "nl":"[[Burgemeester]]:"},
+ # male first mayor, no exact translation in en:
+ { "en":"[[Mayor]]:", "de":u"[[Oberbürgermeister]]:", "nl":"[[Burgemeester]]:"},
+ # "bis" is used between postal codes
+ { "en":" to ", "de":" bis ", "nl":"t/m"},
+ # some cities have demographic info which is titled "Bevölkerung" (population). The spaces are important
+ # because "Bevölkerung" is also a substring of "Bevölkerungsdichte" (population density).
+ { "de":u" Bevölkerung ", "nl":" Demografie ", },
+ # parties
+ { "en":"[[Christian Democratic Union of Germany|CDU]]", "de":"[[CDU]]", "nl":"[[Christlich Demokratische Union|CDU]]"},
+ { "en":"[[Social Democratic Party of Germany|SPD]]", "de":"[[SPD]]", "nl":"[[Sozialdemokratische Partei Deutschlands|SPD]]"},
+ { "en":"[[Christian Social Union in Bavaria|CSU]]", "de":"[[CSU]]", "nl":"[[CSU]]"},
+ { "en":"[[Free Democratic Party of Germany|FDP]]", "de":"[[FDP (Deutschland)|FDP]]", "nl":"[[FDP]]"},
+ { "en":u"[[German Green Party|Bündnis 90/Die Grünen]]", "de":u"[[Bündnis 90/Die Grünen]]", "nl":u"[[Die Grünen]]"},
+ { "en":"[[Party of Democratic Socialism|PDS]]", "de":"[[PDS]]", "nl":"[[PDS]]"},
+ # Bundeslaender
+ { "en":"[[Bavaria]]", "de":"[[Bayern]]", "nl":"[[Beieren]]"},
+ { "en":"[[Bremen (state)|Bremen]]", "de":"[[Bremen (Land)|Bremen]]", "nl":"[[Bremen]]"},
+ { "en":"[[Hesse]]", "de":"[[Hessen]]", "nl":"[[Hessen]]"},
+ { "en":"[[Mecklenburg-Western Pomerania]]", "de":"[[Mecklenburg-Vorpommern]]", "nl":"[[Mecklenburg-Voorpommeren]]"},
+ { "en":"[[Lower Saxony]]", "de":"[[Niedersachsen]]", "nl":"[[Nedersaksen]]"},
+ { "en":"[[North Rhine-Westphalia]]", "de":"[[Nordrhein-Westfalen]]", "nl":"[[Noordrijn-Westfalen]]"},
+ { "en":"[[Rhineland-Palatinate]]", "de":"[[Rheinland-Pfalz]]", "nl":"[[Rijnland-Palts]]"},
+ { "en":"[[Saxony]]", "de":"[[Sachsen (Bundesland)|Sachsen]]", "nl":"[[Saksen (deelstaat)|Saksen]]"},
+ { "en":"[[Saxony-Anhalt]]", "de":"[[Sachsen-Anhalt]]", "nl":"[[Saksen-Anhalt]]"},
+ { "en":"[[Schleswig-Holstein]]", "de":"[[Schleswig-Holstein]]", "nl":"[[Sleeswijk-Holstein]]"},
+ { "en":"[[Thuringia]]", "de":u"[[Thüringen]]", "nl":u"[[Thüringen]]",},
+ ],
+ "regexes": {
+ "de": {
+ # image alt text
+ "Deutschlandkarte, (?P<city>.+) markiert": {"en":"Map of Germany, \g<city> marked", "nl":"Kaart van Duitsland met de locatie van \g<city>", },
+ "Karte Deutschlands, (?P<city>.+) markiert": {"en":"Map of Germany, \g<city> marked", "nl":"Kaart van Duitsland met de locatie van \g<city>", },
+ "Karte (?P<city>.+) in Deutschland": {"en":"Map of Germany, \g<city> marked", "nl":"Kaart van Duitsland met de locatie van \g<city>", },
+ # nl: doesn't want Municipality Number
+ u"\|[-]+ bgcolor=\"#FFFFFF\"[\r\n]+\| *\[\[Amtliche( Gemeindekennzahl|r Gemeindeschlüssel)\]\]\:[ \|\r\n]+[\d -]+[\r\n]+": { "nl":"", },
+ },
+ },
+ "includes": ["city", "dates"],
+ },
+ # French départements
+ "dep": {
+ "translations": [
+ # some entries on fr: lack colons, others have spaces before the colons.
+ { "de":"[[Region (Frankreich)|Region]]:", "fr":u"[[Régions françaises|Région]] :", "eo":"[[Francaj regionoj|Regiono]]:", },
+ { "de":"[[Region (Frankreich)|Region]]:", "fr":u"[[Régions françaises|Région]]:", "eo":"[[Francaj regionoj|Regiono]]:", },
+ { "de":u"[[Präfektur (Frankreich)|Präfektur]]:", "fr":u"[[Préfecture]] :", "eo":"[[Prefektejo]]:" },
+ { "de":u"[[Präfektur (Frankreich)|Präfektur]]:", "fr":u"[[Préfecture]]:", "eo":"[[Prefektejo]]:"},
+ { "de":u"[[Unterpräfektur]]en:", "fr":u"[[Sous-préfecture]]s :", },
+ { "de":u"[[Unterpräfektur]]en:", "fr":u"[[Sous-préfecture]]s:", },
+ { "de":u"[[Unterpräfektur]]:", "fr":u"[[Sous-préfecture]] :", },
+ { "de":u"[[Unterpräfektur]]:", "fr":u"[[Sous-préfecture]]:", },
+ { "de":"insgesamt", "fr":"Totale", },
+ # the next three items are already in the list "geography", but someone forgot the colons on fr:
+ { "de":u"[[Einwohner]]:", "fr":u"[[Population]]", "eo":u"Lo\u011dantaro:", },
+ { "de":u"[[Bevölkerungsdichte|Dichte]]:", "fr":u"[[Densité de population|Densité]]", },
+ { "de":u"[[Fläche]]:", "fr":"[[Superficie]]", "eo":"Areo:", },
+ # another workaround for a forgotten colon
+ { "de":"''</small>:", "fr":"''</small>", },
+ { "de":"[[Arrondissement]]s:", "fr":"[[Arrondissement]]s", },
+ { "de":"[[Kanton (Frankreich)|Kantone]]:", "fr":u"[[Cantons français|Cantons]]", },
+ { "de":"[[Kommune (Frankreich)|Kommunen]]:", "fr":"[[Communes de France|Communes]]", },
+ { "de":u"Präsident des<br>[[Generalrat (Frankreich)|Generalrats]]:",
+ "fr":u"[[Président du Conseil général|Président du Conseil<br> général]]", },
+ ],
+ "regexes": {
+ "fr": {
+ "\[\[[aA]rrondissements (des |du |de la |de l\'|d\'|de )": {"de":u"[[Arrondissements im Département ", },
+ "\[\[[cC]ommunes (des |du |de la |de l\'|d\'|de )": {"de":u"[[Kommunen im Département ", },
+ "\[\[[cC]antons (des |du |de la|de l\'|d\'|de )": {"de":u"[[Kantone im Département ", },
+ "Blason (des |du |de la |de l\'|d\'|de )": {"de":"Wappen von ", },
+ # image alt text
+ "Localisation (des |du |de la |de l\'|d\'|de )(?P<dep>.+?) en France": {"de":"Lage von \g<dep> in Frankreich", },
+ },
+ },
+ "includes": ["numbers", "images", "geography"],
+ },
+import wikipedia, string, re
class Global(object):
    """Module-wide settings for the translator."""

    # When True, print_debug() writes its messages; translate() switches
    # this on when it is called with debug_mode=True.
    debug = False
def print_debug(text):
    """Print text on the screen only if in debug mode.

    Argument text should be raw unicode. Nothing is printed (and None is
    returned) while Global.debug is False.
    """
    if Global.debug:
        wikipedia.output(text)
def translate(text, type, from_lang, debug_mode=False, to_lang=None):
    """Translate known phrases in text from from_lang to to_lang.

    The translation list called `type` is looked up in the module-level
    `types` dictionary and applied in three steps:

    1. every entry of its "translations" list that has a from_lang text
       occurring in `text` is replaced by its to_lang text;
    2. every regular expression listed under "regexes" for from_lang that
       has a to_lang replacement is substituted;
    3. every list named in "includes" is applied recursively.

    Arguments:
      text       -- the string to translate
      type       -- name of the translation list; "" means "do nothing"
      from_lang  -- language code of the source text
      debug_mode -- if True, status messages are displayed
      to_lang    -- target language code; defaults to the language of
                    wikipedia.getSite()

    Returns the translated string. If `type` is not a known translation
    list, a warning is printed and `text` is returned unchanged, so that an
    unknown name in an "includes" list cannot wipe out the text translated
    so far.
    """
    if to_lang is None:
        to_lang = wikipedia.getSite().lang
    if debug_mode:
        # NOTE: this flag is module-wide and is never switched off again, so
        # later calls keep printing debug output.
        Global.debug = True
    if type == "":
        return text

    print_debug("\n Translating type " + type)
    # check if the translation database knows this type of table
    if type not in types:
        print("Unknown table type: " + type)
        return text
    table = types[type]

    if "translations" in table:
        print_debug("\nDirect translations for type " + type + "\n")
        for item in table["translations"]:
            # check if the translation database includes the source language
            if from_lang not in item:
                print_debug(from_lang + " translation for item not found in translation table, skipping item")
                continue
            source = item[from_lang]
            # nothing to do unless the phrase actually occurs in the text
            if source not in text:
                continue
            # check if the translation database includes the target language
            if to_lang not in item:
                print_debug("Can't translate \"" + source + "\". Please make sure that there is a translation in the translation table.")
            else:
                print_debug(source + " => " + item[to_lang])
                text = text.replace(source, item[to_lang])

    if "regexes" in table:
        print_debug("\nWorking on regular expressions for type " + type + "\n")
        for pattern, targets in table["regexes"].get(from_lang, {}).items():
            # only work on regular expressions that have a replacement for
            # the target language
            if to_lang not in targets:
                continue
            replacement = targets[to_lang]
            regex = re.compile(pattern)
            # Searching first keeps the debug output quiet for patterns that
            # do not match. The loop repeats because a substitution may
            # expose a new match.
            while regex.search(text):
                print_debug(pattern + " => " + replacement)
                replaced = regex.sub(replacement, text)
                if replaced == text:
                    # the replacement reproduces its own match; stop instead
                    # of looping forever
                    break
                text = replaced

    # recursively use translation lists which are included in the current list
    for inc in table.get("includes", []):
        text = translate(text, inc, from_lang, debug_mode, to_lang)
    return text
Copied: archive/trunk/ (from rev 9461, trunk/pywikipedia/archive/
--- archive/trunk/ (rev 0)
+++ archive/trunk/ 2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,147 @@
+# -*- coding: utf-8 -*-
+Script to replace bad Windows-1252 (cp1252) characters with
+HTML entities on ISO 8859-1 wikis. Don't run this script on a UTF-8 wiki.
+Syntax: python windows_chars.py [pageTitle] [-file[:filename]] [-sql[:filename]]
+Command line options:
+ -file:XYZ reads a list of pages, which can for example be gotten through
+ Looxix's robot. XYZ is the name of the file from which the
+ list is taken. If XYZ is not given, the user is asked for a
+ filename.
+ Page titles should be in [[double-square brackets]].
+ -sql:XYZ reads a local SQL cur dump, available from the Wikimedia
+ download server. Searches for pages with
+ Windows-1252 characters, and tries to repair them on the live
+ wiki. Example:
+ python windows_chars.py -sql:20040711_cur_table.sql.sql -lang:es
+# (C) Daniel Herding, 2004
+# Distributed under the terms of the MIT license.
+__version__='$Id:,v 1.27 2005/12/21 17:51:26 wikipedian Exp $'
+import wikipedia, config
+import replace, pagegenerators
+import re, sys
# Summary message: edit summary per language code, passed to
# wikipedia.translate() in main().
msg = {
    'en': u'robot: changing Windows-1252 characters to HTML entities',
    'fa': u'ربات: تغییر نویسههای Windows-1252 به نهادهای اچتیامال',
    'de': u'Bot: Wandle Windows-1252-Zeichen in HTML-Entitäten um',
    'fr': u'Bot: Modifie caracteres Windows-1252 vers entités HTML',
    'he': u'רובוט: משנה תווים בקידוד Windows-1252 ליישויות HTML',
    'ia': u'Robot: modification de characteres Windows-1252 a entitates HTML',
}
# Characters that are in Windows-1252, but not in ISO 8859-1, each paired
# with the HTML entity that replaces it. The entities are pure ASCII, so the
# result can be stored on an ISO 8859-1 wiki. Named HTML 4 entities are used
# where they exist; Z/z with caron have none and use numeric references.
replacements = [
    (u"\x80", u"&euro;"),    # euro sign
    (u"\x82", u"&sbquo;"),   # single low-9 quotation mark
    (u"\x83", u"&fnof;"),    # latin small f with hook = function = florin
    (u"\x84", u"&bdquo;"),   # double low-9 quotation mark
    (u"\x85", u"&hellip;"),  # horizontal ellipsis = three dot leader
    (u"\x86", u"&dagger;"),  # dagger
    (u"\x87", u"&Dagger;"),  # double dagger
    (u"\x88", u"&circ;"),    # modifier letter circumflex accent
    (u"\x89", u"&permil;"),  # per mille sign
    (u"\x8A", u"&Scaron;"),  # latin capital letter S with caron
    (u"\x8B", u"&lsaquo;"),  # single left-pointing angle quotation mark
    (u"\x8C", u"&OElig;"),   # latin capital ligature OE
    (u"\x8E", u"&#381;"),    # latin capital letter Z with caron
    (u"\x91", u"&lsquo;"),   # left single quotation mark
    (u"\x92", u"&rsquo;"),   # right single quotation mark
    (u"\x93", u"&ldquo;"),   # left double quotation mark
    (u"\x94", u"&rdquo;"),   # right double quotation mark
    (u"\x95", u"&bull;"),    # bullet = black small circle
    (u"\x96", u"&ndash;"),   # en dash
    (u"\x97", u"&mdash;"),   # em dash
    (u"\x98", u"&tilde;"),   # small tilde
    (u"\x99", u"&trade;"),   # trade mark sign
    (u"\x9A", u"&scaron;"),  # latin small letter s with caron
    (u"\x9B", u"&rsaquo;"),  # single right-pointing angle quotation mark
    (u"\x9C", u"&oelig;"),   # latin small ligature oe
    (u"\x9E", u"&#382;"),    # latin small letter z with caron
    (u"\x9F", u"&Yuml;"),    # latin capital letter Y with diaeresis
]
class SqlWindows1252PageGenerator:
    """
    opens a local SQL dump file, searches for pages with Windows-1252
    characters.
    """

    def __init__(self, filename):
        """Remember the path of the SQL dump; the file is opened lazily."""
        self.filename = filename

    def __iter__(self):
        """Yield one wikipedia.Page per dump entry that needs repairing.

        An entry is yielded once if its text contains at least one of the
        characters listed in the module-level `replacements` table.
        """
        # open SQL dump and read page titles out of it
        import sqldump
        # use a separate name so the imported module is not shadowed
        dump = sqldump.SQLdump(self.filename, 'latin-1')
        for entry in dump.entries():
            # `replacements` is a list of (character, entity) pairs, so the
            # characters have to be unpacked; a list has no .keys().
            for char, _entity in replacements:
                if entry.text.find(char) != -1:
                    page = wikipedia.Page(wikipedia.getSite(), entry.full_title())
                    yield page
                    break
class WindowsCharsBot:
    """Bot that applies the `replacements` table to the generated pages."""

    def __init__(self, generator):
        """Store the page generator whose pages will be processed."""
        self.generator = generator

    def run(self):
        """Hand the pages and the replacement table to a ReplaceRobot.

        The robot is run here as well; merely constructing it would leave
        every page untouched.
        """
        replaceBot = replace.ReplaceRobot(self.generator, replacements)
        replaceBot.run()
def main():
    """Parse the command line, build a page generator and run the bot.

    Recognised arguments are -file[:name] (read page titles from a text
    file), -sql[:name] (scan a local SQL dump); any other argument is
    taken as part of a single page title. Without a usable page source the
    help text is shown, and on UTF-8 wikis nothing is changed.
    """
    # this temporary array is used to read the page title.
    pageTitle = []
    gen = None

    for arg in sys.argv[1:]:
        arg = wikipedia.argHandler(arg, 'windows_chars')
        if not arg:
            continue
        if arg.startswith('-file'):
            if len(arg) == 5:
                filename = wikipedia.input(u'please enter the list\'s filename: ')
            else:
                filename = arg[6:]
            gen = pagegenerators.TextfilePageGenerator(filename)
        elif arg.startswith('-sql'):
            if len(arg) == 4:
                sqlfilename = wikipedia.input(u'please enter the SQL dump\'s filename: ')
            else:
                sqlfilename = arg[5:]
            gen = SqlWindows1252PageGenerator(sqlfilename)
        else:
            pageTitle.append(arg)

    # if a single page is given as a command line argument,
    # reconnect the title's parts with spaces
    if pageTitle:
        page = wikipedia.Page(wikipedia.getSite(), ' '.join(pageTitle))
        gen = iter([page])

    # get edit summary message
    wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg))

    if not gen:
        wikipedia.showHelp('windows_chars')
    elif wikipedia.getSite().encoding() == "utf-8":
        print("There is no need to run this robot on UTF-8 wikis.")
    else:
        preloadingGen = pagegenerators.PreloadingGenerator(gen)
        bot = WindowsCharsBot(preloadingGen)
        # creating the bot alone changes nothing; it has to be started
        bot.run()
# Script entry point: wikipedia.stopme() is called even when main() raises.
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()