http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9478
Revision: 9478
Author:   xqt
Date:     2011-08-29 15:11:50 +0000 (Mon, 29 Aug 2011)

Log Message:
-----------
moved to archive
Added Paths:
-----------
    archive/trunk/CommonsPictureOfTheDay.py
    archive/trunk/WdTXMLParser.py
    archive/trunk/are-identical.py
    archive/trunk/brackethttp.py
    archive/trunk/check_extern.py
    archive/trunk/copy_table.py
    archive/trunk/extract_names.py
    archive/trunk/featuredcount.py
    archive/trunk/getimages.py
    archive/trunk/mediawiki_messages.py
    archive/trunk/refcheck.py
    archive/trunk/sqldump.py
    archive/trunk/test.py
    archive/trunk/translator.py
    archive/trunk/windows_chars.py
Copied: archive/trunk/CommonsPictureOfTheDay.py (from rev 9461, trunk/pywikipedia/archive/CommonsPictureOfTheDay.py) =================================================================== --- archive/trunk/CommonsPictureOfTheDay.py (rev 0) +++ archive/trunk/CommonsPictureOfTheDay.py 2011-08-29 15:11:50 UTC (rev 9478) @@ -0,0 +1,112 @@ +# -*- coding: utf-8 -*- +''' +Put "Picture of the day" in your desktop wallpaper from Wikimedia Commons. + +For Windows system, do you need: +* Python 2.5 +* Pywin32 for Python 2.5 +* PIL for Python 2.5 + +For Linux system, do you need: +* Python and PIL + +''' + +from wikipedia import Site, Page, ImagePage +from PIL import Image, ImageDraw, ImageFont +import httplib, time, sys, os + +if sys.platform == 'win32': + import ctypes, win32con + from _winreg import * +else: + import gconf + +def get_commons_image(image): + headers = {"Accept": "image/jpg", + "Accept": "image/gif", + "Accept": "image/png", + "Accept": "image/svg", + } + conn = httplib.HTTPConnection('upload.wikimedia.org') + conn.request("GET", image, None, headers) + r = conn.getresponse() + data = r.read() + if sys.platform == 'win32': + arq = open("Picture_of_the_day.bmp","wb") # convert image "on the fly" to Windows Bitmap + else: + arq = open("Picture_of_the_day.png","wb") + arq.write(data) + arq.close() + conn.close() + +def write_gray(filename, text, outfilename): + img = Image.open(filename).convert("RGB") + write = Image.new("RGB", (img.size[0], img.size[1])) + draw = ImageDraw.ImageDraw(img) + size = 0 + while True: + size +=1 + try: + FONT = "C:\WINDOWS\Fonts\Verdana.ttf" + except IndexError: + FONT = "/usr/share/fonts/truetype/ttf-bitstream-vera/Verdana.ttf" # ubuntu + except IndexError: + FONT = "/usr/share/fonts/bitstream-vera/Vera.ttf" # fedora + except IndexError: + print "Please, report this problem to leogregianin@gmail.com" + sys.exit() + nextfont = ImageFont.truetype(FONT, size) + nexttextwidth, nexttextheight = nextfont.getsize(text) + if nexttextwidth+nexttextheight/3 > write.size[0]: break + font = nextfont + textwidth, textheight = nexttextwidth, nexttextheight + draw.setfont(font) + draw.text(((write.size[0]-textwidth)/55, (write.size[0]-textheight)/55), text, fill=(120,120,120)) + img.save(outfilename) + +def set_wallpaper(): + if sys.platform == 'win32': + SPI_SETDESKWALLPAPER = 20 + ctypes.windll.user32.SystemParametersInfoA(SPI_SETDESKWALLPAPER, 0, "Picture_of_the_day.bmp", 0) + else: + gconf.client_get_default().get_string('/desktop/gnome/background/picture_options', 'scaled') + gconf.client_get_default().get_string('/desktop/gnome/background/picture_filename', 'Picture_of_the_day.png') + +if __name__ == '__main__': + commons = Site('commons', 'commons') + date_today = time.strftime('%Y-%m-%d', time.localtime()) + template = 'Template:Potd/%s' % date_today + templatePage = Page(commons, template) + image_today = templatePage.get() + image_name = 'Image:%s'% image_today + imageURL = ImagePage(commons, image_name) + featuredImage = imageURL.fileUrl() + image = featuredImage[27:] + + if sys.platform == 'win32': + if image.endswith('.svg'): + sys.exit() # Windows background don't accept svg files + + ### Install CommonsPictureOfTheDay in registry + Reg = ConnectRegistry(None, HKEY_LOCAL_MACHINE) + Key = OpenKey(Reg, r"SOFTWARE\Microsoft\Windows\CurrentVersion\Run", 0, KEY_WRITE) + # entry your correct pywikipediabot patch + SetValueEx(Key,"CommonsPictureOfTheDay", 0, REG_SZ, r"C:\pywikipediabot\pywikipedia\CommonsPictureOfTheDay.py") + CloseKey(Key) + CloseKey(Reg) + + 
get_commons_image(image) + + write_gray('Picture_of_the_day.bmp', + 'http://commons.wikimedia.org/wiki/Commons:Picture_of_the_day', + 'Picture_of_the_day.bmp') + + set_wallpaper() + + else: + get_commons_image(image) + write_gray('Picture_of_the_day.png', + 'http://commons.wikimedia.org/wiki/Commons:Picture_of_the_day', + 'Picture_of_the_day.png') + set_wallpaper()
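For reference, the __main__ block above builds the lookup key for the day's picture purely from the local date; the same logic in isolation (standard library only, nothing beyond what the script itself does):

    import time

    date_today = time.strftime('%Y-%m-%d', time.localtime())
    template = 'Template:Potd/%s' % date_today
    print template   # e.g. Template:Potd/2011-08-29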
Copied: archive/trunk/WdTXMLParser.py (from rev 9461, trunk/pywikipedia/archive/WdTXMLParser.py)
===================================================================
--- archive/trunk/WdTXMLParser.py       (rev 0)
+++ archive/trunk/WdTXMLParser.py       2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,74 @@
+# -*- coding: iso-8859-1 -*-
+"""
+(C) 2003 Thomas R. Koll, tomk32@tomk32.de
+    Distributed under the terms of the MIT license.
+"""
+
+__version__='$Id: WdTXMLParser.py,v 1.3 2005/12/21 17:51:26 wikipedian Exp $'
+
+DEBUG = 0
+import re
+from xml.sax.handler import ContentHandler
+
+class WdTXMLParser(ContentHandler):
+
+    def __init__(self):
+        self.rTitle = re.compile('(.*): (.*)')
+        self.rLink = re.compile('.*[\r\n]*(http://.*)')
+        self.rCount = re.compile('.*: (\d*)')
+        self.inItem = 0
+        self.inTitle = 0
+        self.inLink = 0
+        self.inDescription = 0
+        self.tmp = {}
+        self.result = {}
+
+    def startDocument(self):
+        self.result = {}
+        self.tmp = {}
+
+    def endDocument(self):
+        return self.result
+
+    def startElement(self, name, attrs):
+        if name == 'item':
+            self.inItem = 1
+        if self.inItem == 1:
+            if name == 'title':
+                self.inTitle = 1
+            if name == 'link':
+                self.inLink = 1
+            if name == 'description':
+                self.inDescription = 1
+
+    def characters(self, characters):
+        if self.inItem:
+            if self.inTitle:
+                self.tmp['title'] = self.rTitle.match(characters).group(2)
+            if self.inLink:
+                self.tmp['link'] = self.rLink.match(characters).group(1)
+            if self.inDescription:
+                self.tmp['count'] = self.rCount.match(characters).group(1)
+
+    def endElement(self, name):
+        if name == 'item':
+            self.inItem = 0
+            self.result[self.tmp['title']] = {
+                'link'  : self.tmp['link'],
+                'count' : self.tmp['count']
+                }
+            self.tmp = {}
+        if name == 'title':
+            self.inTitle = 0
+        if name == 'link':
+            self.inLink = 0
+        if name == 'description':
+            self.inDescription = 0
+
+"""
+if self.date and self.link and self.count:
+    self.results[self.title] = {
+        'date'  : self.date,
+        'link'  : self.link,
+        'count' : self.count
+        }
+"""
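For context, WdTXMLParser is a SAX ContentHandler and only does something once it is fed to a parser. A minimal usage sketch (not part of the original script; the item layout of the feed is an assumption inferred from the regular expressions in __init__):

    import xml.sax

    sample = ('<rss><channel><item>'
              '<title>WdT: Example</title>'
              '<link>http://de.wikipedia.org/wiki/Example</link>'
              '<description>hits: 42</description>'
              '</item></channel></rss>')

    handler = WdTXMLParser()
    xml.sax.parseString(sample, handler)
    # result is keyed by the part of the title after the colon
    print handler.result['Example']['count']   # prints: 42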
Copied: archive/trunk/are-identical.py (from rev 9461, trunk/pywikipedia/archive/are-identical.py) =================================================================== --- archive/trunk/are-identical.py (rev 0) +++ archive/trunk/are-identical.py 2011-08-29 15:11:50 UTC (rev 9478) @@ -0,0 +1,81 @@ +""" +Simple bot to check whether two pages with the same name on different language +'pedias have interwiki links to the same page on another language. + +Call the script with 3 arguments: + + python are-identical.py lang1 lang2 name + +The script will either print "Yes" and return exit code 0, + or print "No" and return exit code 1, + or print "Both links are already present" + and return exit code 2, + or print "One links already present" + and return exit code 0. + +It may raise exceptions on pages that disappeared or whatever. This is +a simple framework at least for the moment. +""" +# +# (C) Rob Hooft, 2005 +# +# Distributed under the terms of the MIT license. +# +__version__='$Id: are-identical.py,v 1.3 2005/12/21 17:51:26 wikipedian Exp $' +# +from __future__ import generators + +import sys, wikipedia + +class TwoPageGenerator: + def __init__(self, lang1, lang2, name): + self.lang1 = lang1 + self.lang2 = lang2 + self.name = name + + def __iter__(self): + yield wikipedia.Page(wikipedia.getSite(self.lang1), self.name) + yield wikipedia.Page(wikipedia.getSite(self.lang2), self.name) + + +class IdenticalRobot: + def __init__(self, generator): + self.generator = generator + + def run(self): + arr = [] + for x in self.generator: + arr.append(x) + pg1 = arr[0] + pg2 = arr[1] + iw1 = pg1.interwiki() + iw2 = pg2.interwiki() + if pg2 in iw1 and pg1 in iw2: + print "Both links are already present" + sys.exit(2) + if pg2 in iw1 or pg1 in iw2: + print "One link already present" + sys.exit(0) + for iw in iw1: + if iw in iw2: + print "Yes" + sys.exit(0) + print "No" + sys.exit(1) + +def main(): + args = [] + for arg in sys.argv[1:]: + arg = wikipedia.argHandler(arg, 'are-identical') + if arg: + args.append(arg) + g = TwoPageGenerator(*args) + r = IdenticalRobot(g) + r.run() + +try: + main() +finally: + wikipedia.stopme() + +
Copied: archive/trunk/brackethttp.py (from rev 9461, trunk/pywikipedia/archive/brackethttp.py)
===================================================================
--- archive/trunk/brackethttp.py        (rev 0)
+++ archive/trunk/brackethttp.py        2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+'''
+Script to correct URLs like
+(http://www.example.org) to [http://www.example.org example.org]
+to have correct generation of links in Wikipedia
+'''
+
+__author__ = '(C) 2003 Thomas R. Koll, tomk32@tomk32.de'
+__license__ = 'Distributed under the terms of the MIT license.'
+__version__='$Id: brackethttp.py,v 1.13 2005/12/21 17:51:26 wikipedian Exp $'
+
+import re, sys
+import wikipedia
+
+myComment = {'ar':u'بوت: URL تم إصلاحها',
+             'en':u'Bot: URL fixed',
+             'fa':u'ربات: URL اصلاح شد',
+             'he':u'בוט: תוקנה כתובת URL',
+             'pt':u'Bot: URL corrigido',
+             'zh':u'機器人: 網址已修復',
+             }
+
+if __name__ == "__main__":
+    try:
+        for arg in sys.argv[1:]:
+            if wikipedia.argHandler(arg, 'brackethttp'):
+                pass
+            else:
+                pl = wikipedia.Page(wikipedia.getSite(), arg)
+                text = pl.get()
+
+                newText = re.sub("(http://([^ ]*[^] ])))", "[\1 \2])", text)
+
+                if newText != text:
+                    wikipedia.showDiff(text, newText)
+                    status, reason, data = pl.put(newText, wikipedia.translate(wikipedia.mylang, myComment))
+                    print status, reason
+                else:
+                    print "No bad link found"
+    except:
+        wikipedia.stopme()
+        raise
+wikipedia.stopme()
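The re.sub() line above has visibly lost its escaping in this listing. As a rough illustration of the transformation the docstring describes, here is a rewritten expression of my own (simplified: it reuses the full URL as the link label rather than deriving example.org from it), not the original one:

    import re

    text = 'See (http://www.example.org) for details.'
    fixed = re.sub(r'\((http://[^\s)]+)\)', r'[\1 \1]', text)
    print fixed   # See [http://www.example.org http://www.example.org] for details.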
Copied: archive/trunk/check_extern.py (from rev 9461, trunk/pywikipedia/archive/check_extern.py) =================================================================== --- archive/trunk/check_extern.py (rev 0) +++ archive/trunk/check_extern.py 2011-08-29 15:11:50 UTC (rev 9478) @@ -0,0 +1,223 @@ +# -*- coding: utf-8 -*- +""" +This bot is used for checking external links from Wikipedia. It checks +all external links in groups of 480 pages, gives the error code for each +that causes problems, and counts the number of links with and without +problems. + +It accepts all general Wikipediabot arguments as well as: +-start:xxx Check starting at 'xxx'. +-nolog Do not log to a file, only give output to a screen. + +Anything else is assumed to be a page that is to be checked. Spaces in +page titles have to be replaced by underscores, otherwise the bot assumes +the parts are separate pages. If no page has been specified and also no +-start argument has been provided, the bot acts as if -start:! had been +specified, starting at the beginning. + +The bot returns all links that have some problem, with the errorcode +provided by the server, or the artificial errorcode -1 if the server +could not be reached at all. Output is sent both to the screen and the +file check_extern.txt +""" + +# +# (C) Andre Engels, 2004 +# +# Distributed under the terms of the MIT license. +# + +__version__='$Id: check_extern.py,v 1.16 2005/12/21 17:51:26 wikipedian Exp $' + +import wikipedia, urllib, re, sys, httplib + +class URLerrorFinder(urllib.FancyURLopener): + version="RobHooftWikiRobot/1.0" + def open_http(self, url): + """Use HTTP protocol.""" + if isinstance(url, str): + host, selector = urllib.splithost(url) + if host: + user_passwd, host = urllib.splituser(host) + host = urllib.unquote(host) + realhost = host + else: + host, selector = url + urltype, rest = urllib.splittype(selector) + url = rest + user_passwd = None + if urltype.lower() != 'http': + realhost = None + else: + realhost, rest = splithost(rest) + if realhost: + user_passwd, realhost = splituser(realhost) + if user_passwd: + selector = "%s://%s%s" % (urltype, realhost, rest) + if proxy_bypass(realhost): + host = realhost + if not host: return -2 + h = httplib.HTTP(host) + h.putrequest('GET', selector) + if realhost: h.putheader('Host', realhost) + for args in self.addheaders: h.putheader(*args) + h.endheaders() + errcode, errmsg, headers = h.getreply() + return errcode + +# Which error codes do we not consider errors? 
+allowederrorcodes = [100,101,200,201,202,203,205,304] + +errname = { + -1:'No contact to server', + -2:'No host found', + 100:'Continue', + 101:'Switching Protocols', + 200:'OK', + 201:'Created', + 202:'Accepted', + 203:'Non-Authorative Information', + 204:'No Content', + 205:'Reset Content', + 206:'Partial Content', + 300:'Multiple Choices', + 301:'Moved Permanently', + 302:'Moved Temporarily', + 303:'See Other', + 304:'Not Modified', + 305:'Use Proxy', + 307:'Temporary Redirect', + 400:'Bad Request', + 401:'Unauthorized', + 402:'Payment Required', + 403:'Forbidden', + 404:'Not Found', + 405:'Method Not Allowed', + 406:'None Acceptable', + 407:'Proxy Authentication Required', + 408:'Request Timeout', + 409:'Conflict', + 410:'Gone', + 411:'Authorization Refused', + 412:'Precondition Failed', + 413:'Request Entity Too Large', + 414:'Request-URI Too Large', + 415:'Unsupported Media Type', + 416:'Requested Range not satisfiable', + 417:'Expectation Failed', + 500:'Internal Server Error', + 501:'Not Implemented', + 502:'Bad Gateway', + 503:'Service Unavailable', + 504:'Gateway Timeout', + 505:'HTTP Version not supported', + 8181:'Certificate Expired', + 12002:'Timeout', + 12007:'No such host', + 12029:'No connection', + 12031:'Connection Reset' + } + +def errorname(error): + # Given a numerical HTML error, give its actual identity + if error in errname: + return errname[error] + elif (error > 300) and (error < 400): + return 'Unknown Redirection Response' + else: + return 'Unknown Error' + +start = '!' +log = True +todo = [] +do_all = False + +for arg in sys.argv[1:]: + url=sys.argv[1] + arg = wikipedia.argHandler(arg, 'check_extern') + if arg: + if arg.startswith('-start:'): + start=arg[7:] + do_all=True + elif arg=='-nolog': + log = False + else: + mysite = wikipedia.getSite() + todo.append(wikipedia.Page(mysite,arg)) + +# Make sure we have the final site +mysite = wikipedia.getSite() + +if todo == []: + # No pages have been given; if also no start is given, we start at + # the beginning + do_all = True + +if log: + import logger + sys.stdout = logger.Logger(sys.stdout, filename = 'check_extern.log') + +cont = True +checked = 0 +working = 0 +nonworking = 0 +totalchecked = 0 + +try: + while cont: + print + i = 0 + if len(todo)<61 and do_all: + for pl in wikipedia.allpages(start = start): + todo.append(pl) + i += 1 + if i==480: + break + start = todo[len(todo)-1].title() + '_0' + # todo is a list of pages to do, donow are the pages we will be doing in this run. + if len(todo)>60: + # Take the first 60. + donow = todo[0:60] + todo = todo[60:] + else: + donow = todo + # If there was more to do, the 'if len(todo)<61' part would have extended + # todo beyond this size. + cont = False + try: + wikipedia.getall(mysite, donow) + except wikipedia.SaxError: + # Ignore this error, and get the pages the traditional way. 
+ pass + checked +=len(donow) + for pl in donow: + R = re.compile(r'http://%5B%5E%5Cs%7D<]]+[^\s.,:;)?!]}<]') + try: + for url in R.findall(pl.get()): + url = wikipedia.unicode2html(url,'ascii') + try: + error = URLerrorFinder().open(url) + except IOError: + error = -1 + if error in allowederrorcodes: + working += 1 + else: + nonworking += 1 + print + wikipedia.output(u'Page "%s" links to:'%pl.title()) + wikipedia.output(url) + wikipedia.output(u'Which gave error: %s %s'%(error,errorname(error))) + # If anything is wrong with the Wikipedia page, just ignore + except (wikipedia.NoPage,wikipedia.IsRedirectPage,wikipedia.LockedPage): + pass + if checked>499 or not cont: + totalchecked += 500 + checked -= 500 + print + print '======================================================================' + wikipedia.output(u'%s pages checked, last was [[%s]]'%(totalchecked+checked,donow[len(donow)-1])) + print 'In those pages there were %s correct and %s problematic external links.'%(working,nonworking) +except: + wikipedia.stopme() + raise +wikipedia.stopme()
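For reference, the working/problematic decision above is just a membership test against allowederrorcodes; a small illustration, assuming the definitions in this script are in scope:

    for code in (200, 304, 404, -1):
        if code in allowederrorcodes:
            print code, errorname(code), '-> counted as working'
        else:
            print code, errorname(code), '-> counted as problematic'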
Copied: archive/trunk/copy_table.py (from rev 9461, trunk/pywikipedia/archive/copy_table.py) =================================================================== --- archive/trunk/copy_table.py (rev 0) +++ archive/trunk/copy_table.py 2011-08-29 15:11:50 UTC (rev 9478) @@ -0,0 +1,230 @@ +# -*- coding: utf-8 -*- +""" +Script to copy a table from one Wikipedia to another one, translating it +on-the-fly. + +Syntax: + copy_table.py -type:abcd -from:xy Article_Name + +Command line options: + +-from:xy Copy the table from the Wikipedia article in language xy + Article must have interwiki link to xy + +-debug Show debug info, and don't send the results to the server + +-type:abcd Translates the table, using translations given below. + When the -type argument is not used, the bot will simply + copy the table as-is. + +-file:XYZ Reads article names from a file. XYZ is the name of the + file from which the list is taken. If XYZ is not given, the + user is asked for a filename. + Page titles should be saved one per line, without [[brackets]]. + The -pos parameter won't work if -file is used. + +-image Copy all images within the found table to the target Wikipedia. + Make sure the bot is logged in before trying to upload images. + +Article_Name: Name of the article where a table should be inserted + +""" +# +# (C) Daniel Herding, 2004 +# +# Distributed under the terms of the MIT license. +# +__version__='$Id: copy_table.py,v 1.31 2005/12/21 17:51:26 wikipedian Exp $' +# +import wikipedia, translator, lib_images +import re, sys, string + +# Summary message +msg={ + "ar":u"روبوت: نسخ الجدول من ", + "en":u"robot: copying table from ", + "de":u"Bot: Kopiere Tabelle von ", + "he":u"רובוט: מעתיק טבלה מתוך ", + "pt":u"Bot: Copiando tabela de ", + } + +# Prints text on the screen only if in -debug mode. +# Argument text should be raw unicode. +def print_debug(text): + if debug: + wikipedia.output(text) + + +# this is a modified version of wikipedia.imagelinks(), it only looks in text, not in the whole page. +def imagelinks(site, text): + image_ns = site.image_namespace() + # regular expression which matches e.g. "Image" as well as "image" (for en:) + im = '[' + image_ns[0].upper() + image_ns[0].lower() + ']' + image_ns[1:] + w1=r'('+im+':[^]|]*)' + w2=r'([^]]*)' + Rlink = re.compile(r'[['+w1+r'(|'+w2+r')?]]') + result = [] + for l in Rlink.findall(text): + result.append(l[0]) + return result + +# opens on a page, checks for an interwiki link, transfers and translates the first +# table, copies all images in that table. +def treat(to_pl, fromsite): + try: + to_text = to_pl.get() + interwikis = to_pl.interwiki() + except wikipedia.IsRedirectPage: + print "Can't work on redirect page." + return + except wikipedia.NoPage: + print "Page not found." + return + from_pl = None + for interwiki in interwikis: + if interwiki.site() == fromsite: + from_pl = interwiki + if from_pl is None: + print "Interwiki link to %s not found." % repr(fromsite) + return + from_text = from_pl.get() + wikipedia.setAction(wikipedia.translate(mysite.lang, msg) + from_pl.aslink()) + # search start of table + table = get_table(from_text) + if not table: + wikipedia.output(u"No table found in %s" % (from_pl.aslink())) + return + + print_debug(u"Copying images") + if copy_images: + # extract image links from original table + images=imagelinks(fromsite, table) + for image in images: + # Copy the image to the current wikipedia, copy the image description page as well. + # Prompt the user so that he can translate the filename. 
+ new_filename = lib_images.transfer_image(wikipedia.Page(fromsite, image), debug) + # if the upload succeeded + if new_filename: + old_image_tag = wikipedia.Page(fromsite, image).title() + new_image_tag = wikipedia.Page(mysite, mysite.image_namespace() + ":" + new_filename).title() + print_debug(u"Replacing " + old_image_tag + " with " + new_image_tag) + # We want to replace "Image:My pic.jpg" as well as "image:my_pic.jpg", so we need a regular expression. + old_image_tag = old_image_tag.replace(" ", "[ _]") + old_image_tag = "[" + old_image_tag[0].upper() + old_image_tag[0].lower() + "]" + old_image_tag[1:] + #todo: regex for first letter of filename, i.e. first letter after the colon + rOld_image_tag = re.compile(old_image_tag) + table = re.sub(old_image_tag, new_image_tag, table) + + + translated_table = translator.translate(table, type, fromsite.lang, debug, mysite.lang) + if not translated_table: + print "Could not translate table." + return + + print_debug(u"\n" + translated_table) + # add table to top of the article, seperated by a blank lines + to_text = translated_table + "\n\n" + to_text + if not debug: + # save changes on Wikipedia + to_pl.put(to_text, minorEdit='0') + + + + +# Regular expression that will match both <table and {| +startR = re.compile(r"<table|{|") +# Regular expression that will match both </table> and |} +endR = re.compile(r"</table>||}") + +# Finds the first table inside a text, including cascaded inner tables. +def get_table(text): + pos = 0 + # find first start tag + first_start_tag = re.search(startR, text) + if not first_start_tag: + return + else: + print_debug(u"First start tag found at " + str(first_start_tag.start())) + pos = first_start_tag.end() + # number of start tags minus numer of end tags + table_level = 1 + remaining_text = text + # until an end tag has been found for each start tag: + while table_level != 0: + # continue search after the last found tag + remaining_text = text[pos:] + next_start_tag = re.search(startR, remaining_text, pos) + next_end_tag = re.search(endR, remaining_text, pos) + if not next_end_tag: + print_debug(u"Error: missing end tag") + pass + # if another cascaded table is opened before the current one is closed + elif next_start_tag and next_start_tag.start() < next_end_tag.start(): + print_debug(u"Next start tag found at " + str(pos + next_start_tag.start())) + pos += next_start_tag.end() + table_level += 1 + print_debug(u"Table level is " + str(table_level)) + else: + print_debug(u"Next end tag found at " + str(pos + next_end_tag.start())) + pos += next_end_tag.end() + table_level -= 1 + print_debug(u"Table level is " + str(table_level)) + print_debug(u"Table starts at " + str(first_start_tag.start()) + " and ends at " + str(pos) +"\n") + print_debug(text[first_start_tag.start():pos]) + return text[first_start_tag.start():pos] + +if __name__=="__main__": + try: + # if the -file argument is used, page titles are dumped in this array. + # otherwise it will only contain one page. + page_list = [] + # if -file is not used, this temporary array is used to read the page title. 
+ page_title = [] + from_lang = "" + type = "" + debug = False + copy_images = False + + # read command line parameters + for arg in sys.argv[1:]: + arg = wikipedia.argHandler(arg, 'copy_table') + if arg: + if arg.startswith("-from"): + from_lang = arg[6:] + elif arg.startswith("-type:"): + type = arg[6:] + elif arg == "-debug": + debug = True + elif arg == "-image": + copy_images = True + elif arg.startswith('-file'): + if len(arg) == 5: + file = wikipedia.input(u'Please enter the list's filename: ') + else: + file = arg[6:] + # open file and read page titles out of it + f=open(file) + for line in f.readlines(): + if line != '\n': + page_list.append(line) + f.close() + else: + page_title.append(arg) + + # if the page name is given as a command line argument, + # connect the title's parts with spaces + if page_title != []: + page_title = ' '.join(page_title) + page_list.append(page_title) + + mysite = wikipedia.getSite() + fromsite = mysite.getSite(code=from_lang) + + for current_page_name in page_list: + thispl = wikipedia.Page(mysite, current_page_name) + treat(thispl, fromsite) + except: + wikipedia.stopme() + raise + wikipedia.stopme() +
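A side note on the two table-delimiter expressions defined next to get_table() above: their escaping appears lost in this listing, and as printed (r"<table|{|" and r"</table>||}") each contains an empty alternative that matches anywhere. A working equivalent for the HTML and wiki table delimiters would look like the following; this is my reconstruction, not necessarily the original wording:

    import re

    startR = re.compile(r'<table|\{\|')    # "<table" or the wiki-table opener "{|"
    endR = re.compile(r'</table>|\|\}')    # "</table>" or the wiki-table closer "|}"

    print bool(startR.search('{| class="wikitable"'))   # True
    print bool(endR.search('|}'))                       # True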
Copied: archive/trunk/extract_names.py (from rev 9461, trunk/pywikipedia/archive/extract_names.py)
===================================================================
--- archive/trunk/extract_names.py      (rev 0)
+++ archive/trunk/extract_names.py      2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,23 @@
+"""
+Script to extract all wiki page names a certain HTML file points to
+
+The output can be used as input to some robot that takes a list of pages as input.
+
+This script takes a single file name argument; the file should be an HTML file
+as captured from one of the wikipedia servers.
+"""
+#
+# (C) Rob W.W. Hooft, 2003
+#
+# Distributed under the terms of the MIT license.
+#
+__version__='$Id: extract_names.py,v 1.9 2005/12/21 17:51:26 wikipedian Exp $'
+#
+import sys, re
+
+R = re.compile('/wiki/(.*?)" *')
+fn = sys.argv[1]
+f = open(fn)
+text = f.read()
+f.close()
+for hit in R.findall(text):
+    print hit
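As a quick illustration of what that expression extracts from a saved wiki page (the HTML fragment below is a made-up example):

    import re

    R = re.compile('/wiki/(.*?)" *')
    sample = '<a href="/wiki/Python_(programming_language)" title="Python">Python</a>'
    print R.findall(sample)   # ['Python_(programming_language)']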
Copied: archive/trunk/featuredcount.py (from rev 9461, trunk/pywikipedia/archive/featuredcount.py)
===================================================================
--- archive/trunk/featuredcount.py      (rev 0)
+++ archive/trunk/featuredcount.py      2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,52 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+##################################################
+All functions of this script have been merged into featured.py. Please use:
+
+    featured.py -fromall -count
+
+shizhao 2009-04-18
+##################################################
+
+
+This script only counts how many featured articles all wikipedias have.
+
+usage: featuredcount.py
+
+"""
+__version__ = '$Id: featuredcount.py 6336 2009-02-08 04:14:37Z purodha $'
+
+#
+# Distributed under the terms of the MIT license.
+#
+
+import sys
+import wikipedia, catlib
+from featured import featured_name
+
+def featuredArticles(site):
+    method = featured_name[site.lang][0]
+    name = featured_name[site.lang][1]
+    args = featured_name[site.lang][2:]
+    raw = method(site, name, *args)
+    arts = []
+    for p in raw:
+        if p.namespace() == 0:
+            arts.append(p)
+        elif p.namespace() == 1:
+            arts.append(wikipedia.Page(p.site(), p.titleWithoutNamespace()))
+    wikipedia.output('\03{lightred}** wikipedia:%s has %i featured articles\03{default}' % (site.lang, len(arts)))
+
+if __name__ == "__main__":
+    mysite = wikipedia.getSite()
+    fromlang = featured_name.keys()
+    fromlang.sort()
+    try:
+        for ll in fromlang:
+            fromsite = wikipedia.getSite(ll)
+            if fromsite != mysite:
+                arts = featuredArticles(fromsite)
+        arts_mysite = featuredArticles(mysite)
+    finally:
+        wikipedia.stopme()
Copied: archive/trunk/getimages.py (from rev 9461, trunk/pywikipedia/archive/getimages.py) =================================================================== --- archive/trunk/getimages.py (rev 0) +++ archive/trunk/getimages.py 2011-08-29 15:11:50 UTC (rev 9478) @@ -0,0 +1,67 @@ +""" +Script to transfer many images from one wiki to another. Your +language (which can be changed with the -lang: argument) is the +language to upload to. The images should be in a file as interwiki +links (that is in the form [[en:Image:myimage.png]]); they do not +need to be all from the same Wiki. This file can be created with +extract_wikilinks.py. + +Arguments: + + -lang:xx Log in to the given wikipedia language to upload to + +The first other argument is taken to be the name of the file you get +the links from; other arguments are ignored. +""" + +# +# (C) Andre Engels 2004 +# +# Distributed under the terms of the MIT license. +# +# Modified by Gerrit Holl, 01-11-2004 +__version__='$Id: getimages.py,v 1.15 2005/12/21 17:51:26 wikipedian Exp $' + +import sys +import wikipedia, lib_images, pagegenerators + +def getfn(): + fns = [] + + for arg in sys.argv[1:]: + arg = wikipedia.argHandler(arg, 'getimages') + if arg: + fns.append(arg) + + if len(fns) == 0: + fns.append(raw_input("Please enter a filename: ")) + + return fns + +def main(): + for filename in getfn(): + print "Handling images from %s" % filename + gen = pagegenerators.TextfilePageGenerator(filename) + for image in gen: + if image.isImage(): + print "-" * 50 + print "Image: %s" % image.title() + try: + # show the image description page's contents + print image.get() + except wikipedia.NoPage: + print "Description empty." + except wikipedia.IsRedirectPage: + print "Description page is redirect?!" + answer=wikipedia.input(u"Copy this image (y/N)?") + if answer.lower().startswith('y'): + lib_images.transfer_image(image) + +if __name__ == "__main__": + try: + main() + except: + wikipedia.stopme() + raise + else: + wikipedia.stopme()
Copied: archive/trunk/mediawiki_messages.py (from rev 9461, trunk/pywikipedia/archive/mediawiki_messages.py) =================================================================== --- archive/trunk/mediawiki_messages.py (rev 0) +++ archive/trunk/mediawiki_messages.py 2011-08-29 15:11:50 UTC (rev 9478) @@ -0,0 +1,218 @@ +# -*- coding: utf-8 -*- +""" +Allows access to the MediaWiki messages, that's the label texts of the MediaWiki +software in the current language. These can be used in other bots. + +The function refresh_messages() downloads all the current messages and saves +them to disk. It is run automatically when a bot first tries to access one of +the messages. It can be updated manually by running this script, e.g. when +somebody changed the current message at the wiki. The texts will also be +reloaded automatically once a month. + +Syntax: python mediawiki_messages [-all] + +Command line options: + -refresh - Reloads messages for the home wiki or for the one defined via + the -lang and -family parameters. + + -all - Reloads messages for all wikis where messages are already present + + If another parameter is given, it will be interpreted as a MediaWiki key. + The script will then output the respective value, without refreshing.. + +""" + +# (C) Daniel Herding, 2004 +# +# Distributed under the terms of the MIT license. + +##THIS MODULE IS DEPRECATED AND HAS BEEN REPLACED BY NEW FUNCTIONALITY IN +##WIKIPEDIA.PY. It is being retained solely for compatibility in case any +##custom-written bots rely upon it. Bot authors should replace any uses +##of this module as follows: +## +## OLD: mediawiki_messages.get(key, site) +## NEW: site.mediawiki_message(key) +## +## OLD: mediawiki_messages.has(key, site) +## NEW: site.has_mediawiki_message(key) +## +## OLD: mediawiki_messages.makepath(path) +## NEW: wikipedia.makepath(path) +## +########################################################################## + +import warnings +warnings.warn( +"""The mediawiki_messages module is deprecated and no longer +maintained; see the source code for new methods to replace +calls to this module.""", + DeprecationWarning, stacklevel=2) + + +import wikipedia +import re, sys, pickle +import os.path +import time +import codecs +import urllib +from BeautifulSoup import * + +__version__='$Id: mediawiki_messages.py 3731 2007-06-20 14:42:55Z russblau $' + +loaded = {} + +def get(key, site = None, allowreload = True): + site = site or wikipedia.getSite() + if site in loaded: + # Use cached copy if it exists. 
+ dictionary = loaded[site] + else: + fn = 'mediawiki-messages/mediawiki-messages-%s-%s.dat' % (site.family.name, site.lang) + try: + # find out how old our saved dump is (in seconds) + file_age = time.time() - os.path.getmtime(fn) + # if it's older than 1 month, reload it + if file_age > 30 * 24 * 60 * 60: + print 'Current MediaWiki message dump is one month old, reloading' + refresh_messages(site) + except OSError: + # no saved dumped exists yet + refresh_messages(site) + f = open(fn, 'r') + dictionary = pickle.load(f) + f.close() + loaded[site] = dictionary + key = key[0].lower() + key[1:] + if key in dictionary: + return dictionary[key] + elif allowreload: + refresh_messages(site = site) + return get(key, site = site, allowreload = False) + else: + raise KeyError('MediaWiki Key %s not found' % key) + +def has(key, site = None, allowreload = True): + try: + get(key, site, allowreload) + return True + except KeyError: + return False + +def makepath(path): + """ creates missing directories for the given path and + returns a normalized absolute version of the path. + + - if the given path already exists in the filesystem + the filesystem is not modified. + + - otherwise makepath creates directories along the given path + using the dirname() of the path. You may append + a '/' to the path if you want it to be a directory path. + + from holger@trillke.net 2002/03/18 + """ + from os import makedirs + from os.path import normpath,dirname,exists,abspath + + dpath = normpath(dirname(path)) + if not exists(dpath): makedirs(dpath) + return normpath(abspath(path)) + +def refresh_messages(site = None): + site = site or wikipedia.getSite() + # get 'all messages' special page's path + path = site.allmessages_address() + print 'Retrieving MediaWiki messages for %s' % repr(site) + wikipedia.put_throttle() # It actually is a get, but a heavy one. + allmessages = site.getUrl(path) + + print 'Parsing MediaWiki messages' + soup = BeautifulSoup(allmessages, + convertEntities=BeautifulSoup.HTML_ENTITIES) + # The MediaWiki namespace in URL-encoded format, as it can contain + # non-ASCII characters and spaces. + quotedMwNs = urllib.quote(site.namespace(8).replace(' ', '_').encode(site.encoding())) + mw_url = site.path() + "?title=" + quotedMwNs + ":" + altmw_url = site.path() + "/" + quotedMwNs + ":" + nicemw_url = site.nice_get_address(quotedMwNs + ":") + shortmw_url = "/" + quotedMwNs + ":" + ismediawiki = lambda url:url and (url.startswith(mw_url) + or url.startswith(altmw_url) + or url.startswith(nicemw_url) + or url.startswith(shortmw_url)) + # we will save the found key:value pairs here + dictionary = {} + + try: + for keytag in soup('a', href=ismediawiki): + # Key strings only contain ASCII characters, so we can save them as + # strs + key = str(keytag.find(text=True)) + keyrow = keytag.parent.parent + if keyrow['class'] == "orig": + valrow = keyrow.findNextSibling('tr') + assert valrow['class'] == "new" + value = unicode(valrow.td.string).strip() + elif keyrow['class'] == 'def': + value = unicode(keyrow('td')[1].string).strip() + else: + raise AssertionError("Unknown tr class value: %s" % keyrow['class']) + dictionary[key] = value + except Exception, e: + wikipedia.debugDump( 'MediaWiki_Msg', site, u'%s: %s while processing URL: %s' % (repr(e), str(e), unicode(path)), allmessages) + raise + + # Save the dictionary to disk + # The file is stored in the mediawiki_messages subdir. Create if necessary. 
+ if dictionary == {}: + wikipedia.debugDump( 'MediaWiki_Msg', site, u'Error URL: '+unicode(path), allmessages ) + sys.exit() + else: + f = open(makepath('mediawiki-messages/mediawiki-messages-%s-%s.dat' % (site.family.name, site.lang)), 'w') + pickle.dump(dictionary, f) + f.close() + print "Loaded %i values from %s" % (len(dictionary.keys()), site) + #print dictionary['sitestatstext'] + +def refresh_all_messages(): + import dircache, time + filenames = dircache.listdir('mediawiki-messages') + message_filenameR = re.compile('mediawiki-messages-([a-z:]+)-([a-z:]+).dat') + for filename in filenames: + match = message_filenameR.match(filename) + if match: + family = match.group(1) + lang = match.group(2) + site = wikipedia.getSite(code = lang, fam = family) + refresh_messages(site) + +def main(): + refresh_all = False + refresh = False + key = None + for arg in wikipedia.handleArgs(): + if arg == '-all': + refresh_all = True + elif arg == '-refresh': + refresh = True + else: + key = arg + if key: + wikipedia.output(get(key), toStdout = True) + elif refresh_all: + refresh_all_messages() + elif refresh: + refresh_messages(wikipedia.getSite()) + else: + wikipedia.showHelp('mediawiki_messages') + +if __name__ == "__main__": + try: + main() + except: + wikipedia.stopme() + raise + else: + wikipedia.stopme() +
Copied: archive/trunk/refcheck.py (from rev 9461, trunk/pywikipedia/archive/refcheck.py) =================================================================== --- archive/trunk/refcheck.py (rev 0) +++ archive/trunk/refcheck.py 2011-08-29 15:11:50 UTC (rev 9478) @@ -0,0 +1,95 @@ +#!/usr/bin/python +""" +################################################## +This script with all its function has been merged +to templatecount.py. please use: + + templatecount.py -count + +xqt 2009-10-30 +################################################## +This script checks references to see if they are properly formatted. Right now +it just counts the total number of transclusions of any number of given templates. + +NOTE: This script is not capable of handling the <ref></ref> syntax. It just +handles the {{ref}} syntax, which is still used, but DEPRECATED on the English +Wikipedia. + +Syntax: python refcheck.py command [arguments] + +Command line options: + +-count Counts the number of times each template (passed in as an argument) + is transcluded. +-namespace: Filters the search to a given namespace. If this is specified + multiple times it will search all given namespaces + +Examples: + +Counts how many time {{ref}} and {{note}} are transcluded in articles. + + python refcheck.py -count ref note -namespace:0 + +""" +__version__ = '$Id$' + +import wikipedia, config +import replace, pagegenerators +import re, sys, string + +templates = ['ref', 'note', 'ref label', 'note label', 'reflist'] + +class ReferencesRobot: + #def __init__(self): + #Nothing + def countRefs(self, templates, namespaces): + mysite = wikipedia.getSite() + mytpl = mysite.template_namespace()+':' + finalText = [u'Number of transclusions per template',u'------------------------------------'] + for template in templates: + gen = pagegenerators.ReferringPageGenerator(wikipedia.Page(mysite, mytpl + template), onlyTemplateInclusion = True) + if namespaces: + gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces) + count = 0 + for page in gen: + count += 1 + finalText.append(u'%s: %d' % (template, count)) + for line in finalText: + wikipedia.output(line) + +def main(): + doCount = False + argsList = [] + namespaces = [] + for arg in wikipedia.handleArgs(): + if arg == '-count': + doCount = True + elif arg.startswith('-namespace:'): + try: + namespaces.append(int(arg[len('-namespace:'):])) + except ValueError: + namespaces.append(arg[len('-namespace:'):]) + else: + argsList.append(arg) + + if doCount: + robot = ReferencesRobot() + if not argsList: + argsList = templates + choice = '' + if 'reflist' in argsList: + wikipedia.output(u'NOTE: it will take a long time to count "reflist".') + choice = wikipedia.inputChoice(u'Proceed anyway?', ['yes', 'no', 'skip'], ['y', 'n', 's'], 'y') + if choice == 's': + argsList.remove('reflist') + if choice <> 'n': + robot.countRefs(argsList, namespaces) + else: + wikipedia.showHelp('refcheck') + +if __name__ == "__main__": + try: + main() + finally: + wikipedia.stopme() +
Copied: archive/trunk/sqldump.py (from rev 9461, trunk/pywikipedia/archive/sqldump.py) =================================================================== --- archive/trunk/sqldump.py (rev 0) +++ archive/trunk/sqldump.py 2011-08-29 15:11:50 UTC (rev 9478) @@ -0,0 +1,289 @@ +# -*- coding: utf-8 -*- +""" +Reads a cur SQL dump and offers a generator over SQLentry objects which can be +used by other bots. Each SQLentry object represents a page. + +Can also be run directly from the command line to retrieve page lists from +an SQL dump. + +Syntax: + + python sqldump.py -sql:filename.sql action + +Where action can be one of these: + +* find - List pages which contain a certain text +* findr - List pages containing text matching a regular expression +* shortpages - List pages with short contents +* unmountedcats - List categories that don't have a supercategory +* percentnames - List pages that contain internal links where special + characters are encoded as hexadecimal codes, e.g. %F6 +* baddisambiguations - Created for de.wikipedia to fix primary topic + disambiguations (Begriffsklärung nach Modell 2). +""" +# +# (C) Daniel Herding, 2004 +# +# Distributed under the terms of the MIT license. +# + +__version__ = '$Id:' + +from __future__ import generators +import re, time +import wikipedia, config + + +class SQLentry(object): + ''' + Represents a wiki page, read from an SQL dump. + + An instance of this class will have the following attributes: + * self.id is the page ID (integer) + * self.namespace is the namespace ID (integer) + * self.title is the page title without namespace (unicode) + * self.text is the text on that page (unicode) + * self.comment is the last edit summary (unicode) + * self.userid is the last editor's ID (integer) + * self.username is the last editor's username (unicode) + * self.timestamp is the time of the last edit (time tuple) + * self.restrictions is True if the page is locked (boolean) + * self.counter is the # of page views, disabled on Wikimedia wikis (integer) + * self.redirect is True if the page is a redirect (boolean) + * self.minor is True if the last edit was marked as minor (boolean) + * self.new is True if the last edit was the first one (boolean) + * self.random is a random number used for the 'Random Page' function (float) + * self.touched is the date of the last cache update (time tuple) + + See http://meta.wikimedia.org/wiki/Cur_table for details. + ''' + + def __init__(self, id, namespace, title, text, comment, userid, username, timestamp, restrictions, counter, redirect, minor, new, random, inversetimestamp, touched): + ''' + Constructor. All parameters should be strings, as read from the SQL + dump. This function will convert them to formats which are more + appropriate for the data types. + ''' + self.id = int(id) + self.namespace = int(namespace) + self.title = title + self.text = text + self.comment = comment + self.userid = int(userid) + self.username = username + # convert to a 9-dimensional time tuple, see http://python.org/doc/2.3.4/lib/module-time.html + self.timestamp = time.strptime(timestamp, '%Y%m%d%H%M%S') + # convert to boolean + self.restrictions = (restrictions != '') + self.counter = int(counter) + self.redirect = (redirect == '1') + self.minor = (minor == '1') + self.new = (new == '1') + self.random = float(random) + # Inversetimestamp is obsolete, so we ignore it. + #self.inversetimestamp = inversetimestamp + + # Basically, I would want to convert touched to time tuple, as I did + # with timestamp. 
But I noticed a problem: in the nds: dump touched + # comes before inversetimestamp, and that would cause strptime to crash. + # So we simply leave touched as it is and hope that this is the only + # exception where entries are mixed up. If you find other such cases, + # please report. + #self.touched = time.strptime(touched, '%Y%m%d%H%M%S') + self.touched = touched + + # MediaWiki escapes apostrophes, backslashes and quotes with + # backslashes. We need to unescape them again. + # This regular expression matches a backslash followed by a group, where + # the group matches either an apostrophe, a backslashes or a quote. + escapedR = re.compile(r'\([\"'])') + # The group \1 is the character we really want, while the leading + # backslash is only escape information we don't need. + self.title = escapedR.sub(r"\1", self.title) + self.text = escapedR.sub(r"\1", self.text) + self.comment = escapedR.sub(r"\1", self.comment) + self.username = escapedR.sub(r"\1", self.username) + + # convert \n and \r to newlines and carriage returns. + self.text = self.text.replace('\r', '\r') + self.text = self.text.replace('\n', '\n') + # comments can also contain newline characters + self.comment = self.comment.replace('\r', '\r') + self.comment = self.comment.replace('\n', '\n') + # I hope that titles and usernames can't :-) + + def full_title(self, underline = True): + ''' + Returns the full page title in the form 'namespace:title', using the + localized namespace titles defined in your family file. + If underline is True, returns the page title with underlines instead of + spaces. + ''' + if not underline: + title = self.title.replace('_', ' ') + else: + title = self.title + namespace_title = wikipedia.getSite().namespace(self.namespace) + if namespace_title == None: + return self.title + else: + if underline: + namespace_title = namespace_title.replace(' ', '_') + return namespace_title + ':' + self.title + + def age(self): + ''' + Returns the time passed since the last edit, in relation to the current + system time, in seconds (floating point number). + ''' + return time.time() - time.mktime(self.timestamp) + +# Represents one parsed SQL dump file. Reads the local file at initialization, +# parses it with a regular expression, and offers access to the resulting +# SQLentry objects through the entries() generator. +class SQLdump(object): + def __init__(self, filename, encoding): + self.filename = filename + self.encoding = encoding + + def entries(self): + ''' + Generator which reads one line at a time from the SQL dump file, and + parses it to create SQLentry objects. Stops when the end of file is + reached. + ''' + # This regular expression will match one SQL database entry (i.e. a + # page), and each group represents an attribute of that entry. + # NOTE: We don't need re.DOTALL because newlines are escaped. 
+ pageR = re.compile("((\d+)," # cur_id (page ID number) + + "(\d+)," # cur_namespace (namespace number) + + "'(.*?)'," # cur_title (page title w/o namespace) + + "'(.*?)'," # cur_text (page contents) + + "'(.*?)'," # cur_comment (last edit's summary text) + + "(\d+)," # cur_user (user ID of last contributor) + + "'(.*?)'," # cur_user_text (user name) + + "'(\d{14})'," # cur_timestamp (time of last edit) + + "'(.*?)'," # cur_restrictions (protected pages have 'sysop' here) + + "(\d+)," # cur_counter (view counter, disabled on WP) + + "([01])," # cur_is_redirect + + "([01])," # cur_minor_edit + + "([01])," # cur_is_new + + "([\d.]+?)," # cur_random (for random page function) + + "'(\d{14})'," # inverse_timestamp (obsolete) + + "'(\d{14})')") # cur_touched (cache update timestamp) + print 'Reading SQL dump' + # Open the file, read it using the given encoding, and replace invalid + # characters with question marks. + import codecs + f=codecs.open(self.filename, 'r', encoding = self.encoding, errors='replace') + eof = False + while not eof: + # Read only one (very long) line because we would risk out of memory + # errors if we read the entire file at once + line = f.readline() + if line == '': + print 'End of file.' + eof = True + self.entries = [] + for id, namespace, title, text, comment, userid, username, timestamp, restrictions, counter, redirect, minor, new, random, inversetimestamp, touched in pageR.findall(line): + new_entry = SQLentry(id, namespace, title, text, comment, userid, username, timestamp, restrictions, counter, redirect, minor, new, random, inversetimestamp, touched) + yield new_entry + f.close() + + def query_percentnames(self): + ''' + yields pages that contain internal links where special characters are + encoded as hexadecimal codes, e.g. 
%F6 + ''' + Rpercentlink = re.compile('[[[^]]*?%[A-F0-9][A-F0-9][^]]*?]]') + for entry in self.entries(): + text = wikipedia.removeLanguageLinks(entry.text) + if Rpercentlink.search(text): + yield entry + + def query_shortpages(self, minsize): + ''' + yields articles that have less than minsize bytes of text + ''' + for entry in self.entries(): + if entry.namespace == 0 and not entry.redirect and len(entry.text) < minsize: + yield entry + + def query_find(self, keyword): + ''' + yields pages which contain keyword + ''' + for entry in self.entries(): + if entry.text.find(keyword) != -1: + yield entry + + def query_findr(self, regex, namespace = None): + ''' + yields pages which contain a string matching the given regular expression + ''' + r = re.compile(regex) + for entry in self.entries(): + if r.search(entry.text) and (namespace == None or entry.namespace == namespace): + yield entry + + def query_unmountedcats(self): + ''' + yields categories which don't have any supercategory + ''' + for entry in self.entries(): + if entry.namespace == 14: + has_supercategory = False + for ns in wikipedia.getSite().category_namespaces(): + if entry.text.find('[[%s:' % ns) != -1: + has_supercategory = True + break + if not has_supercategory: + yield entry + +def query(sqldump, action): + if action == 'percentnames': + for entry in sqldump.query_percentnames(): + yield entry + elif action == 'shortpages': + minsize = int(wikipedia.input(u'Minimum size:')) + for entry in sqldump.query_shortpages(minsize): + yield entry + elif action == 'find': + keyword = wikipedia.input(u'Search for:') + for entry in sqldump.query_find(keyword): + yield entry + elif action == 'findr': + keyword = wikipedia.input(u'Search for:') + for entry in sqldump.query_findr(keyword): + yield entry + elif action == 'unmountedcats': + for entry in sqldump.query_unmountedcats(): + yield entry + elif action == 'baddisambiguation': + for entry in sqldump.entries(): + if entry.namespace == 0 and entry.title.endswith(')') and entry.text.startswith("''") and not entry.text.startswith("'''"): + yield entry + +if __name__=="__main__": + wikipedia.stopme() # No need to have me on the stack, as I'm not contacting the wiki + import sys + action = None + filename = None + for arg in sys.argv[1:]: + arg = wikipedia.argHandler(arg, 'sqldump') + if arg: + if arg.startswith('-sql'): + if len(arg) == 4: + filename = wikipedia.input(u'Please enter the SQL dump's filename: ') + else: + filename = arg[5:] + else: + action = arg + if not filename or not action: + wikipedia.output(__doc__, 'utf-8') + else: + sqldump = SQLdump(filename, wikipedia.myencoding()) + + for entry in query(sqldump, action): + wikipedia.output(u'*[[%s]]' % entry.full_title()) +
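Beyond the command-line actions handled in __main__, the module docstring says the SQLdump class offers a generator over SQLentry objects for use by other bots; a minimal sketch of that calling pattern (assumptions: a local cur-table dump named dump.sql saved as UTF-8, and a configured pywikipedia environment so that full_title() can resolve namespace names):

    sqldump = SQLdump('dump.sql', 'utf-8')
    # list main-namespace articles shorter than 300 bytes, as the 'shortpages' action does
    for entry in sqldump.query_shortpages(300):
        print entry.full_title()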
Copied: archive/trunk/test.py (from rev 9461, trunk/pywikipedia/archive/test.py)
===================================================================
--- archive/trunk/test.py       (rev 0)
+++ archive/trunk/test.py       2011-08-29 15:11:50 UTC (rev 9478)
@@ -0,0 +1,61 @@
+#!/usr/bin/python
+"""
+##################################################
+This script with all its functions has been merged
+into login.py. Please use:
+
+    login.py -test
+
+xqt 2009-10-26
+##################################################
+
+Script to test whether you are logged in
+
+Parameters:
+
+   -all       Try to test on all sites where a username is defined in
+              user-config.py.
+
+   -sysop     Test your sysop account. (Works only with -all)
+"""
+#
+# (C) Rob W.W. Hooft, 2003
+#
+# Distributed under the terms of the MIT license.
+#
+__version__='$Id$'
+#
+import re, sys, wikipedia, config
+
+def show(mysite, sysop=False):
+    if mysite.loggedInAs(sysop=sysop):
+        wikipedia.output(u"You are logged in on %s as %s." % (repr(mysite), mysite.loggedInAs(sysop=sysop)))
+    else:
+        wikipedia.output(u"You are not logged in on %s." % repr(mysite))
+
+def main():
+    testall = False
+    sysop = False
+    for arg in wikipedia.handleArgs():
+        if arg == "-all":
+            testall = True
+        elif arg == "-sysop":
+            sysop = True
+        else:
+            wikipedia.showHelp()
+            return
+    if testall:
+        if sysop:
+            namedict = config.sysopnames
+        else:
+            namedict = config.usernames
+        for familyName in namedict.iterkeys():
+            for lang in namedict[familyName].iterkeys():
+                show(wikipedia.getSite(lang, familyName), sysop)
+    else:
+        show(wikipedia.getSite(), sysop)
+
+if __name__ == "__main__":
+    try:
+        main()
+    finally:
+        wikipedia.stopme()
Copied: archive/trunk/translator.py (from rev 9461, trunk/pywikipedia/archive/translator.py) =================================================================== --- archive/trunk/translator.py (rev 0) +++ archive/trunk/translator.py 2011-08-29 15:11:50 UTC (rev 9478) @@ -0,0 +1,465 @@ +# -*- coding: utf-8 -*- + +''' +This module translates a string from one language to another, using +translations given in a hard-coded dictionary. Various dictionaries exist for +different types of text; e.g. type 'geography' is for tables about places and +regions, and 'city' is for tables about cities and villages. + +For each table type, there can be three lists: +* translations - direct replacements. Work in either direction, e.g. if + the bot knows that he should replace 'Location' with 'Ligging' + when translating from English to Dutch, he can also translate + it from Dutch to English. +* regexes - regular expression replacements. These are more powerful than + direct replacements as they support wildcards etc., but only + work in one direction. +* includes - one type can include all items from another type, e.g. when + translating a text of the type 'city', the bot also tries to + apply the translations and regexes given for type 'geography' + because 'city' includes 'geography'. +''' + +# (C) Daniel Herding, 2004 +# +# Distributed under the terms of the MIT license. +# +# + +__version__='$Id: translator.py,v 1.21 2005/12/21 17:51:26 wikipedian Exp $' + +types = { + # translations for images (inside other tables) + "images": { + "translations": [ + { "en":"[[image:", "de":"[[bild:", "nl":"[[afbeelding:", "fr":"[[image:", "af":"[[beeld:" }, + { "en":"[[Image:", "de":"[[Bild:", "nl":"[[Afbeelding:", "fr":"[[Image:", "af":"[[Beeld:" }, + { "en":"larger image", "de":u"Bild vergrößern", "nl":"grotere versie", "fr":u"En détail", "af":"In detail" }, + { "en":"larger image", "de":u"Bild vergrößern", "nl":"groter", "fr":u"En détail", "af":"In detail" }, + # usually used as link description for articles about flags, coats of arms etc. + { "en":"Details", "de":u"Details", "nl":"details", "fr":u"Détails", "af":"Details" }, + ], + }, + + # translations for taxoboxes (for biology articles) + "taxo": { + "translations": [ + # Background colors for table headers, with or without quotation marks (taxoboxes on de: all have quotation marks) + { "en":"bgcolor=pink", "de":"bgcolor="#ffc0c0"", "nl":"bgcolor=#EEEEEE", "fr":"bgcolor=pink" }, + { "en":"bgcolor="pink"", "de":"bgcolor="#ffc0c0"", "nl":"bgcolor="#EEEEEE"", "fr":"bgcolor="pink"" }, + # second table header (below the image) + { "en":"[[Scientific classification]]", "de":"[[Systematik (Biologie)|Systematik]]", "nl":"[[Taxonomie|Wetenschappelijke classificatie]]", "fr":u"Classification [[systématique]]" }, + # main taxobox content + { "en":"[[Domain (biology)|Domain]]:", "de":u"''[[Domäne (Biologie)|Domäne]]:''", "nl":"[[Domain (biologie)|Domain]]:", "fr":"??? (domain)" }, + { "en":"Domain:", "de":u"''[[Domäne (Biologie)|Domäne]]:''", "nl":"[[Domain (biologie)|Domain]]:", "fr":"??? 
(domain)" }, + { "en":"[[Kingdom (biology)|Kingdom]]:", "de":"''[[Reich (Biologie)|Reich]]:''", "nl":"[[Rijk (biologie)|Rijk]]:", "fr":u"[[Règne (biologie)|Règne]]:", }, + { "en":"Kingdom:", "de":"''[[Reich (Biologie)|Reich]]:''", "nl":"[[Rijk (biologie)|Rijk]]:", "fr":u"[[Règne (biologie)|Règne]]:", }, + { "en":"[[Division (biology)|Division]]:", "de":"''[[Abteilung (Biologie)|Abteilung]]:''", }, + { "en":"Division:", "de":"''[[Abteilung (Biologie)|Abteilung]]:''", }, + { "en":"[[Phylum (biology)|Phylum]]:", "de":"''[[Stamm (Biologie)|Stamm]]:''", "nl":"[[Stam (biologie)|Stam]]:", "fr":"[[Embranchement]]:", }, + { "en":"Phylum:", "de":"''[[Stamm (Biologie)|Stamm]]:''", "nl":"[[Stam (biologie)|Stam]]:", "fr":"[[Embranchement]]:", }, + { "en":"[[Subphylum]]:", "de":"''[[Unterstamm]]:''", "nl":"[[Substam (biologie)|Substam]]:", "fr":"[[Sous-embranchement]]:", }, + { "en":"Phylum:", "de":"''[[Unterstamm]]:''", "nl":"[[Substam (biologie)|Substam]]:", "fr":"[[Sous-embranchement]]:", }, + { "en":"[[Superclass (biology)|Superclass]]:", "de":u"''[[Klasse (Biologie)|Überklasse]]:''", "nl":"[[Superklasse (biologie)|Superklasse]]:", "fr":"[[Super-classe (biologie)|Super-classe]]:", }, + { "en":"Superclass:", "de":u"''[[Klasse (Biologie)|Überklasse]]:''", "nl":"[[Superklasse (biologie)|Superklasse]]:", "fr":"[[Super-classe (biologie)|Super-classe]]:", }, + { "en":"[[Class (biology)|Class]]:", "de":"''[[Klasse (Biologie)|Klasse]]:''", "nl":"[[Klasse (biologie)|Klasse]]:", "fr":"[[Classe (biologie)|Classe]]:", }, + { "en":"Class:", "de":"''[[Klasse (Biologie)|Klasse]]:''", "nl":"[[Klasse (biologie)|Klasse]]:", "fr":"[[Classe (biologie)|Classe]]:", }, + { "en":"[[Subclass]]:", "de":"''[[Klasse (Biologie)|Unterklasse]]:''", "nl":"[[Onderklasse]]:", "fr":"[[Sous-classe (biologie)|Sous-classe]]:", }, + { "en":"Subclass:", "de":"''[[Klasse (Biologie)|Unterklasse]]:''", "nl":"[[Onderklasse]]:", "fr":"[[Sous-classe (biologie)|Sous-classe]]:", }, + { "en":"[[Order (biology)|Superorder]]:", "de":u"''[[Ordnung (Biologie)|Überordnung]]:''", "nl":"[[Superorde]]:", }, + { "en":"[[Order (biology)|Order]]:", "de":"''[[Ordnung (Biologie)|Ordnung]]:''", "nl":"[[Orde (biologie)|Orde]]:", "fr":"[[Ordre (biologie)|Ordre]]:" }, + { "en":"Order:", "de":"''[[Ordnung (Biologie)|Ordnung]]:''", "nl":"[[Orde (biologie)|Orde]]:", "fr":"[[Ordre (biologie)|Ordre]]:" }, + { "en":"[[Suborder]]:", "de":"''[[Ordnung (Biologie)|Unterordnung]]:''", "nl":"[[Infraorde (biologie)|Infraorde]]:", "fr":"[[Sous-ordre (biologie)|Sous-ordre]]:", }, + { "en":"Suborder:", "de":"''[[Ordnung (Biologie)|Unterordnung]]:''", "nl":"[[Infraorde (biologie)|Infraorde]]:", "fr":"[[Sous-ordre (biologie)|Sous-ordre]]:", }, + { "en":"[[Family (biology)|Family]]:", "de":"''[[Familie (Biologie)|Familie]]:''", "nl":"[[Familie (biologie)|Familie]]:", "fr":"[[Famille (biologie)|Famille]]:", }, + { "en":"Family:", "de":"''[[Familie (Biologie)|Familie]]:''", "nl":"[[Familie (biologie)|Familie]]:", "fr":"[[Famille (biologie)|Famille]]:", }, + { "en":"[[Subfamily (biology)|Subfamily]]:", "de":"''[[Familie (Biologie)|Unterfamilie]]:''", "nl":"[[Onderfamilie]]:", "fr":"[[Sous-famille (biologie)|Sous-famille]]:", }, + { "en":"Subfamily:", "de":"''[[Familie (Biologie)|Unterfamilie]]:''", "nl":"[[Onderfamilie]]:", "fr":"[[Sous-famille (biologie)|Sous-famille]]:", }, + { "en":"[[Tribe (biology)|Tribe]]:", "de":"''[[Tribus (Biologie)|Tribus]]:''", "nl":"[[Tak (biologie)|Tak]]:", "fr":"??? 
(Tribus)" }, + { "en":"Tribe:", "de":"''[[Tribus (Biologie)|Tribus]]:''", "nl":"[[Tak (biologie)|Tak]]:", "fr":"??? (Tribus)" }, + { "en":"[[Genus]]:", "de":"''[[Gattung (Biologie)|Gattung]]:''", "nl":"[[Geslacht (biologie)|Geslacht]]:", "fr":"[[Genre]]:" }, + { "en":"Genus:", "de":"''[[Gattung (Biologie)|Gattung]]:''", "nl":"[[Geslacht (biologie)|Geslacht]]:", "fr":"[[Genre]]:" }, + { "en":"[[Subgenus]]:", "de":"''[[Gattung (Biologie)|Untergattung]]:''", "nl":"[[Ondergeslacht]]:", "fr":"??? (Sous-genre)" }, + { "en":"Subgenus:", "de":"''[[Gattung (Biologie)|Untergattung]]:''", "nl":"[[Ondergeslacht]]:", "fr":"??? (Sous-genre)" }, + { "en":"[[Species]]:", "de":"''[[Art (Biologie)|Art]]:''", "nl":"[[Soort]]:", "fr":u"[[Espèce]]:" }, + { "en":"Species:", "de":"''[[Art (Biologie)|Art]]:''", "nl":"[[Soort]]:", "fr":u"[[Espèce]]:" }, + # table headers for subdivisions of the current group + { "en":"[[Class (biology)|Classes]]", "de":"[[Klasse (Biologie)|Klassen]]", "nl":"[[Klasse (biologie)|Klassen]]", }, + { "en":"[[Order (biology)|Orders]]", "de":"[[Ordnung (Biologie)|Ordnungen]]", "nl":"[[Orde (biologie)|Orden]]", "fr":"[[Ordre (biologie)|Ordres]]" }, + { "en":"[[Suborder]]s", "de":"[[Ordnung (Biologie)|Unterordnungen]]", "nl":"[[Infraorde (biologie)|Infraorden]]:", "fr":"[[Sous-ordre (biologie)|Sous-ordres]]", }, + { "en":"[[Family (biology)|Families]]", "de":"[[Familie (Biologie)|Familien]]", "nl":"[[Familie (biologie)|Families]]", "fr":"[[Famille (biologie)|Familles]]", }, + { "en":"[[Genus|Genera]]", "de":"[[Gattung (Biologie)|Gattungen]]", "nl":"[[Geslacht (biologie)|Geslachten]]", "fr":"[[Genre (biologie)|Genre]]" }, + { "en":"[[Species]]", "de":"[[Art (Biologie)|Arten]]", "nl":"[[Soort]]en", "fr":u"??? (Espèces)" }, + { "en":"[[Species]] (incomplete)", "de":"[[Art (Biologie)|Arten (Auswahl)]]", "nl":"[[Soort]]en (incompleet)", "fr":u"??? (Espèces (sélection))" }, + # table headers for nl: style taxoboxes (current group is listed in a special section at the bottom) + { "en":"[[Order (biology)|Order]]", "de":"[[Ordnung (Biologie)|Ordnung]]", "nl":"[[Orde (biologie)|Orde]]", "fr":"[[Ordre (biologie)|Ordre]]" }, + { "en":"[[Family (biology)|Family]]", "de":"[[Familie (Biologie)|Familie]]", "nl":"[[Familie (biologie)|Familie]]", "fr":"[[Famille (biologie)|Famille]]", }, + { "en":"[[Genus]]", "de":"[[Gattung (Biologie)|Gattung]]", "nl":"[[Geslacht (biologie)|Geslacht]]", "fr":"[[Genre]]" }, + { "en":"[[Species]]", "de":"[[Art (Biologie)|Art]]", "nl":"[[Soort]]", "fr":u"[[Espèce]]" }, + ], + "regexes": { + "en": { + # de: doesn't have conservation status infos + "{{msg:Status[^}]+}}": {"de":"", }, + }, + }, + "includes": ["images", "taxo_categories"], + }, + + # this should only include classes etc. which appear very often, not every species! 
+ "taxo_categories": { + "translations": [ + # kingdoms + { "en":"[[Animal]]ia", "de":"[[Tiere]] (Animalia)", "nl":"Dieren (''[[Animalia]]'')", }, + { "en":"[[Plant]]ae", "de":"[[Pflanzen]] (Plantae)", }, + # divisions + { "en":"[[flowering plant|Magnoliophyta]]", "de":u"[[Blütenpflanzen]] (Magnoliophyta)", }, + # phylums + { "en":"[[Anthropod]]a", "de":u"[[Gliederfüßler]] (Anthropoda)", }, + { "en":"[[Chordata]]", "de":"[[Chordatiere]] (Chordata)", "nl":"Chordadieren (''[[Chordata]]'')", }, + { "en":"[[Chordate|Chordata]]", "de":"[[Chordatiere]] (Chordata)", "nl":"Chordadieren (''[[Chordata]]'')", }, + # subphylums + { "en":"[[Vertebrata]]", "de":"[[Wirbeltiere]] (Vertebrata)", "nl":"Gewervelden (''[[Vertebrata]]'')", }, + # superclasses + # classes + { "en":"[[Aves]]", "de":u"[[Vögel]] (Aves)", "nl":"Vogels (''[[Aves]]'')", }, + { "en":"[[Insect]]a", "de":"[[Insekten]] (Insecta)", }, + { "en":"[[Mammal]]ia", "de":u"[[Säugetiere]] (Mammalia)", "nl":"Zoogdieren (''[[Mammalia]]'')", }, + { "en":"[[Mammalia]]", "de":u"[[Säugetiere]] (Mammalia)", "nl":"Zoogdieren (''[[Mammalia]]'')", }, + { "en":"[[dicotyledon|Magnoliopsida]]", "de":u"Zweikeimblättrige (Magnoliopsida)", }, + { "de":"Reptilien (Reptilia)", "nl":"Reptielen (''[[Reptilia]]'')", }, + ], + "regexes": { + "de": { + # change [[Hunde]] (Canidae) to Hunde (''[[Canidae]]'') for nl: + # and to [[Canidae]] for en: + "[[(?P<german>[^[]+)]] ((?P<latin>.+))": {"en":"[[\g<latin>]]", "nl":"\g<german> (''[[\g<latin>]]'')", }, + }, + "nl": { + # change Knaagdieren (''[[Rodentia]]'') to [[Knaagdieren]] (Rodentia) + "(?P<dutch>[a-zA-Z ]+) ([[''(?P<latin>[^[]+)'']])": {"de":"[[\g<dutch>]] (\g<latin>)", }, + "(?P<dutch>[a-zA-Z ]+) (''[[(?P<latin>[^[]+)]]'')": {"de":"[[\g<dutch>]] (\g<latin>)", }, + "(?P<dutch>[a-zA-Z ]+) ([[<i>(?P<latin>[^[]+)</i>]])": {"de":"[[\g<dutch>]] (\g<latin>)", }, + "(?P<dutch>[a-zA-Z ]+) (<i>[[(?P<latin>[^[]+)]]</i>)": {"de":"[[\g<dutch>]] (\g<latin>)", }, + }, + }, + + }, + + + # plants get the same table color as animals on de:, but on en: they are green instead of pink + "plant": { + "translations": [ + { "en":"bgcolor=lightgreen", "de":"bgcolor="#ffc0c0"", }, + { "en":"bgcolor="lightgreen"", "de":"bgcolor="#ffc0c0"", }, + ], + "includes": ["taxo"], + }, + + # regular expressions for number formats + "numbers": { + "translations": [ + # miljoen shouldn't be abbreviated on nl: + { "en":"mill.", "de":"Mio.", "nl":"miljoen", }, + { "en":"bill.", "de":"Mrd." 
}, + ], + "regexes": { + "fr": { + # fr uses or space to separate thousands, de uses dots + # note: this doesn't work for numbers > 1,000,000, don't know why + "(?P<pre>\d+) (?P<block>\d\d\d)": {"de":"\g<pre>.\g<block>", }, + "(?P<pre>\d+) (?P<block>\d\d\d)": {"de":"\g<pre>.\g<block>", }, + }, + "en": { + # de uses dots to separate thousands, en uses commas + # de uses commas to indicate floating point numbers, en uses dots + # switch both - temporary placeholder required + "(?P<pre>\d+),(?P<block>\d\d\d)": {"de":"\g<pre>TEMPORARY_DOT\g<block>", }, + "(?P<pre>\d+).(?P<block>\d+)": {"de":"\g<pre>,\g<block>", }, + "TEMPORARY_DOT": {"de":".", }, + }, + "de": { + # de uses dots to separate thousands, en uses commas + # de uses commas to indicate floating point numbers, en uses dots + # switch both - temporary placeholder required + "(?P<pre>\d+).(?P<block>\d\d\d)": {"en":"\g<pre>TEMPORARY_COMMA\g<block>", }, + "(?P<pre>\d+),(?P<block>\d+)": {"en":"\g<pre>.\g<block>", }, + "TEMPORARY_COMMA": {"en":",", }, + }, + }, + }, + + "months": { + "translations": [ + { "sl":"januar", "it":"gennaio", "en":"January", "de":"Januar", "fr":"janvier", "nl":"januari", "af":"Januarie"}, + { "sl":"februar", "it":"febbraio", "en":"February", "de":"Februar", "fr":u"février", "nl":"februari", "af":"Februarie"}, + { "sl":"marec", "it":"marzo", "en":"March", "de":u"März", "fr":"mars", "nl":"maart", "af":"Maart"}, + { "sl":"april", "it":"aprile", "en":"April", "de":"April", "fr":"avril", "nl":"april", "af":"April"}, + { "sl":"maj", "it":"maggio", "en":"May", "de":"Mai", "fr":"mai", "nl":"mei", "af":"Mei"}, + { "sl":"junij", "it":"giugno", "en":"June", "de":"Juni", "fr":"juin", "nl":"juni", "af":"Junie"}, + { "sl":"julij", "it":"luglio", "en":"July", "de":"Juli", "fr":"juillet", "nl":"juli", "af":"Julie"}, + { "sl":"avgust", "it":"agosto", "en":"August", "de":"August", "fr":u"août", "nl":"augustus", "af":"Augustus"}, + { "sl":"september", "it":"settembre", "en":"September", "de":"September", "fr":"septembre", "nl":"september", "af":"September"}, + { "sl":"oktober", "it":"ottobre", "en":"October", "de":"Oktober", "fr":"octobre", "nl":"oktober", "af":"Oktober"}, + { "sl":"november", "it":"novembre", "en":"November", "de":"November", "fr":"novembre", "nl":"november", "af":"November"}, + { "sl":"december", "it":"dicembre", "en":"December", "de":"Dezember", "fr":u"décembre", "nl":"december", "af":"Desember"}, + ] + }, + + # conversion between number formats + "dates": { + "regexes": { + "de": { + # dd.mm.yy and dd.mm.yyyy format + "(?P<day>\d\d).(?P<month>\d\d).(?P<year>(\d\d)+)": {"nl":"\g<day>-\g<month>-\g<year>", }, + }, + }, + }, + + + + # units of measurement etc. + # only for internal use + "units": { + "translations": [ + { "en":"[[Square kilometre|km²]]", "de":"[[Quadratkilometer|km²]]", "nl":"[[Vierkante kilometer|km²]]", }, + { "en":u"[[Square kilometre|km²]]", "de":u"[[Quadratkilometer|km²]]", "nl":u"[[Vierkante kilometer|km²]]", }, + { "en":"as of ", "de":"Stand: ", }, + { "en":"years", "de":"Jahre", "nl":"jaar"}, + ] + }, + + # general geographical terms etc. 
+ # only for internal use + "geography": { + "translations": [ + # header + { "en":"Base data", "de":"Basisdaten", "nl":"Basisgegevens", "fr":"Informations", }, + { "en":"[[Area]]:", "de":u"[[Fläche]]:", "nl":"Oppervlakte:", "fr":"[[Superficie]]:", "eo":"Areo:",}, + { "en":"[[Population]]:", "de":"[[Einwohner]]:", "nl":"Inwoneraantal:", "fr":u"[[Population]]:", "eo":u"Logantaro:", }, + { "en":"[[Population density]]:", "de":u"[[Bevölkerungsdichte]]:", "nl":"[[Bevolkingsdichtheid]]:", }, + { "en":"inh./km²", "de":"Einw./km²", "nl":"inw./km²", "fr":"hab/km²", }, + { "en":u"inh./km²", "de":u"Einw./km²", "nl":u"inw./km²", "fr":u"hab/km²", }, + { "en":"inhabitants/km²", "de":"Einwohner/km²", "nl":"inwoners / km²", }, + { "en":u"inhabitants/km²", "de":u"Einwohner/km²", "nl":u"inwoners / km²", }, + { "en":"inhabitants per km²", "de":"Einwohner pro km²", "nl":"inwoners per km²", }, + { "en":u"inhabitants per km²", "de":u"Einwohner pro km²", "nl":u"inwoners per km²", }, + { "en":"inh.", "de":"Einw.", "nl":"inw.", "fr":"hab.", }, + { "en":"above [[sea level]]", "de":u"ü. [[Normalnull|NN]]", "nl":"boven [[Normaal Amsterdams Peil|NAP]]", }, + { "en":"location", "de":"Geografische Lage", "nl":"Ligging", "fr":"Localisation", }, + # longitude, latitude + { "en":"' north", "de":u"' nördlicher Breite", "nl":"' NB" }, + { "en":"' north", "de":u"' nördl. Breite", "nl":"' NB" }, + { "en":"' north", "de":"' n. Br.", "nl":"' NB" }, + { "en":"' east", "de":u"' östlicher Länge", "nl":"' OL" }, + { "en":"' east", "de":u"' östl. Länge", "nl":"' OL" }, + { "en":"' east", "de":u"' ö. L.", "nl":"' OL" }, + { "en":"Map", "de":"Karte", "nl":"Kaart", }, + { "en":"Coat of Arms", "de":"Wappen", "nl":"Wapen", "fr":"Blason" }, + ], + "includes": ["units"], + }, + + "city": { + "translations": [ + { "en":"[[Location]]:", "de":"[[Geografische Lage]]:", "nl":"Ligging", }, + { "en":"[[Altitude]]:", "de":u"[[Höhe]]:", "nl":"Hoogte:", }, + { "en":"Highest point:", "de":u"Höchster Punkt:", "nl":"Hoogste punt:",}, + { "en":"Lowest point:", "de":"Niedrigster Punkt:", "nl":"Laagste punt:"}, + { "en":"[[Postal code]]:", "de":"[[Postleitzahl]]:", "nl":"[[Postcode]]:", }, + { "en":"[[Postal code]]s:", "de":"[[Postleitzahl]]en:", "nl":"[[Postcode]]s:", }, + { "en":"[[Area code]]:", "de":"[[Telefonvorwahl|Vorwahl]]:", "nl":"[[Netnummer]]:", }, + { "en":"[[Area code]]s:", "de":"[[Telefonvorwahl|Vorwahlen]]:", "nl":"[[Netnummer]]s:", }, + { "en":"[[License plate]]:", "de":"[[KFZ-Kennzeichen]]:", "nl":"[[Autonummerbord]]:", }, + { "en":"[[License plate]]:", "de":"[[Kfz-Kennzeichen]]:", "nl":"[[Autonummerbord]]:", }, + { "en":"City structure:", "de":"Gliederung des Stadtgebiets:", "nl":"Ondergemeentelijke indeling:", }, + # town hall snail mail address + { "en":"Municipality's address:", "de":"Adresse der Gemeindeverwaltung:", "nl":"Adres gemeentehuis:", }, + # city hall snail mail address + { "en":"Municipality's address:", "de":"Adresse der Stadtverwaltung:", "nl":"Adres stadhuis:", }, + { "en":"Website:", "de":"Webseite:", "nl":"Website:" }, + { "en":"Website:", "de":"Website:", "nl":"Website:" }, + { "en":"E-Mail adress:", "de":"[[E-Mail]]-Adresse:", "nl":"Email-adres:", }, + { "en":"E-Mail adress:", "de":"E-Mail-Adresse:", "nl":"Email-adres:", }, + # table header + { "en":"Politics", "de":"Politik", "nl":"Politiek", }, + # female mayor + { "en":"[[Mayor]]:", "de":u"[[Bürgermeister]]in:", "nl":"[[Burgemeester]]:", }, + { "en":"[[Mayor]]:", "de":u"[[Bürgermeisterin]]:", "nl":"[[Burgemeester]]:", }, + # male mayor + { "en":"[[Mayor]]:", 
"de":u"[[Bürgermeister]]:", "nl":"[[Burgemeester]]:", }, + { "en":"Governing [[Political party|party]]:", "de":"Regierende [[Politische Partei|Partei]]", "nl":"Regerende partij", }, + { "en":"Governing [[Political party|parties]]:", "de":"Regierende [[Politische Partei|Parteien]]", "nl":"Regerende partijen", }, + { "en":"Majority [[Political party|party]]:", "de":"[[Politische Partei|Mehrheitspartei]]", "nl":"Meerderheidspartij"}, + { "en":"Debts:", "de":"Schulden:", }, + { "en":"[[Unemployment]]:", "de":"[[Arbeitslosenquote]]:", "nl":"Werkloosheidspercentage:", }, + { "de":u"[[Ausländeranteil]]:", "nl":"Percentage buitenlanders", }, + { "en":"Age distribution:", "de":"Altersstruktur:", "nl":"Leeftijdsopbouw:", }, + { "de":"Stadtteile", "nl":"wijken"}, + { "de":"[[Stadtbezirk]]e", "nl":"deelgemeenten" }, + { "de":"Stadtbezirke", "nl":"deelgemeenten" }, + { "en":"Independent", "de":"Parteilos", "nl":"geen partij" }, + { "en":"Region", "de":"[[Region]]", "nl":"Landstreek" }, + ], + "includes": ["images", "geography", "numbers"], + }, + + # translations for cities in Germany + "city-de": { + "translations": [ + { "en":"[[Bundesland]]:", "de":"[[Bundesland]]:", "nl":"[[Deelstaat (Duitsland)|Deelstaat]]", }, + { "en":"[[Regierungsbezirk]]:", "de":"[[Regierungsbezirk]]:", "nl":"[[Regierungsbezirk]]:", }, + { "en":"[[District]]:", "de":"[[Landkreis|Kreis]]:", "nl":"[[District]]", }, + { "en":"[[District]]:", "de":"[[Landkreis]]:", "nl":"[[District]]", }, + { "en":"district-free town", "de":"[[kreisfreie Stadt]]", "nl":"[[stadsdistrict]]", }, + { "en":"District-free town", "de":"[[Kreisfreie Stadt]]", "nl":"[[Stadsdistrict]]", }, + { "en":"District-free town", "de":"[[Stadtkreis]]", "nl":"[[Stadsdistrict]]", }, + { "en":"[[Municipality key]]:", "de":"[[Amtliche Gemeindekennzahl]]:", }, + { "en":"[[Municipality key]]:", "de":u"[[Amtlicher Gemeindeschlüssel]]:", }, + { "en":"urban districts", "de":"[[Stadtbezirk]]e", "nl":"stadsdelen", }, + # female first mayor, no exact translation in en: + { "en":"[[Mayor]]:", "de":u"[[Oberbürgermeisterin]]:", "nl":"[[Burgemeester]]:"}, + { "en":"[[Mayor]]:", "de":u"[[Oberbürgermeister]]in:", "nl":"[[Burgemeester]]:"}, + # male first mayor, no exact translation in en: + { "en":"[[Mayor]]:", "de":u"[[Oberbürgermeister]]:", "nl":"[[Burgemeester]]:"}, + # "bis" is used between postal codes + { "en":" to ", "de":" bis ", "nl":"t/m"}, + # some cities have demographic info which is titled "Bevölkerung" (population). The spaces are important + # because "Bevölkerung" is also a substring of "Bevölkerungsdichte (population density). 
+ { "de":u" Bevölkerung ", "nl":" Demografie ", }, + + # parties + { "en":"[[Christian Democratic Union of Germany|CDU]]", "de":"[[CDU]]", "nl":"[[Christlich Demokratische Union|CDU]]"}, + { "en":"[[Social Democratic Party of Germany|SPD]]", "de":"[[SPD]]", "nl":"[[Sozialdemokratische Partei Deutschlands|SPD]]"}, + { "en":"[[Christian Social Union in Bavaria|CSU]]", "de":"[[CSU]]", "nl":"[[CSU]]"}, + { "en":"[[Free Democratic Party of Germany|FDP]]", "de":"[[FDP (Deutschland)|FDP]]", "nl":"[[FDP]]"}, + { "en":u"[[German Green Party|Bündnis 90/Die Grünen]]", "de":u"[[Bündnis 90/Die Grünen]]", "nl":u"[[Die Grünen]]"}, + { "en":"[[Party of Democratic Socialism|PDS]]", "de":"[[PDS]]", "nl":"[[PDS]]"}, + # Bundeslaender + { "en":"[[Bavaria]]", "de":"[[Bayern]]", "nl":"[[Beieren]]"}, + { "en":"[[Bremen (state)|Bremen]]", "de":"[[Bremen (Land)|Bremen]]", "nl":"[[Bremen]]"}, + { "en":"[[Hesse]]", "de":"[[Hessen]]", "nl":"[[Hessen]]"}, + { "en":"[[Mecklenburg-Western Pomerania]]", "de":"[[Mecklenburg-Vorpommern]]", "nl":"[[Mecklenburg-Voorpommeren]]"}, + { "en":"[[Lower Saxony]]", "de":"[[Niedersachsen]]", "nl":"[[Nedersaksen]]"}, + { "en":"[[North Rhine-Westphalia]]", "de":"[[Nordrhein-Westfalen]]", "nl":"[[Noordrijn-Westfalen]]"}, + { "en":"[[Rhineland-Palatinate]]", "de":"[[Rheinland-Pfalz]]", "nl":"[[Rijnland-Palts]]"}, + { "en":"[[Saxony]]", "de":"[[Sachsen (Bundesland)|Sachsen]]", "nl":"[[Saksen (deelstaat)|Saksen]]"}, + { "en":"[[Saxony-Anhalt]]", "de":"[[Sachsen-Anhalt]]", "nl":"[[Saksen-Anhalt]]"}, + { "en":"[[Schleswig-Holstein]]", "de":"[[Schleswig-Holstein]]", "nl":"[[Sleeswijk-Holstein]]"}, + { "en":"[[Thuringia]]", "de":u"[[Thüringen]]", "nl":u"[[Thüringen]]",}, + ], + "regexes": { + "de": { + # image alt text + "Deutschlandkarte, (?P<city>.+) markiert": {"en":"Map of Germany, \g<city> marked", "nl":"Kaart van Duitsland met de locatie van \g<city>", }, + "Karte Deutschlands, (?P<city>.+) markiert": {"en":"Map of Germany, \g<city> marked", "nl":"Kaart van Duitsland met de locatie van \g<city>", }, + "Karte (?P<city>.+) in Deutschland": {"en":"Map of Germany, \g<city> marked", "nl":"Kaart van Duitsland met de locatie van \g<city>", }, + # nl: doesn't want Municipality Number + u"|[-]+ bgcolor="#FFFFFF"[\r\n]+| *[[Amtliche( Gemeindekennzahl|r Gemeindeschlüssel)]]:[ |\r\n]+[\d -]+[\r\n]+": { "nl":"", }, + }, + }, + "includes": ["city", "dates"], + + }, + + # French départements + "dep": { + "translations": [ + # some entries on fr: lack colons, others have spaces before the colons. 
+ { "de":"[[Region (Frankreich)|Region]]:", "fr":u"[[Régions françaises|Région]] :", "eo":"[[Francaj regionoj|Regiono]]:", }, + { "de":"[[Region (Frankreich)|Region]]:", "fr":u"[[Régions françaises|Région]]:", "eo":"[[Francaj regionoj|Regiono]]:", }, + { "de":u"[[Präfektur (Frankreich)|Präfektur]]:", "fr":u"[[Préfecture]] :", "eo":"[[Prefektejo]]:" }, + { "de":u"[[Präfektur (Frankreich)|Präfektur]]:", "fr":u"[[Préfecture]]:", "eo":"[[Prefektejo]]:"}, + { "de":u"[[Unterpräfektur]]en:", "fr":u"[[Sous-préfecture]]s :", }, + { "de":u"[[Unterpräfektur]]en:", "fr":u"[[Sous-préfecture]]s:", }, + { "de":u"[[Unterpräfektur]]:", "fr":u"[[Sous-préfecture]] :", }, + { "de":u"[[Unterpräfektur]]:", "fr":u"[[Sous-préfecture]]:", }, + { "de":"insgesamt", "fr":"Totale", }, + # the next three items are already in the list "geography", but someone forgot the colons on fr: + { "de":u"[[Einwohner]]:", "fr":u"[[Population]]", "eo":u"Lo\u011dantaro:", }, + { "de":u"[[Bevölkerungsdichte|Dichte]]:", "fr":u"[[Densité de population|Densité]]", }, + { "de":u"[[Fläche]]:", "fr":"[[Superficie]]", "eo":"Areo:", }, + # another workaround for a forgotten colon + { "de":"''</small>:", "fr":"''</small>", }, + { "de":"[[Arrondissement]]s:", "fr":"[[Arrondissement]]s", }, + { "de":"[[Kanton (Frankreich)|Kantone]]:", "fr":u"[[Cantons français|Cantons]]", }, + { "de":"[[Kommune (Frankreich)|Kommunen]]:", "fr":"[[Communes de France|Communes]]", }, + { "de":u"Präsident des<br>[[Generalrat (Frankreich)|Generalrats]]:", + "fr":u"[[Président du Conseil général|Président du Conseil<br> général]]", }, + ], + "regexes": { + "fr": { + "[[[aA]rrondissements (des |du |de la |de l'|d'|de )": {"de":u"[[Arrondissements im Département ", }, + "[[[cC]ommunes (des |du |de la |de l'|d'|de )": {"de":u"[[Kommunen im Département ", }, + "[[[cC]antons (des |du |de la|de l'|d'|de )": {"de":u"[[Kantone im Département ", }, + "Blason (des |du |de la |de l'|d'|de )": {"de":"Wappen von ", }, + # image alt text + "Localisation (des |du |de la |de l'|d'|de )(?P<dep>.+?) en France": {"de":"Lage von \g<dep> in Frankreich", }, + }, + }, + "includes": ["numbers", "images", "geography"], + }, +} + +import wikipedia, string, re + +class Global(object): + debug = False + +# Prints text on the screen only if in debug mode. +# Argument text should be raw unicode. +def print_debug(text): + if Global.debug: + wikipedia.output(text) + +# Translate the string given as argument 'text' from language 'from_lang' to +# language 'to_lang', using translation list 'type' in above dictionary. +# if debug_mode=True, status messages are displayed. 
+def translate(text, type, from_lang, debug_mode=False, to_lang=None):
+    if to_lang is None:
+        to_lang = wikipedia.getSite().lang
+    if debug_mode:
+        Global.debug = True
+    if type == "":
+        return text
+    else:
+        print_debug("\n Translating type " + type)
+        # check if the translation database knows this type of table
+        if not type in types:
+            print "Unknown table type: " + type
+            return
+        if "translations" in types.get(type):
+            print_debug("\nDirect translations for type " + type + "\n")
+            for item in types.get(type).get("translations"):
+                # check if the translation database includes the source language
+                if not from_lang in item:
+                    print_debug(from_lang + " translation for item not found in translation table, skipping item")
+                    continue
+                # if it's necessary to replace a substring
+                if string.find(text, item.get(from_lang)) > -1:
+                    # check if the translation database includes the target language
+                    if not to_lang in item:
+                        print_debug("Can't translate \"" + item.get(from_lang) + "\". Please make sure that there is a translation in copy_table.py.")
+                    else:
+                        print_debug(item.get(from_lang) + " => " + item.get(to_lang))
+                        # translate a substring
+                        text = string.replace(text, item.get(from_lang), item.get(to_lang))
+        if 'regexes' in types.get(type):
+            # work on regular expressions
+            print_debug("\nWorking on regular expressions for type " + type + "\n")
+            regexes = types.get(type).get("regexes")
+            if from_lang in regexes:
+                for item in regexes.get(from_lang):
+                    # only work on regular expressions that have a replacement for the target language
+                    if to_lang in regexes.get(from_lang).get(item):
+                        replacement = regexes.get(from_lang).get(item).get(to_lang)
+                        regex = re.compile(item)
+                        # if the regular expression doesn't match anyway, we don't want it to print a debug message
+                        while re.search(regex, text):
+                            print_debug(item + " => " + replacement)
+                            text = re.sub(regex, replacement, text)
+        # recursively use translation lists which are included in the current list
+        if "includes" in types.get(type):
+            for inc in types.get(type).get("includes"):
+                text = translate(text, inc, from_lang, debug_mode, to_lang)
+        return text
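A minimal sketch of how this archived module can be driven, assuming a configured pywikipedia trunk checkout on Python 2 (so that wikipedia.py and translator.py are importable); the wikitext snippet and file name below are invented for the example:

    # -*- coding: utf-8 -*-
    # Illustrative only: run from inside an old pywikipedia trunk checkout.
    import wikipedia
    import translator

    snippet = u"[[Image:Colosseum.jpg]] larger image"
    # Direct ("translations") lookup: English table phrases become German ones.
    result = translator.translate(snippet, "images", "en", to_lang="de")
    wikipedia.output(result)   # expected: [[Bild:Colosseum.jpg]] Bild vergrößern

The "regexes" and "includes" branches follow the same pattern: re.sub rules are applied for the source language, and included types are handled by recursing into translate() with the same arguments.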
Copied: archive/trunk/windows_chars.py (from rev 9461, trunk/pywikipedia/archive/windows_chars.py) =================================================================== --- archive/trunk/windows_chars.py (rev 0) +++ archive/trunk/windows_chars.py 2011-08-29 15:11:50 UTC (rev 9478) @@ -0,0 +1,147 @@ +# -*- coding: utf-8 -*- +""" +Script to replace bad Windows-1252 (cp1252) characters with +HTML entities on ISO 8859-1 wikis. Don't run this script on a UTF-8 wiki. + +Syntax: python windows_chars.py [pageTitle] [file[:filename]] [sql[:filename]] + +Command line options: + + -file:XYZ reads a list of pages, which can for exampagee be gotten through + Looxix's robot. XYZ is the name of the file from which the + list is taken. If XYZ is not given, the user is asked for a + filename. + Page titles should be in [[double-square brackets]]. + + -sql:XYZ reads a local SQL cur dump, available at + http://download.wikimedia.org/. Searches for pages with + Windows-1252 characters, and tries to repair them on the live + wiki. Example: + python windows_chars.py -sql:20040711_cur_table.sql.sql -lang:es + +""" +# +# (C) Daniel Herding, 2004 +# +# Distributed under the terms of the MIT license. +# +__version__='$Id: windows_chars.py,v 1.27 2005/12/21 17:51:26 wikipedian Exp $' +# +import wikipedia, config +import replace, pagegenerators +import re, sys + +# Summary message +msg={ + 'en':u'robot: changing Windows-1252 characters to HTML entities', + 'fa':u'ربات: تغییر نویسههای Windows-1252 به نهادهای اچتیامال', + 'de':u'Bot: Wandle Windows-1252-Zeichen in HTML-Entitäten um', + 'fr':u'Bot: Modifie caracteres Windows-1252 vers entités HTML', + 'he':u'רובוט: משנה תווים בקידוד Windows-1252 ליישויות HTML', + 'ia':u'Robot: modification de characteres Windows-1252 a entitates HTML', + } + +# characters that are in Windows-1252), but not in ISO 8859-1 +replacements = [ + (u"\x80", u"€"), # euro sign + (u"\x82", u"‚"), # single low-9 quotation mark + (u"\x83", u"ƒ"), # latin small f with hook = function = florin + (u"\x84", u"„"), # double low-9 quotation mark + (u"\x85", u"…"), # horizontal ellipsis = three dot leader + (u"\x86", u"†"), # dagger + (u"\x87", u"‡"), # double dagger + (u"\x88", u"ˆ"), # modifier letter circumflex accent + (u"\x89", u"‰"), # per mille sign + (u"\x8A", u"Š"), # latin capital letter S with caron + (u"\x8B", u"‹"), # single left-pointing angle quotation mark + (u"\x8C", u"Œ"), # latin capital ligature OE + (u"\x8E", u"Ž"), # latin capital letter Z with caron + (u"\x91", u"‘"), # left single quotation mark + (u"\x92", u"’"), # right single quotation mark + (u"\x93", u"“"), # left double quotation mark + (u"\x94", u"”"), # right double quotation mark + (u"\x95", u"•"), # bullet = black small circle + (u"\x96", u"–"), # en dash + (u"\x97", u"—"), # em dash + (u"\x98", u"˜"), # small tilde + (u"\x99", u"™"), # trade mark sign + (u"\x9A", u"š"), # latin small letter s with caron + (u"\x9B", u"&8250;"), # single right-pointing angle quotation mark + (u"\x9C", u"œ"), # latin small ligature oe + (u"\x9E", u"ž"), # latin small letter z with caron + (u"\x9F", u"Ÿ") # latin capital letter Y with diaeresis +] + +class SqlWindows1252PageGenerator: + """ + opens a local SQL dump file, searches for pages with Windows-1252 + characters. 
+ """ + def __init__(self, filename): + self.filename = filename + + def __iter__(self): + # open SQL dump and read page titles out of it + import sqldump + sqldump = sqldump.SQLdump(self.filename, 'latin-1') + for entry in sqldump.entries(): + for char in replacements.keys(): + if entry.text.find(char) != -1: + page = wikipedia.Page(wikipedia.getSite(), entry.full_title()) + yield page + break + +class WindowsCharsBot: + def __init__(self, generator): + self.generator = generator + + def run(self): + replaceBot = replace.ReplaceRobot(self.generator, replacements) + replaceBot.run() + +def main(): + # this temporary array is used to read the page title. + pageTitle = [] + gen = None + + for arg in sys.argv[1:]: + arg = wikipedia.argHandler(arg, 'windows_chars') + if arg: + if arg.startswith('-file'): + if len(arg) == 5: + filename = wikipedia.input(u'please enter the list's filename: ') + else: + filename = arg[6:] + gen = pagegenerators.TextfilePageGenerator(filename) + elif arg.startswith('-sql'): + if len(arg) == 4: + sqlfilename = wikipedia.input(u'please enter the SQL dump's filename: ') + else: + sqlfilename = arg[5:] + gen = SqlWindows1252PageGenerator(sqlfilename) + else: + pageTitle.append(arg) + + # if a single page is given as a command line argument, + # reconnect the title's parts with spaces + if pageTitle != []: + page = wikipedia.Page(wikipedia.getSite(), ' '.join(pageTitle)) + gen = iter([page]) + + # get edit summary message + wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg)) + + if not gen: + wikipedia.showHelp('windows_chars') + elif wikipedia.getSite().encoding() == "utf-8": + print "There is no need to run this robot on UTF-8 wikis." + else: + preloadingGen = pagegenerators.PreloadingGenerator(gen) + bot = WindowsCharsBot(preloadingGen) + bot.run() + +if __name__ == "__main__": + try: + main() + finally: + wikipedia.stopme()