http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11646
Revision: 11646 Author: valhallasw Date: 2013-06-12 21:24:42 +0000 (Wed, 12 Jun 2013) Log Message: ----------- + Unicode console support for Windows
This is a direct port of the current userinterfaces system from trunk, with the new logging system bolted on. This means I may have destroyed some useful bits in the process, but at least all my tests returned the right results.
Modified Paths: -------------- branches/rewrite/pywikibot/userinterfaces/cgi_interface.py branches/rewrite/pywikibot/userinterfaces/terminal_interface.py branches/rewrite/pywikibot/userinterfaces/transliteration.py
Added Paths: ----------- branches/rewrite/pywikibot/userinterfaces/terminal_interface_base.py branches/rewrite/pywikibot/userinterfaces/terminal_interface_unix.py branches/rewrite/pywikibot/userinterfaces/terminal_interface_win32.py branches/rewrite/pywikibot/userinterfaces/win32_unicode.py
Modified: branches/rewrite/pywikibot/userinterfaces/cgi_interface.py =================================================================== --- branches/rewrite/pywikibot/userinterfaces/cgi_interface.py 2013-06-12 18:59:33 UTC (rev 11645) +++ branches/rewrite/pywikibot/userinterfaces/cgi_interface.py 2013-06-12 21:24:42 UTC (rev 11646) @@ -1,10 +1,3 @@ -# -# (C) Pywikipedia bot team, 2008 -# -# Distributed under the terms of the MIT license. -# -__version__ = '$Id$' - import sys
class UI:
Modified: branches/rewrite/pywikibot/userinterfaces/terminal_interface.py =================================================================== --- branches/rewrite/pywikibot/userinterfaces/terminal_interface.py 2013-06-12 18:59:33 UTC (rev 11645) +++ branches/rewrite/pywikibot/userinterfaces/terminal_interface.py 2013-06-12 21:24:42 UTC (rev 11646) @@ -1,448 +1,14 @@ # -*- coding: utf-8 -*- # -# (C) Pywikipedia bot team, 2003-2007 +# (C) Pywikipedia bot team, 2003-2012 # # Distributed under the terms of the MIT license. # __version__ = '$Id$'
+import sys
-import traceback, re, sys -import logging -import threading -import pywikibot -from pywikibot import config -from pywikibot.bot import DEBUG, VERBOSE, INFO, STDOUT, INPUT, WARNING -from pywikibot.userinterfaces import transliteration - - -try: - import ctypes - ctypes_found = True -except ImportError: - ctypes_found = False - - -def getDefaultTextColorInWindows(): - """ - This method determines the default text color and saves its color - code inside the variable windowsColors['default']. - - Based on MIT-licensed code by Andre Burgaud published at - http://starship.python.net/crew/theller/wiki/ColorConsole - """ - if sys.platform != 'win32' or not ctypes_found: - return -1 - SHORT = ctypes.c_short - WORD = ctypes.c_ushort - - # wincon.h - class COORD(ctypes.Structure): - _fields_ = [ - ("X", SHORT), - ("Y", SHORT) - ] - - class SMALL_RECT(ctypes.Structure): - _fields_ = [ - ("Left", SHORT), - ("Top", SHORT), - ("Right", SHORT), - ("Bottom", SHORT) - ] - - class CONSOLE_SCREEN_BUFFER_INFO(ctypes.Structure): - _fields_ = [ - ("dwSize", COORD), - ("dwCursorPosition", COORD), - ("wAttributes", WORD), - ("srWindow", SMALL_RECT), - ("dwMaximumWindowSize", COORD) - ] - - std_out_handle = ctypes.windll.kernel32.GetStdHandle(-11) - csbi = CONSOLE_SCREEN_BUFFER_INFO() - ctypes.windll.kernel32.GetConsoleScreenBufferInfo(std_out_handle, ctypes.byref(csbi)) - return (csbi.wAttributes & 0x000f) - -# TODO: other colors: - #0 = Black - #1 = Blue - #2 = Green - #3 = Aqua - #4 = Red - #5 = Purple - #6 = Yellow - #7 = White - #8 = Gray - #9 = Light Blue - #10 = Light Green - #11 = Light Aqua - #12 = Light Red - #13 = Light Purple - #14 = Light Yellow - #15 = Bright White - -unixColors = { - 'default': chr(27) + '[0m', # Unix end tag to switch back to default - 'lightblue': chr(27) + '[94;1m', # Light Blue start tag - 'lightgreen': chr(27) + '[92;1m', # Light Green start tag - 'lightaqua': chr(27) + '[36;1m', # Light Aqua start tag - 'lightred': chr(27) + '[91;1m', # Light Red start 
tag - 'lightpurple': chr(27) + '[35;1m', # Light Purple start tag - 'lightyellow': chr(27) + '[33;1m', # Light Yellow start tag -} - -windowsColors = { - 'default': 7, - 'black': 0, - 'blue': 1, - 'green': 2, - 'aqua': 3, - 'red': 4, - 'purple': 5, - 'yellow': 6, - 'lightgray': 7, - 'gray': 8, - 'lightblue': 9, - 'lightgreen': 10, - 'lightaqua': 11, - 'lightred': 12, - 'lightpurple': 13, - 'lightyellow': 14, - 'white': 15, -} - -colorTagR = re.compile('\03{(?P<name>%s)}' % '|'.join(windowsColors.keys())) - -class UI(object): - def init_handlers(self, root_logger, default_stream=sys.stderr): - """Initialize the handlers for user output. - - This method initializes handler(s) for output levels VERBOSE (if - enabled by config.verbose_output), INFO, STDOUT, WARNING, ERROR, - and CRITICAL. STDOUT writes its output to sys.stdout; all the - others write theirs to sys.stderr. - - """ - - if default_stream == 'stdout': - default_stream = sys.stdout - elif default_stream == 'stderr': - default_stream = sys.stderr - - # default handler for display to terminal - default_handler = TerminalHandler(strm=default_stream) - if config.verbose_output: - default_handler.setLevel(VERBOSE) - else: - default_handler.setLevel(INFO) - # this handler ignores levels above INPUT - default_handler.addFilter(MaxLevelFilter(INPUT)) - default_handler.setFormatter( - TerminalFormatter(fmt="%(message)s%(newline)s")) - root_logger.addHandler(default_handler) - - # handler for level STDOUT - output_handler = TerminalHandler(strm=sys.stdout) - output_handler.setLevel(STDOUT) - output_handler.addFilter(MaxLevelFilter(STDOUT)) - output_handler.setFormatter( - TerminalFormatter(fmt="%(message)s%(newline)s")) - root_logger.addHandler(output_handler) - - # handler for levels WARNING and higher - warning_handler = TerminalHandler(strm=sys.stderr) - warning_handler.setLevel(logging.WARNING) - warning_handler.setFormatter( - TerminalFormatter(fmt="%(levelname)s: %(message)s%(newline)s")) - 
root_logger.addHandler(warning_handler) - - def input(self, question, password = False): - """ - Ask the user a question and return the answer. - - Works like raw_input(), but returns a unicode string instead of ASCII. - - Unlike raw_input, this function automatically adds a space after the - question. - - """ - - # sound the terminal bell to notify the user - if config.ring_bell: - sys.stdout.write('\07') - - # While we're waiting for user input, - # we don't want terminal writes from other Threads - TerminalHandler.sharedlock.acquire() - try: - pywikibot.logoutput(question + ' ', newline=False, - _level=pywikibot.INPUT) - if password and sys.stdin.isatty(): - import getpass - text = getpass.getpass('') - # See PYWP-13 / http://bugs.python.org/issue11236 - # getpass does not always raise an KeyboardInterrupt when ^C - # is pressed. - if '\x03' in text: - raise KeyboardInterrupt() - else: - text = raw_input() - finally: - TerminalHandler.sharedlock.release() - - if not isinstance(text, unicode): - text = unicode(text, config.console_encoding) - return text - - def inputChoice(self, question, options, hotkeys, default=None): - """ - Ask the user a question with a predefined list of acceptable answers. - """ - options = options[:] # we don't want to edit the passed parameter - for i in range(len(options)): - option = options[i] - hotkey = hotkeys[i] - # try to mark a part of the option name as the hotkey - m = re.search('[%s%s]' % (hotkey.lower(), hotkey.upper()), option) - if hotkey == default: - caseHotkey = hotkey.upper() - else: - caseHotkey = hotkey - if m: - pos = m.start() - options[i] = '%s[%s]%s' % (option[:pos], caseHotkey, - option[pos+1:]) - else: - options[i] = '%s [%s]' % (option, caseHotkey) - # loop until the user entered a valid choice - while True: - prompt = '%s (%s)' % (question, ', '.join(options)) - - # it's okay to enter input with the lock, RLock is reentrant. 
- answer = self.input(prompt) - if answer.lower() in hotkeys or answer.upper() in hotkeys: - return answer - elif default and answer=='': # empty string entered - return default - - def editText(self, text, jumpIndex=None, highlight=None): - """Return the text as edited by the user. - - Uses a Tkinter edit box because we don't have a console editor - - Parameters: - * text - a Unicode string - * jumpIndex - an integer: position at which to put the caret - * highlight - a substring; each occurence will be highlighted - - """ - try: - import gui - except ImportError, e: - print 'Could not load GUI modules: %s' % e - return text - editor = gui.EditBoxWindow() - return editor.edit(text, jumpIndex=jumpIndex, highlight=highlight) - - def askForCaptcha(self, url): - """Show the user a CAPTCHA image and return the answer.""" - try: - import webbrowser - pywikibot.output(u'Opening CAPTCHA in your web browser...') - webbrowser.open(url) - return pywikibot.input( - u'What is the solution of the CAPTCHA that is shown in your web browser?') - except: - pywikibot.output(u'Error in opening web browser: %s' - % sys.exc_info()[0]) - return pywikibot.input( - u'What is the solution of the CAPTCHA at %s ?' % url) - - -class TerminalHandler(logging.Handler): - """A handler class that writes logging records, appropriately formatted, to - a stream connected to a terminal. This class does not close the stream, - as sys.stdout or sys.stderr may be (and usually will be) used. - - Slightly modified version of the StreamHandler class that ships with - logging module, plus code for colorization of output. - - """ - - # create a class-level lock that can be shared by all instances - import threading - sharedlock = threading.RLock() - - def __init__(self, strm=None): - """Initialize the handler. - - If strm is not specified, sys.stderr is used. 
- - """ - logging.Handler.__init__(self) - # replace Handler's instance-specific lock with the shared class lock - # to ensure that only one instance of this handler can write to - # the console at a time - self.lock = TerminalHandler.sharedlock - if strm is None: - strm = sys.stderr - self.stream = strm - self.formatter = None - - def flush(self): - """Flush the stream. """ - self.stream.flush() - - def emit_raw(self, record, msg): - """Emit a formatted message. - - The message is written to the stream. If exception information is - present, it is formatted using traceback.print_exception and - appended to the stream. - - """ - try: - fs = "%s" - if isinstance(msg, str): - self.stream.write(fs % msg) - else: - try: - self.stream.write(fs % msg.encode(config.console_encoding, - "xmlcharrefreplace")) - except UnicodeError: - self.stream.write(fs % msg.encode("ascii", - "xmlcharrefreplace")) - self.flush() - except (KeyboardInterrupt, SystemExit): - raise - except: - self.handleError(record) - - def emitColorizedInUnix(self, record, msg): - lastColor = None - for key, value in unixColors.iteritems(): - msg = msg.replace('\03{%s}' % key, value) - # just to be sure, reset the color - msg += unixColors['default'] - self.emit_raw(record, msg) - - def emitColorizedInWindows(self, record, msg): - """This only works in Python 2.5 or higher.""" - if ctypes_found: - std_out_handle = ctypes.windll.kernel32.GetStdHandle(-11) - # Color tags might be cascaded, e.g. because of transliteration. - # Therefore we need this stack. - colorStack = [] - tagM = True - while tagM: - tagM = colorTagR.search(msg) - if tagM: - # print the text up to the tag. 
- self.emit_raw(record, msg[:tagM.start()]) - newColor = tagM.group('name') - if newColor == 'default': - if len(colorStack) > 0: - colorStack.pop() - if len(colorStack) > 0: - lastColor = colorStack[-1] - else: - lastColor = 'default' - ctypes.windll.kernel32.SetConsoleTextAttribute( - std_out_handle, windowsColors[lastColor]) - else: - colorStack.append(newColor) - # set the new color - ctypes.windll.kernel32.SetConsoleTextAttribute( - std_out_handle, windowsColors[newColor]) - msg = msg[tagM.end():] - # print the rest of the text - self.emit_raw(record, msg) - # just to be sure, reset the color - ctypes.windll.kernel32.SetConsoleTextAttribute( - std_out_handle, windowsColors['default']) - else: - # ctypes is only available since Python 2.5, and we won't - # try to colorize without it. Instead we add *** after the text - # as a whole if anything needed to be colorized. - lines = msg.split('\n') - for line in lines: - line, count = colorTagR.subn('', line) - if count > 0: - line += '***' - line += '\n' - self.emit_raw(record, line) - - def emit(self, record): - text = self.format(record) - if config.transliterate: - # Encode unicode string in the encoding used by the user's console, - # and decode it back to unicode. Then we can see which characters - # can't be represented in the console encoding. - codecedText = text.encode(config.console_encoding, 'replace' - ).decode(config.console_encoding) - transliteratedText = list(codecedText) - # Note: A transliteration replacement might be longer than the - # original character; e.g., ч is transliterated to ch. - # the resulting list will have as many elements as there are - # characters in the original text, but some list elements may - # contain multiple characters - prev = "-" - prevchar = -1 - cursor = 0 - while cursor < len(codecedText): - char = codecedText.find(u"?", cursor) - if char == -1: - break - cursor = char + 1 - # work on characters that couldn't be encoded, but not on - # original question marks. 
- if text[char] != u"?": - if char > 0 and prevchar != char - 1: - prev = transliteratedText[char-1] - try: - transliterated = transliteration.trans( - text[char], default='?', - prev=prev, next=text[char+1]) - except IndexError: - transliterated = transliteration.trans( - text[char], default='?', - prev=prev, next=' ') - # transliteration was successful. The replacement - # could consist of multiple letters. - # mark the transliterated letters in yellow. - transliteratedText[char] = u'\03{lightyellow}%s\03{default}' \ - % transliterated - # save the last transliterated character - prev = transliterated[-1:] - prevchar = char - text = u"".join(transliteratedText) - if config.colorized_output: - if sys.platform == 'win32': - self.emitColorizedInWindows(record, text) - else: - self.emitColorizedInUnix(record, text) - else: - self.emit_raw(record, text) - - -class TerminalFormatter(logging.Formatter): - pass - - -class MaxLevelFilter(logging.Filter): - """Filter that only passes records at or below a specific level. - - (setting handler level only passes records at or *above* a specified level, - so this provides the opposite functionality) - - """ - def __init__(self, level=None): - self.level = level - - def filter(self, record): - if self.level: - return record.levelno <= self.level - else: - return True +if sys.platform == 'win32': + from terminal_interface_win32 import Win32UI as UI +else: + from terminal_interface_unix import UnixUI as UI
Added: branches/rewrite/pywikibot/userinterfaces/terminal_interface_base.py =================================================================== --- branches/rewrite/pywikibot/userinterfaces/terminal_interface_base.py (rev 0) +++ branches/rewrite/pywikibot/userinterfaces/terminal_interface_base.py 2013-06-12 21:24:42 UTC (rev 11646) @@ -0,0 +1,323 @@ +# -*- coding: utf-8 -*- +# +# (C) Pywikipedia bot team, 2003-2013 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id$' + +import transliteration +import traceback, re, sys +import pywikibot as wikipedia +from pywikibot import config +from pywikibot.bot import DEBUG, VERBOSE, INFO, STDOUT, INPUT, WARNING +import logging + +transliterator = transliteration.transliterator(config.console_encoding) + +colors = [ + 'default', + 'black', + 'blue', + 'green', + 'aqua', + 'red', + 'purple', + 'yellow', + 'lightgray', + 'gray', + 'lightblue', + 'lightgreen', + 'lightaqua', + 'lightred', + 'lightpurple', + 'lightyellow', + 'white', +] + +colorTagR = re.compile('\03{(?P<name>%s)}' % '|'.join(colors)) + + +class UI: + def __init__(self): + self.stdin = sys.stdin + self.stdout = sys.stdout + self.stderr = sys.stderr + self.encoding = config.console_encoding + self.transliteration_target = config.transliteration_target + + self.stderr = sys.stderr + self.stdout = sys.stdout + + def init_handlers(self, root_logger, default_stream='stderr'): + """Initialize the handlers for user output. + + This method initializes handler(s) for output levels VERBOSE (if + enabled by config.verbose_output), INFO, STDOUT, WARNING, ERROR, + and CRITICAL. STDOUT writes its output to sys.stdout; all the + others write theirs to sys.stderr. 
+ + """ + + if default_stream == 'stdout': + default_stream = self.stdout + elif default_stream == 'stderr': + default_stream = self.stderr + + # default handler for display to terminal + default_handler = TerminalHandler(self, strm=default_stream) + if config.verbose_output: + default_handler.setLevel(VERBOSE) + else: + default_handler.setLevel(INFO) + # this handler ignores levels above INPUT + default_handler.addFilter(MaxLevelFilter(INPUT)) + default_handler.setFormatter( + TerminalFormatter(fmt="%(message)s%(newline)s")) + root_logger.addHandler(default_handler) + + # handler for level STDOUT + output_handler = TerminalHandler(self, strm=self.stdout) + output_handler.setLevel(STDOUT) + output_handler.addFilter(MaxLevelFilter(STDOUT)) + output_handler.setFormatter( + TerminalFormatter(fmt="%(message)s%(newline)s")) + root_logger.addHandler(output_handler) + + # handler for levels WARNING and higher + warning_handler = TerminalHandler(self, strm=self.stderr) + warning_handler.setLevel(logging.WARNING) + warning_handler.setFormatter( + TerminalFormatter(fmt="%(levelname)s: %(message)s%(newline)s")) + root_logger.addHandler(warning_handler) + + def printNonColorized(self, text, targetStream): + # We add *** after the text as a whole if anything needed to be colorized. + lines = text.split('\n') + for i,line in enumerate(lines): + if i > 0: + line = "\n" + line + line, count = colorTagR.subn('', line) + if count > 0: + line += ' ***' + targetStream.write(line.encode(self.encoding, 'replace')) + + printColorized = printNonColorized + + def _print(self, text, targetStream): + if config.colorized_output: + self.printColorized(text, targetStream) + else: + self.printNonColorized(text, targetStream) + + def output(self, text, toStdout=False, targetStream=None): + """ + If a character can't be displayed in the encoding used by the user's + terminal, it will be replaced with a question mark or by a + transliteration. 
+ + """ + if config.transliterate: + # Encode our unicode string in the encoding used by the user's + # console, and decode it back to unicode. Then we can see which + # characters can't be represented in the console encoding. + # We need to take min(console_encoding, transliteration_target) + # the first is what the terminal is capable of + # the second is how unicode-y the user would like the output + codecedText = text.encode(self.encoding, + 'replace').decode(self.encoding) + if self.transliteration_target: + codecedText = codecedText.encode(self.transliteration_target, + 'replace').decode(self.transliteration_target) + transliteratedText = '' + # Note: A transliteration replacement might be longer than the + # original character, e.g. ч is transliterated to ch. + prev = "-" + for i in xrange(len(codecedText)): + # work on characters that couldn't be encoded, but not on + # original question marks. + if codecedText[i] == '?' and text[i] != u'?': + try: + transliterated = transliterator.transliterate( + text[i], default='?', prev=prev, next=text[i+1]) + except IndexError: + transliterated = transliterator.transliterate( + text[i], default = '?', prev=prev, next=' ') + # transliteration was successful. The replacement + # could consist of multiple letters. + # mark the transliterated letters in yellow. + transliteratedText += '\03{lightyellow}%s\03{default}' \ + % transliterated + transLength = len(transliterated) + # memorize if we replaced a single letter by multiple + # letters. + if len(transliterated) > 0: + prev = transliterated[-1] + else: + # no need to try to transliterate. + transliteratedText += codecedText[i] + prev = codecedText[i] + text = transliteratedText + + if not targetStream: + if toStdout: + targetStream = self.stdout + else: + targetStream = self.stderr + + self._print(text, targetStream) + + def _raw_input(self): + return raw_input() + + def input(self, question, password = False): + """ + Ask the user a question and return the answer. 
+ + Works like raw_input(), but returns a unicode string instead of ASCII. + + Unlike raw_input, this function automatically adds a space after the + question. + + """ + + # sound the terminal bell to notify the user + if config.ring_bell: + sys.stdout.write('\07') + # TODO: make sure this is logged as well + self.output(question + ' ') + if password: + import getpass + text = getpass.getpass('') + else: + text = self._raw_input() + text = unicode(text, self.encoding) + return text + + def inputChoice(self, question, options, hotkeys, default=None): + """ + Ask the user a question with a predefined list of acceptable answers. + """ + options = options[:] # we don't want to edit the passed parameter + for i in range(len(options)): + option = options[i] + hotkey = hotkeys[i] + # try to mark a part of the option name as the hotkey + m = re.search('[%s%s]' % (hotkey.lower(), hotkey.upper()), option) + if hotkey == default: + caseHotkey = hotkey.upper() + else: + caseHotkey = hotkey + if m: + pos = m.start() + options[i] = '%s[%s]%s' % (option[:pos], caseHotkey, + option[pos+1:]) + else: + options[i] = '%s [%s]' % (option, caseHotkey) + # loop until the user entered a valid choice + while True: + prompt = '%s (%s)' % (question, ', '.join(options)) + answer = self.input(prompt) + if answer.lower() in hotkeys or answer.upper() in hotkeys: + return answer + elif default and answer=='': # empty string entered + return default + + def editText(self, text, jumpIndex=None, highlight=None): + """Return the text as edited by the user. 
+ + Uses a Tkinter edit box because we don't have a console editor + + Parameters: + * text - a Unicode string + * jumpIndex - an integer: position at which to put the caret + * highlight - a substring; each occurence will be highlighted + + """ + try: + import gui + except ImportError, e: + print 'Could not load GUI modules: %s' % e + return text + editor = gui.EditBoxWindow() + return editor.edit(text, jumpIndex=jumpIndex, highlight=highlight) + + def askForCaptcha(self, url): + """Show the user a CAPTCHA image and return the answer.""" + try: + import webbrowser + wikipedia.output(u'Opening CAPTCHA in your web browser...') + if webbrowser.open(url): + return wikipedia.input( + u'What is the solution of the CAPTCHA that is shown in ' + u'your web browser?') + else: + raise + except: + wikipedia.output(u'Error in opening web browser: %s' + % sys.exc_info()[0]) + wikipedia.output( + u'Please copy this url to your web browser and open it:\n %s' + % url) + return wikipedia.input( + u'What is the solution of the CAPTCHA at this url ?') + +class TerminalHandler(logging.Handler): + """A handler class that writes logging records, appropriately formatted, to + a stream connected to a terminal. This class does not close the stream, + as sys.stdout or sys.stderr may be (and usually will be) used. + + Slightly modified version of the StreamHandler class that ships with + logging module, plus code for colorization of output. + + """ + + # create a class-level lock that can be shared by all instances + import threading + sharedlock = threading.RLock() + + def __init__(self, UI, strm=None): + """Initialize the handler. + + If strm is not specified, sys.stderr is used. 
+ + """ + logging.Handler.__init__(self) + # replace Handler's instance-specific lock with the shared class lock + # to ensure that only one instance of this handler can write to + # the console at a time + self.lock = TerminalHandler.sharedlock + if strm is None: + strm = sys.stderr + self.stream = strm + self.formatter = None + self.UI = UI + + def flush(self): + """Flush the stream. """ + self.stream.flush() + + def emit(self, record): + text = self.format(record) + return self.UI.output(text, targetStream = self.stream) + + +class TerminalFormatter(logging.Formatter): + pass + + +class MaxLevelFilter(logging.Filter): + """Filter that only passes records at or below a specific level. + + (setting handler level only passes records at or *above* a specified level, + so this provides the opposite functionality) + + """ + def __init__(self, level=None): + self.level = level + + def filter(self, record): + if self.level: + return record.levelno <= self.level + else: + return True \ No newline at end of file
Property changes on: branches/rewrite/pywikibot/userinterfaces/terminal_interface_base.py ___________________________________________________________________ Added: svn:keywords + Author Date Id Revision Added: svn:executable + *
Added: branches/rewrite/pywikibot/userinterfaces/terminal_interface_unix.py =================================================================== --- branches/rewrite/pywikibot/userinterfaces/terminal_interface_unix.py (rev 0) +++ branches/rewrite/pywikibot/userinterfaces/terminal_interface_unix.py 2013-06-12 21:24:42 UTC (rev 11646) @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- +# +# (C) Pywikipedia bot team, 2003-2012 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id$' + +import terminal_interface_base + +# TODO: other colors: + #0 = Black + #1 = Blue + #2 = Green + #3 = Aqua + #4 = Red + #5 = Purple + #6 = Yellow + #7 = White + #8 = Gray + #9 = Light Blue + #10 = Light Green + #11 = Light Aqua + #12 = Light Red + #13 = Light Purple + #14 = Light Yellow + #15 = Bright White + +unixColors = { + 'default': chr(27) + '[0m', # Unix end tag to switch back to default + 'lightblue': chr(27) + '[94;1m', # Light Blue start tag + 'lightgreen': chr(27) + '[92;1m', # Light Green start tag + 'lightaqua': chr(27) + '[36;1m', # Light Aqua start tag + 'lightred': chr(27) + '[91;1m', # Light Red start tag + 'lightpurple': chr(27) + '[35;1m', # Light Purple start tag + 'lightyellow': chr(27) + '[33;1m', # Light Yellow start tag +} + +class UnixUI(terminal_interface_base.UI): + def printColorized(self, text, targetStream): + lastColor = None + totalcount = 0 + for key, value in unixColors.iteritems(): + ckey = '\03{%s}' % key + totalcount += text.count(ckey) + text = text.replace(ckey, value) + + if totalcount > 0: + # just to be sure, reset the color + text += unixColors['default'] + + targetStream.write(text.encode(self.encoding, 'replace'))
Property changes on: branches/rewrite/pywikibot/userinterfaces/terminal_interface_unix.py ___________________________________________________________________ Added: svn:executable + * Added: svn:keywords + Author Date Id Revision
Added: branches/rewrite/pywikibot/userinterfaces/terminal_interface_win32.py =================================================================== --- branches/rewrite/pywikibot/userinterfaces/terminal_interface_win32.py (rev 0) +++ branches/rewrite/pywikibot/userinterfaces/terminal_interface_win32.py 2013-06-12 21:24:42 UTC (rev 11646) @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- +# +# (C) Pywikipedia bot team, 2003-2012 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id$' + +import re +import terminal_interface_base + +try: + import ctypes + ctypes_found = True +except ImportError: + ctypes_found = False + +windowsColors = { + 'default': 7, + 'black': 0, + 'blue': 1, + 'green': 2, + 'aqua': 3, + 'red': 4, + 'purple': 5, + 'yellow': 6, + 'lightgray': 7, + 'gray': 8, + 'lightblue': 9, + 'lightgreen': 10, + 'lightaqua': 11, + 'lightred': 12, + 'lightpurple': 13, + 'lightyellow': 14, + 'white': 15, +} + +colorTagR = re.compile('\03{(?P<name>%s)}' % '|'.join(windowsColors.keys())) + +# Compat for python <= 2.5 +class Win32BaseUI(terminal_interface_base.UI): + def __init__(self): + terminal_interface_base.UI.__init__(self) + self.encoding = 'ascii' + + +class Win32CtypesUI(Win32BaseUI): + def __init__(self): + Win32BaseUI.__init__(self) + from win32_unicode import stdin, stdout, stderr + self.stdin = stdin + self.stdout = stdout + self.stderr = stderr + self.encoding = 'utf-8' + + def printColorized(self, text, targetStream): + std_out_handle = ctypes.windll.kernel32.GetStdHandle(-11) + # Color tags might be cascaded, e.g. because of transliteration. + # Therefore we need this stack. + import win32api + colorStack = [] + tagM = True + while tagM: + tagM = colorTagR.search(text) + if tagM: + # print the text up to the tag. 
+ targetStream.write(text[:tagM.start()].encode(self.encoding, 'replace')) + newColor = tagM.group('name') + if newColor == 'default': + if len(colorStack) > 0: + colorStack.pop() + if len(colorStack) > 0: + lastColor = colorStack[-1] + else: + lastColor = 'default' + ctypes.windll.kernel32.SetConsoleTextAttribute(std_out_handle, windowsColors[lastColor]) + else: + colorStack.append(newColor) + # set the new color + ctypes.windll.kernel32.SetConsoleTextAttribute(std_out_handle, windowsColors[newColor]) + text = text[tagM.end():] + # print the rest of the text + targetStream.write(text.encode(self.encoding, 'replace')) + # just to be sure, reset the color + ctypes.windll.kernel32.SetConsoleTextAttribute(std_out_handle, windowsColors['default']) + + def _raw_input(self): + data = self.stdin.readline() + if '\x1a' in data: + raise EOFError() + return data.strip() + +if ctypes_found: + Win32UI = Win32CtypesUI +else: + Win32UI = Win32BaseUI
Property changes on: branches/rewrite/pywikibot/userinterfaces/terminal_interface_win32.py ___________________________________________________________________ Added: svn:executable + * Added: svn:keywords + Author Date Id Revision
Modified: branches/rewrite/pywikibot/userinterfaces/transliteration.py =================================================================== --- branches/rewrite/pywikibot/userinterfaces/transliteration.py 2013-06-12 18:59:33 UTC (rev 11645) +++ branches/rewrite/pywikibot/userinterfaces/transliteration.py 2013-06-12 21:24:42 UTC (rev 11646) @@ -1,1907 +1,1686 @@ -# -*- coding: utf-8 -*- -# -# (C) Pywikipedia bot team, 2006-2008 -# -# Distributed under the terms of the MIT license. -# -def trans(char, default = '?', prev = '-', next = '-'): - # Give a transliteration for char, or default if none is known - # Accented etc. Latin characters - if char in u"ÀÁÂẦẤẪẨẬÃĀĂẰẮẴẶẲȦǠẠḀȂĄǍẢ": - return u"A" - if char in u"ȀǞ": - return u"Ä" - if char == u"Ǻ": - return u"Å" - if char == u"Ä": - return u"Ae" - if char == u"Å": - return u"Aa" - if char in u"àáâầấẫẩậãāăằắẵặẳȧǡạḁȃąǎảẚ": - return u"a" - if char in u"ȁǟ": - return u"ä" - if char == u"ǻ": - return u"å" - if char == u"ä": - return u"ae" - if char == u"å": - return u"aa" - if char in u"ḂḄḆƁƂ": - return u"B" - if char in u"ḃḅḇƀɓƃ": - return u"b" - if char in u"ĆĈĊÇČƇ": - return u"C" - if char in u"ćĉċçčƈȼ": - return u"c" - if char == u"Ḉ": - return u"Ç" - if char == u"ḉ": - return u"ç" - if char == u"Ð": - return u"Dh" - if char == u"ð": - return u"dh" - if char in u"ĎḊḌḎḐḒĐƉƊƋ": - return u"D" - if char in u"ďḋḍḏḑḓđɖɗƌ": - return u"d" - if char in u"ÈȄÉÊḚËĒḔḖĔĖẸE̩ȆȨḜĘĚẼḘẺ": - return u"E" - if char in u"ỀẾỄỆỂ": - return u"Ê" - if char in u"èȅéêḛëēḕḗĕėẹe̩ȇȩḝęěẽḙẻ": - return u"e" - if char in u"ềếễệể": - return u"ê" - if char in u"ḞƑ": - return u"F" - if char in u"ḟƒ": - return u"f" - if char in u"ǴḠĞĠĢǦǤƓ": - return u"G" - if char in u"ǵḡğġģǧǥɠ": - return u"g" - if char == u"Ĝ": - return u"Gx" - if char == u"ĝ": - return u"gx" - if char in u"ḢḤḦȞḨḪH̱ĦǶ": - return u"H" - if char in u"ḣḥḧȟḩḫ̱ẖħƕ": - return u"h" - if char in u"IÌȈÍÎĨḬÏḮĪĬȊĮǏİỊỈƗ": - return u"I" - if char in u"ıìȉíîĩḭïḯīĭȋįǐiịỉɨ": - return u"i" - if char in u"ĴJ": 
- return u"J" - if char in u"ɟĵ̌ǰ": - return u"j" - if char in u"ḰǨĶḲḴƘ": - return u"K" - if char in u"ḱǩķḳḵƙ": - return u"k" - if char in u"ĹĻĽḶḸḺḼȽŁ": - return u"L" - if char in u"ĺļľḷḹḻḽƚłɫ": - return u"l" - if char in u"ḾṀṂ": - return u"M" - if char in u"ḿṁṃɱ": - return u"m" - if char in u"ǸŃÑŅŇṄṆṈṊŊƝɲȠ": - return u"N" - if char in u"ǹńñņňṅṇṉṋŋɲƞ": - return u"n" - if char in u"ÒÓÔÕṌṎȬÖŌṐṒŎǑȮȰỌǪǬƠỜỚỠỢỞỎƟØǾ": - return u"O" - if char in u"òóôõṍṏȭöōṑṓŏǒȯȱọǫǭơờớỡợởỏɵøǿ": - return u"o" - if char in u"ȌŐȪ": - return u"Ö" - if char in u"ȍőȫ": - return u"ö" - if char in u"ỒỐỖỘỔȎ": - return u"Ô" - if char in u"ồốỗộổȏ": - return u"ô" - if char in u"ṔṖƤ": - return u"P" - if char in u"ṕṗƥ": - return u"p" - if char == u"ᵽ": - return u"q" - if char in u"ȐŔŖŘȒṘṚṜṞ": - return u"R" - if char in u"ȑŕŗřȓṙṛṝṟɽ": - return u"r" - if char in u"ŚṤŞȘŠṦṠṢṨ": - return u"S" - if char in u"śṥşșšṧṡṣṩȿ": - return u"s" - if char == u"Ŝ": - return u"Sx" - if char == u"ŝ": - return u"sx" - if char in u"ŢȚŤṪṬṮṰŦƬƮ": - return u"T" - if char in u"ţțťṫṭṯṱŧȾƭʈ": - return u"t" - if char in u"ÙÚŨṸṴÜṲŪṺŬỤŮŲǓṶỦƯỮỰỬ": - return u"U" - if char in u"ùúũṹṵüṳūṻŭụůųǔṷủưữựửʉ": - return u"u" - if char in u"ȔŰǛǗǕǙ": - return u"Ü" - if char in u"ȕűǜǘǖǚ": - return u"ü" - if char == u"Û": - return u"Ux" - if char == u"û": - return u"ux" - if char == u"Ȗ": - return u"Û" - if char == u"ȗ": - return u"û" - if char == u"Ừ": - return u"Ù" - if char == u"ừ": - return u"ù" - if char == u"Ứ": - return u"Ú" - if char == u"ứ": - return u"ú" - if char in u"ṼṾ": - return u"V" - if char in u"ṽṿ": - return u"v" - if char in u"ẀẂŴẄẆẈ": - return u"W" - if char in u"ẁẃŵẅẇẉ": - return u"w" - if char in u"ẊẌ": - return u"X" - if char in u"ẋẍ": - return u"x" - if char in u"ỲÝŶŸỸȲẎỴỶƳ": - return u"Y" - if char in u"ỳýŷÿỹȳẏỵỷƴ": - return u"y" - if char in u"ŹẐŻẒŽẔƵȤ": - return u"Z" - if char in u"źẑżẓžẕƶȥ": - return u"z" - if char == u"ɀ": - return u"zv" +# -*- coding: utf-8 -*- +__version__ = '$Id$'
- # Latin: extended Latin alphabet - if char == u"ɑ": - return u"a" - if char in u"ÆǼǢ": - return u"AE" - if char in u"æǽǣ": - return u"ae" - if char == u"Ð": - return u"Dh" - if char == u"ð": - return u"dh" - if char in u"ƎƏƐ": - return u"E" - if char in u"ǝəɛ": - return u"e" - if char in u"ƔƢ": - return u"G" - if char in u"ᵷɣƣᵹ": - return u"g" - if char == u"Ƅ": - return u"H" - if char == u"ƅ": - return u"h" - if char == u"Ƕ": - return u"Wh" - if char == u"ƕ": - return u"wh" - if char == u"Ɩ": - return u"I" - if char == u"ɩ": - return u"i" - if char == u"Ŋ": - return u"Ng" - if char == u"ŋ": - return u"ng" - if char == u"Œ": - return u"OE" - if char == u"œ": - return u"oe" - if char == u"Ɔ": - return u"O" - if char == u"ɔ": - return u"o" - if char == u"Ȣ": - return u"Ou" - if char == u"ȣ": - return u"ou" - if char == u"Ƽ": - return u"Q" - if char in u"ĸƽ": - return u"q" - if char == u"ȹ": - return u"qp" - if char == u"": - return u"r" - if char == u"ſ": - return u"s" - if char == u"ß": - return u"ss" - if char == u"Ʃ": - return u"Sh" - if char == u"ʃᶋ": - return u"sh" - if char == u"Ʉ": - return u"U" - if char == u"ʉ": - return u"u" - if char == u"Ʌ": - return u"V" - if char == u"ʌ": - return u"v" - if char in u"ƜǷ": - return u"W" - if char in u"ɯƿ": - return u"w" - if char == u"Ȝ": - return u"Y" - if char == u"ȝ": - return u"y" - if char == u"IJ": - return u"IJ" - if char == u"ij": - return u"ij" - if char == u"Ƨ": - return u"Z" - if char in u"ʮƨ": - return u"z" - if char == u"Ʒ": - return u"Zh" - if char == u"ʒ": - return u"zh" - if char == u"Ǯ": - return u"Dzh" - if char == u"ǯ": - return u"dzh" - if char in u"ƸƹʔˀɁɂ": - return u"'" - if char in u"Þ": - return u"Th" - if char in u"þ": - return u"th" - if char in u"Cʗǃ": - return u"!"
- #Punctuation and typography - if char in u"«»“”„¨": - return u'"' - if char in u"‘’′": - return u"'" - if char == u"•": - return u"*" - if char == u"@": - return u"(at)" - if char == u"¤": - return u"$" - if char == u"¢": - return u"c" - if char == u"€": - return u"E" - if char == u"£": - return u"L" - if char == u"¥": - return u"yen" - if char == u"†": - return u"+" - if char == u"‡": - return u"++" - if char == u"°": - return u":" - if char == u"¡": - return u"!" - if char == u"¿": - return u"?" - if char == u"‰": - return u"o/oo" - if char == u"‱": - return u"o/ooo" - if char in u"¶§": - return u">" - if char in u"…": - return u"..." - if char in u"‒–—―": - return u"-" - if char in u"·": - return u" " - if char == u"¦": - return u"|" - if char == u"⁂": - return u"***" - if char == u"◊": - return u"<>" - if char == u"‽": - return u"?!" - if char == u"؟": - return u";-)" +class transliterator(object): + def __init__(self, encoding): + self.trans = {} + for char in u"ÀÁÂẦẤẪẨẬÃĀĂẰẮẴẶẲȦǠẠḀȂĄǍẢ": + self.trans[char] = u"A" + for char in u"ȀǞ": + self.trans[char] = u"Ä" + self.trans[u"Ǻ"] = u"Å" + self.trans[u"Ä"] = u"Ae" + self.trans[u"Å"] = u"Aa" + for char in u"àáâầấẫẩậãāăằắẵặẳȧǡạḁȃąǎảẚ": + self.trans[char] = u"a" + for char in u"ȁǟ": + self.trans[char] = u"ä" + self.trans[u"ǻ"] = u"å" + self.trans[u"ä"] = u"ae" + self.trans[u"å"] = u"aa" + for char in u"ḂḄḆƁƂ": + self.trans[char] = u"B" + for char in u"ḃḅḇƀɓƃ": + self.trans[char] = u"b" + for char in u"ĆĈĊÇČƇ": + self.trans[char] = u"C" + for char in u"ćĉċçčƈȼ": + self.trans[char] = u"c" + self.trans[u"Ḉ"] = u"Ç" + self.trans[u"ḉ"] = u"ç" + self.trans[u"Ð"] = u"Dh" + self.trans[u"ð"] = u"dh" + for char in u"ĎḊḌḎḐḒĐƉƊƋ": + self.trans[char] = u"D" + for char in u"ďḋḍḏḑḓđɖɗƌ": + self.trans[char] = u"d" + for char in u"ÈȄÉÊḚËĒḔḖĔĖẸE̩ȆȨḜĘĚẼḘẺ": + self.trans[char] = u"E" + for char in u"ỀẾỄỆỂ": + self.trans[char] = u"Ê" + for char in u"èȅéêḛëēḕḗĕėẹe̩ȇȩḝęěẽḙẻ": + self.trans[char] = u"e" + for char in u"ềếễệể": + 
self.trans[char] = u"ê" + for char in u"ḞƑ": + self.trans[char] = u"F" + for char in u"ḟƒ": + self.trans[char] = u"f" + for char in u"ǴḠĞĠĢǦǤƓ": + self.trans[char] = u"G" + for char in u"ǵḡğġģǧǥɠ": + self.trans[char] = u"g" + self.trans[u"Ĝ"] = u"Gx" + self.trans[u"ĝ"] = u"gx" + for char in u"ḢḤḦȞḨḪH̱ĦǶ": + self.trans[char] = u"H" + for char in u"ḣḥḧȟḩḫ̱ẖħƕ": + self.trans[char] = u"h" + for char in u"IÌȈÍÎĨḬÏḮĪĬȊĮǏİỊỈƗ": + self.trans[char] = u"I" + for char in u"ıìȉíîĩḭïḯīĭȋįǐiịỉɨ": + self.trans[char] = u"i" + for char in u"ĴJ": + self.trans[char] = u"J" + for char in u"ɟĵ̌ǰ": + self.trans[char] = u"j" + for char in u"ḰǨĶḲḴƘ": + self.trans[char] = u"K" + for char in u"ḱǩķḳḵƙ": + self.trans[char] = u"k" + for char in u"ĹĻĽḶḸḺḼȽŁ": + self.trans[char] = u"L" + for char in u"ĺļľḷḹḻḽƚłɫ": + self.trans[char] = u"l" + for char in u"ḾṀṂ": + self.trans[char] = u"M" + for char in u"ḿṁṃɱ": + self.trans[char] = u"m" + for char in u"ǸŃÑŅŇṄṆṈṊŊƝɲȠ": + self.trans[char] = u"N" + for char in u"ǹńñņňṅṇṉṋŋɲƞ": + self.trans[char] = u"n" + for char in u"ÒÓÔÕṌṎȬÖŌṐṒŎǑȮȰỌǪǬƠỜỚỠỢỞỎƟØǾ": + self.trans[char] = u"O" + for char in u"òóôõṍṏȭöōṑṓŏǒȯȱọǫǭơờớỡợởỏɵøǿ": + self.trans[char] = u"o" + for char in u"ȌŐȪ": + self.trans[char] = u"Ö" + for char in u"ȍőȫ": + self.trans[char] = u"ö" + for char in u"ỒỐỖỘỔȎ": + self.trans[char] = u"Ô" + for char in u"ồốỗộổȏ": + self.trans[char] = u"ô" + for char in u"ṔṖƤ": + self.trans[char] = u"P" + for char in u"ṕṗƥ": + self.trans[char] = u"p" + self.trans[u"ᵽ"] = u"q" + for char in u"ȐŔŖŘȒṘṚṜṞ": + self.trans[char] = u"R" + for char in u"ȑŕŗřȓṙṛṝṟɽ": + self.trans[char] = u"r" + for char in u"ŚṤŞȘŠṦṠṢṨ": + self.trans[char] = u"S" + for char in u"śṥşșšṧṡṣṩȿ": + self.trans[char] = u"s" + self.trans[u"Ŝ"] = u"Sx" + self.trans[u"ŝ"] = u"sx" + for char in u"ŢȚŤṪṬṮṰŦƬƮ": + self.trans[char] = u"T" + for char in u"ţțťṫṭṯṱŧȾƭʈ": + self.trans[char] = u"t" + for char in u"ÙÚŨṸṴÜṲŪṺŬỤŮŲǓṶỦƯỮỰỬ": + self.trans[char] = u"U" + for char in u"ùúũṹṵüṳūṻŭụůųǔṷủưữựửʉ": + 
self.trans[char] = u"u" + for char in u"ȔŰǛǗǕǙ": + self.trans[char] = u"Ü" + for char in u"ȕűǜǘǖǚ": + self.trans[char] = u"ü" + self.trans[u"Û"] = u"Ux" + self.trans[u"û"] = u"ux" + self.trans[u"Ȗ"] = u"Û" + self.trans[u"ȗ"] = u"û" + self.trans[u"Ừ"] = u"Ù" + self.trans[u"ừ"] = u"ù" + self.trans[u"Ứ"] = u"Ú" + self.trans[u"ứ"] = u"ú" + for char in u"ṼṾ": + self.trans[char] = u"V" + for char in u"ṽṿ": + self.trans[char] = u"v" + for char in u"ẀẂŴẄẆẈ": + self.trans[char] = u"W" + for char in u"ẁẃŵẅẇẉ": + self.trans[char] = u"w" + for char in u"ẊẌ": + self.trans[char] = u"X" + for char in u"ẋẍ": + self.trans[char] = u"x" + for char in u"ỲÝŶŸỸȲẎỴỶƳ": + self.trans[char] = u"Y" + for char in u"ỳýŷÿỹȳẏỵỷƴ": + self.trans[char] = u"y" + for char in u"ŹẐŻẒŽẔƵȤ": + self.trans[char] = u"Z" + for char in u"źẑżẓžẕƶȥ": + self.trans[char] = u"z" + self.trans[u"ɀ"] = u"zv"
+ # Latin: extended Latin alphabet + self.trans[u"ɑ"] = u"a" + for char in u"ÆǼǢ": + self.trans[char] = u"AE" + for char in u"æǽǣ": + self.trans[char] = u"ae" + self.trans[u"Ð"] = u"Dh" + self.trans[u"ð"] = u"dh" + for char in u"ƎƏƐ": + self.trans[char] = u"E" + for char in u"ǝəɛ": + self.trans[char] = u"e" + for char in u"ƔƢ": + self.trans[char] = u"G" + for char in u"ᵷɣƣᵹ": + self.trans[char] = u"g" + self.trans[u"Ƅ"] = u"H" + self.trans[u"ƅ"] = u"h" + self.trans[u"Ƕ"] = u"Wh" + self.trans[u"ƕ"] = u"wh" + self.trans[u"Ɩ"] = u"I" + self.trans[u"ɩ"] = u"i" + self.trans[u"Ŋ"] = u"Ng" + self.trans[u"ŋ"] = u"ng" + self.trans[u"Œ"] = u"OE" + self.trans[u"œ"] = u"oe" + self.trans[u"Ɔ"] = u"O" + self.trans[u"ɔ"] = u"o" + self.trans[u"Ȣ"] = u"Ou" + self.trans[u"ȣ"] = u"ou" + self.trans[u"Ƽ"] = u"Q" + for char in u"ĸƽ": + self.trans[char] = u"q" + self.trans[u"ȹ"] = u"qp" + self.trans[u""] = u"r" + self.trans[u"ſ"] = u"s" + self.trans[u"ß"] = u"ss" + self.trans[u"Ʃ"] = u"Sh" + for char in u"ʃᶋ": + self.trans[char] = u"sh" + self.trans[u"Ʉ"] = u"U" + self.trans[u"ʉ"] = u"u" + self.trans[u"Ʌ"] = u"V" + self.trans[u"ʌ"] = u"v" + for char in u"ƜǷ": + self.trans[char] = u"W" + for char in u"ɯƿ": + self.trans[char] = u"w" + self.trans[u"Ȝ"] = u"Y" + self.trans[u"ȝ"] = u"y" + self.trans[u"IJ"] = u"IJ" + self.trans[u"ij"] = u"ij" + self.trans[u"Ƨ"] = u"Z" + for char in u"ʮƨ": + self.trans[char] = u"z" + self.trans[u"Ʒ"] = u"Zh" + self.trans[u"ʒ"] = u"zh" + self.trans[u"Ǯ"] = u"Dzh" + self.trans[u"ǯ"] = u"dzh" + for char in u"ƸƹʔˀɁɂ": + self.trans[char] = u"'" + for char in u"Þ": + self.trans[char] = u"Th" + for char in u"þ": + self.trans[char] = u"th" + for char in u"Cʗǃ": + self.trans[char] = u"!"
- # Cyrillic - if char == u"А": - return u"A" - if char == u"а": - return u"a" - if char == u"Б": - return u"B" - if char == u"б": - return u"b" - if char == u"В": - return u"V" - if char == u"в": - return u"v" - if char == u"Г": - return u"G" - if char == u"г": - return u"g" - if char == u"Д": - return u"D" - if char == u"д": - return u"d" - if char == u"Е": - return u"E" - if char == u"е": - return u"e" - if char == u"Ж": - return u"Zh" - if char == u"ж": - return u"zh" - if char == u"З": - return u"Z" - if char == u"з": - return u"z" - if char == u"И": - return u"I" - if char == u"и": - return u"i" - if char == u"Й": - return u"J" - if char == u"й": - return u"j" - if char == u"К": - return u"K" - if char == u"к": - return u"k" - if char == u"Л": - return u"L" - if char == u"л": - return u"l" - if char == u"М": - return u"M" - if char == u"м": - return u"m" - if char == u"Н": - return u"N" - if char == u"н": - return u"n" - if char == u"О": - return u"O" - if char == u"о": - return u"o" - if char == u"П": - return u"P" - if char == u"п": - return u"p" - if char == u"Р": - return u"R" - if char == u"р": - return u"r" - if char == u"С": - return u"S" - if char == u"с": - return u"s" - if char == u"Т": - return u"T" - if char == u"т": - return u"t" - if char in u"У": - return u"U" - if char == u"у": - return u"u" - if char == u"Ф": - return u"F" - if char == u"ф": - return u"f" - if char in u"ХΧ": - if prev.lower() == prev: - return u"Kh" - else: - return u"KH" - if char == u"х": - return u"kh" - if char == u"Ц": - return u"C" - if char == u"ц": - return u"c" - if char == u"Ч": - return u"Ch" - if char == u"ч": - return u"ch" - if char == u"Ш": - return u"Sh" - if char == u"ш": - return u"sh" - if char == u"Щ": - return u"Shch" - if char == u"щ": - return u"shch" - if char in u"Ьь": - return u"'" - if char in u"Ъъ": - return '"' - if char == u"Ю": - return u"Yu" - if char == u"ю": - return u"yu" - if char == u"Я": - return u"Ya" - if char == u"я": - return u"ya" - 
# Additional Cyrillic letters, most occuring in only one or a few languages - if char == u"Ы": - return u"Y" - if char == u"ы": - return u"y" - if char == u"Ё": - return u"Ë" - if char == u"ё": - return u"ë" - if char in u"ЭЀ": - return u"È" - if char in u"эѐ": - return u"è" - if char == u"І": - return u"I" - if char == u"і": - return u"i" - if char == u"Ї": - return u"Ji" - if char == u"ї": - return u"ji" - if char == u"Є": - return u"Je" - if char == u"є": - return u"je" - if char in u"ҐҜ": - return u"G" - if char in u"ґҝ": - return u"g" - if char == u"Ђ": - return u"Dj" - if char == u"ђ": - return u"dj" - if char in u"ЈӤҊ": - return u"J" - if char in u"јӥҋ": - return u"j" - if char == u"Ӣ": - return u"Y" - if char == u"ӣ": - return u"y" - if char == u"Љ": - return u"Lj" - if char == u"љ": - return u"lj" - if char == u"Њ": - return u"Nj" - if char == u"њ": - return u"nj" - if char == u"Ћ": - return u"Cj" - if char == u"ћ": - return u"cj" - if char in u"ЏӁӜҶ": - return u"Dzh" - if char in u"џӂӝҷ": - return u"dzh" - if char == u"Җ": - return u"Zhj" - if char == u"җ": - return u"zhj" - if char in u"ЅӞӠӋҸ": - return u"Dz" - if char in u"ѕӟӡӌҹ": - return u"dz" - if char == u"Ѓ": - return u"Gj" - if char == u"ѓ": - return u"gj" - if char == u"Ќ": - return u"Kj" - if char == u"ќ": - return u"kj" - if char in u"ҒӶҔ": - return u"G" - if char in u"ғӷҕ": - return u"g" - if char == u"Ӣ": - return u"Ii" - if char == u"ӣ": - return u"ii" - if char in u"ҚҞҠӃ": - return u"Q" - if char == u"қҟҡӄ": - return u"q" - if char == u"Ӯ": - return u"U" - if char == u"ӯ": - return u"u" - if char == u"Ҳ": - return u"H" - if char == u"ҳ": - return u"h" - if char == u"Ҷ": - return u"Dz" - if char == u"ҷ": - return u"dz" - if char in u"ӨӪ": - return u"Ô" - if char in u"өӫ": - return u"ô" - if char == u"Ү": - return u"Y" - if char == u"ү": - return u"y" - if char == u"Һ": - return u"H" - if char == u"һ": - return u"h" - if char in u"ӘӔ": - return u"AE" - if char == u"ә": - return u"ae" - if 
char == u"ӚӬ": - return u"Ë" - if char == u"ӛӭ": - return u"ë" - if char == u"Җ": - return u"Zhj" - if char == u"җ": - return u"zhj" - if char == u"ҢҤӉӇ": - return u"Ng" - if char == u"ңҥӊӈ": - return u"ng" - if char == u"Ұ": - return u"U" - if char == u"ұ": - return u"u" - if char == u"ў": - return u"ù" - if char == u"Ў": - return u"Ù" - if char == u"ѝ": - return u"ì" - if char == u"Ѝ": - return u"Ì" - if char == u"Ӑ": - return u"A" - if char == u"ă": - return u"a" - if char == u"Ӓ": - return u"Ä" - if char == u"ä": - return u"ä" - if char in u"ӖѢҌ": - return u"E" - if char in u"ӗѣҍ": - return u"e" - if char == u"ҼҾ": - return u"Ts" - if char == u"ҽҿ": - return u"ts" - if char == u"Ҙ": - return u"Dh" - if char == u"ҙ": - return u"dh" - if char in u"Ӏӏ": - return u"" - if char == u"Ӆ": - return u"L" - if char == u"ӆ": - return u"l" - if char == u"Ӎ": - return u"M" - if char == u"ӎ": - return u"m" - if char == u"Ӧ": - return u"Ö" - if char == u"ӧ": - return u"ö" - if char == u"Ҩ": - return u"u" - if char == u"ҩ": - return u"u" - if char == u"Ҧ": - return u"Ph" - if char == u"ҧ": - return u"ph" - if char == u"Ҏ": - return u"R" - if char == u"ҏ": - return u"r" - if char == u"Ҫ": - return u"Th" - if char == u"ҫ": - return u"th" - if char == u"Ҭ": - return u"T" - if char == u"ҭ": - return u"t" - if char in u"ӲӰҮ": - return u"Ü" - if char in u"ӳӱү": - return u"ü" - if char == u"Ӯ": - return u"Û" - if char == u"ӯ": - return u"û" - if char == u"ҰӸ": - return u"U" - if char == u"ұӹ": - return u"u" - if char == u"Ҵ": - return u"Tts" - if char == u"ҵ": - return u"tts" - if char == u"Ӵ": - return u"Ch" - if char == u"ӵ": - return u"ch" + #Punctuation and typography + for char in u"«»“”„¨": + self.trans[char] = u'"' + for char in u"‘’′": + self.trans[char] = u"'" + self.trans[u"•"] = u"*" + self.trans[u"@"] = u"(at)" + self.trans[u"¤"] = u"$" + self.trans[u"¢"] = u"c" + self.trans[u"€"] = u"E" + self.trans[u"£"] = u"L" + self.trans[u"¥"] = u"yen" + self.trans[u"†"] = u"+" + 
self.trans[u"‡"] = u"++" + self.trans[u"°"] = u":" + self.trans[u"¡"] = u"!" + self.trans[u"¿"] = u"?" + self.trans[u"‰"] = u"o/oo" + self.trans[u"‱"] = u"o/ooo" + for char in u"¶§": + self.trans[char] = u">" + for char in u"…": + self.trans[char] = u"..." + for char in u"‒–—―": + self.trans[char] = u"-" + for char in u"·": + self.trans[char] = u" " + self.trans[u"¦"] = u"|" + self.trans[u"⁂"] = u"***" + self.trans[u"◊"] = u"<>" + self.trans[u"‽"] = u"?!" + self.trans[u"؟"] = u";-)" + self.trans[u"¹"] = u"1" + self.trans[u"²"] = u"2" + self.trans[u"³"] = u"3"
- # Archaic Cyrillic letters - if char == u"Ѹ": - return u"Ou" - if char == u"ѹ": - return u"ou" - if char in u"ѠѺ": - return u"O" - if char in u"ѡѻ": - return u"o" - if char == u"Ѿ": - return u"Ot" - if char == u"ѿ": - return u"ot" - if char == u"Ѣ": - return u"E" - if char == u"ѣ": - return u"e" - if char in u"ѤѦ": - return u"Ei" - if char in u"ѥѧ": - return u"ei" - if char == u"Ѫ": - return u"Ai" - if char == u"ѫ": - return u"ai" - if char == u"Ѯ": - return u"X" - if char == u"ѯ": - return u"x" - if char == u"Ѱ": - return u"Ps" - if char == u"ѱ": - return u"ps" - if char == u"Ѳ": - return u"Th" - if char == u"ѳ": - return u"th" - if char in u"ѴѶ": - return u"Ü" - if char == u"ѵ": - return u"ü" + # Cyrillic + self.trans.update({u"А" : u"A", u"а" : u"a", u"Б" : u"B", u"б" : u"b", + u"В" : u"V", u"в" : u"v", u"Г" : u"G", u"г" : u"g", + u"Д" : u"D", u"д" : u"d", u"Е" : u"E", u"е" : u"e", + u"Ж" : u"Zh", u"ж" : u"zh", u"З" : u"Z", u"з" : u"z", + u"И" : u"I", u"и" : u"i", u"Й" : u"J", u"й" : u"j", + u"К" : u"K", u"к" : u"k", u"Л" : u"L", u"л" : u"l", + u"М" : u"M", u"м" : u"m", u"Н" : u"N", u"н" : u"n", + u"О" : u"O", u"о" : u"o", u"П" : u"P", u"п" : u"p", + u"Р" : u"R", u"р" : u"r", u"С" : u"S", u"с" : u"s", + u"Т" : u"T", u"т" : u"t", u"У" : u"U", u"у" : u"u", + u"Ф" : u"F", u"ф" : u"f", u"х" : u"kh", u"Ц" : u"C", + u"ц" : u"c", u"Ч" : u"Ch", u"ч" : u"ch", u"Ш" : u"Sh", + u"ш" : u"sh", u"Щ" : u"Shch", u"щ" : u"shch", u"Ь" : u"'", + u"ь" : "'", u"Ъ" : u'"', u"ъ" : '"', u"Ю" : u"Yu", + u"ю" : u"yu", u"Я" : u"Ya", u"я" : u"ya", u"Х" : u"Kh", + u"Χ" : u"Kh"})
+ # Additional Cyrillic letters, most occuring in only one or a few languages + self.trans.update({u"Ы" : u"Y", u"ы" : u"y", u"Ё" : u"Ë", u"ё" : u"ë", + u"Э" : u"È", u"Ѐ" : u"È", u"э" : u"è", u"ѐ" : u"è", + u"І" : u"I", u"і" : u"i", u"Ї" : u"Ji", u"ї" : u"ji", + u"Є" : u"Je", u"є" : u"je", u"Ґ" : u"G", u"Ҝ" : u"G", + u"ґ" : u"g", u"ҝ" : u"g", u"Ђ" : u"Dj", u"ђ" : u"dj", + u"Ӣ" : u"Y", u"ӣ" : u"y", u"Љ" : u"Lj", u"љ" : u"lj", + u"Њ" : u"Nj", u"њ" : u"nj", u"Ћ" : u"Cj", u"ћ" : u"cj", + u"Җ" : u"Zhj", u"җ" : u"zhj", u"Ѓ" : u"Gj", u"ѓ" : u"gj", + u"Ќ" : u"Kj", u"ќ" : u"kj", u"Ӣ" : u"Ii", u"ӣ" : u"ii", + u"Ӯ" : u"U", u"ӯ" : u"u", u"Ҳ" : u"H", u"ҳ" : u"h", + u"Ҷ" : u"Dz",u"ҷ" : u"dz", u"Ө" :u"Ô", u"Ӫ" : u"Ô", + u"ө" : u"ô", u"ӫ" : u"ô", u"Ү": u"Y", u"ү": u"y", u"Һ": u"H", + u"һ": u"h", u"Ә": u"AE", u"Ӕ": u"AE", u"ә": u"ae", + u"Ӛ": u"Ë", u"Ӭ": u"Ë", u"ӛ": u"ë", u"ӭ": u"ë", u"Җ": u"Zhj", + u"җ": u"zhj", u"Ұ": u"U", u"ұ": u"u", u"ў": u"ù", u"Ў": u"Ù", + u"ѝ": u"ì", u"Ѝ": u"Ì", u"Ӑ": u"A", u"ă": u"a", u"Ӓ": u"Ä", + u"ҿ": u"ä", u"Ҽ" : u"Ts", u"Ҿ": u"Ts", u"ҽ": u"ts", u"ҿ": u"ts", + u"Ҙ": u"Dh", u"ҙ": u"dh", u"Ӏ": u"", u"ӏ": u"", u"Ӆ": u"L", + u"ӆ": u"l", u"Ӎ": u"M", u"ӎ": u"m", u"Ӧ": u"Ö", u"ӧ": u"ö", + u"Ҩ": u"u", u"ҩ": u"u", u"Ҧ": u"Ph", u"ҧ": u"ph", u"Ҏ": u"R", + u"ҏ": u"r", u"Ҫ": u"Th", u"ҫ": u"th", u"Ҭ": u"T", u"ҭ": u"t", + u"Ӯ": u"Û", u"ӯ": u"û", u"Ұ": u"U", u"Ӹ": u"U", u"ұ": u"u", + u"ӹ": u"u", u"Ҵ": u"Tts", u"ҵ": u"tts", u"Ӵ": u"Ch", u"ӵ": u"ch"})
- # Hebrew alphabet - if char in u"אע": - return u"'" - if char == u"ב": - return u"b" - if char == u"ג": - return u"g" - if char == u"ד": - return u"d" - if char == u"ה": - return u"h" - if char == u"ו": - return u"v" - if char == u"ז": - return u"z" - if char == u"ח": - return u"kh" - if char == u"ט": - return u"t" - if char == u"י": - return u"y" - if char in u"ךכ": - return u"k" - if char == u"ל": - return u"l" - if char in u"םמ": - return u"m" - if char in u"ןנ": - return u"n" - if char == u"ס": - return u"s" - if char in u"ףפ": - return u"ph" - if char in u"ץצ": - return u"ts" - if char == u"ק": - return u"q" - if char == u"ר": - return u"r" - if char == u"ש": - return u"sh" - if char == u"ת": - return u"th" + for char in u"ЈӤҊ": + self.trans[char] = u"J" + for char in u"јӥҋ": + self.trans[char] = u"j" + for char in u"ЏӁӜҶ": + self.trans[char] = u"Dzh" + for char in u"џӂӝҷ": + self.trans[char] = u"dzh" + for char in u"ЅӞӠӋҸ": + self.trans[char] = u"Dz" + for char in u"ѕӟӡӌҹ": + self.trans[char] = u"dz" + for char in u"ҒӶҔ": + self.trans[char] = u"G" + for char in u"ғӷҕ": + self.trans[char] = u"g" + for char in u"ҚҞҠӃ": + self.trans[char] = u"Q" + for char in u"қҟҡӄ": + self.trans[char] = u"q" + for char in u"ҢҤӉӇ": + self.trans[char] = u"Ng" + for char in u"ңҥӊӈ": + self.trans[char] = u"ng" + for char in u"ӖѢҌ": + self.trans[char] = u"E" + for char in u"ӗѣҍ": + self.trans[char] = u"e" + for char in u"ӲӰҮ": + self.trans[char] = u"Ü" + for char in u"ӳӱү": + self.trans[char] = u"ü"
- # Arab alphabet - if char in u"اﺍﺎ": - return u"a" - if char in u"بﺏﺐﺒﺑ": - return u"b" - if char in u"تﺕﺖﺘﺗ": - return u"t" - if char in u"ثﺙﺚﺜﺛ": - return u"th" - if char in u"جﺝﺞﺠﺟ": - return u"g" - if char in u"حﺡﺢﺤﺣ": - return u"h" - if char in u"خﺥﺦﺨﺧ": - return u"kh" - if char in u"دﺩﺪ": - return u"d" - if char in u"ذﺫﺬ": - return u"dh" - if char in u"رﺭﺮ": - return u"r" - if char in u"زﺯﺰ": - return u"z" - if char in u"سﺱﺲﺴﺳ": - return u"s" - if char in u"شﺵﺶﺸﺷ": - return u"sh" - if char in u"صﺹﺺﺼﺻ": - return u"s" - if char in u"ضﺽﺾﻀﺿ": - return u"d" - if char in u"طﻁﻂﻄﻃ": - return u"t" - if char in u"ظﻅﻆﻈﻇ": - return u"z" - if char in u"عﻉﻊﻌﻋ": - return u"'" - if char in u"غﻍﻎﻐﻏ": - return u"gh" - if char in u"فﻑﻒﻔﻓ": - return u"f" - if char in u"قﻕﻖﻘﻗ": - return u"q" - if char in u"كﻙﻚﻜﻛک": - return u"k" - if char in u"لﻝﻞﻠﻟ": - return u"l" - if char in u"مﻡﻢﻤﻣ": - return u"m" - if char in u"نﻥﻦﻨﻧ": - return u"n" - if char in u"هﻩﻪﻬﻫ": - return u"h" - if char in u"وﻭﻮ": - return u"w" - if char in u"یيﻱﻲﻴﻳ": - return u"y" - # Arabic - additional letters, modified letters and ligatures - if char == u"ﺀ": - return u"'" - if char in u"آﺁﺂ": - return u"'a" - if char in u"ةﺓﺔ": - return u"th" - if char in u"ىﻯﻰ": - return u"á" - if char in u"یﯼﯽﯿﯾ": - return u"y" - if char == u"؟": - return u"?" 
- # Arabic - ligatures - if char in u"ﻻﻼ": - return u"la" - if char == u"ﷲ": - return u"llah" - if char in u"إأ": - return u"a'" - if char == u"ؤ": - return u"w'" - if char == u"ئ": - return u"y'" - if char == u"◌": - return prev - if char in u"◌◌": - return u"" # indicates absence of vowels - # Arabic vowels - if char == u"◌": - return u"a" - if char == u"◌": - return u"u" - if char == u"◌": - return u"i" - if char == u"◌": - return u"a" - if char == u"◌": - return u"ay" - if char == u"◌": - return u"ay" - if char == u"◌": - return u"u" - if char == u"◌": - return u"iy" - # Arab numerals - if char in u"٠۰": - return u"0" - if char in u"١۱": - return u"1" - if char in u"٢۲": - return u"2" - if char in u"٣۳": - return u"3" - if char in u"٤۴": - return u"4" - if char in u"٥۵": - return u"5" - if char in u"٦۶": - return u"6" - if char in u"٧۷": - return u"7" - if char in u"٨۸": - return u"8" - if char in u"٩۹": - return u"9" - # Perso-Arabic - if char in u"پﭙﭙپ": - return u"p" - if char in u"چچچچ": - return u"ch" - if char in u"ژژ": - return u"zh" - if char in u"گﮔﮕﮓ": - return u"g" + # Archaic Cyrillic letters + self.trans.update({u"Ѹ": u"Ou", u"ѹ": u"ou", u"Ѡ": u"O", u"Ѻ": u"O", u"ѡ": u"o", + u"ѻ": u"o", u"Ѿ": u"Ot", u"ѿ": u"ot", u"Ѣ": u"E", u"ѣ": u"e", + u"Ѥ": u"Ei", u"Ѧ": u"Ei", u"ѥ": u"ei", u"ѧ": u"ei", u"Ѫ": u"Ai", + u"ѫ": u"ai", u"Ѯ": u"X", u"ѯ": u"x", u"Ѱ": u"Ps", u"ѱ": u"ps", + u"Ѳ": u"Th", u"ѳ": u"th", u"Ѵ": u"Ü", u"Ѷ": u"Ü", u"ѵ": u"ü"})
- # Greek - if char == u"Α": - return u"A" - if char == u"α": - return u"a" - if char == u"Β": - return u"B" - if char == u"β": - return u"b" - if char == u"Γ": - return u"G" - if char == u"γ": - return u"g" - if char == u"Δ": - return u"D" - if char == u"δ": - return u"d" - if char == u"Ε": - return u"E" - if char == u"ε": - return u"e" - if char == u"Ζ": - return u"Z" - if char == u"ζ": - return u"z" - if char == u"Η": - return u"I" - if char == u"η": - return u"i" - if char == u"Θ": - if prev.lower() == prev: - return u"Th" - else: - return u"TH" - if char == u"θ": - return u"th" - if char == u"Ι": - return u"I" - if char == u"ι": - return u"i" - if char == u"Κ": - return u"K" - if char == u"κ": - return u"k" - if char == u"Λ": - return u"L" - if char == u"λ": - return u"l" - if char == u"Μ": - return u"M" - if char == u"μ": - return u"m" - if char == u"Ν": - return u"N" - if char == u"ν": - return u"n" - if char == u"Ξ": - return u"X" - if char == u"ξ": - return u"x" - if char == u"Ο": - return u"O" - if char == u"ο": - return u"o" - if char == u"Π": - return u"P" - if char == u"π": - return u"p" - if char == u"Ρ": - return u"R" - if char == u"ρ": - return u"r" - if char == u"Σ": - return u"S" - if char in u"σς": - return u"s" - if char == u"Τ": - return u"T" - if char == u"τ": - return u"t" - if char == u"Υ": - return u"Y" - if char == u"υ": - return u"y" - if char == u"Φ": - return u"F" - if char == u"φ": - return u"f" - if char == u"Ψ": - if prev.lower() == prev: - return u"Ps" - else: - return u"PS" - if char == u"ψ": - return u"ps" - if char == u"Ω": - return u"O" - if char == u"ω": - return u"o" - # Greek: Special and old characters - if char == u"ϗ": - return u"&" - if char == u"Ϛ": - if prev.lower() == prev: - return u"St" - else: - return u"ST" - if char == u"ϛ": - return u"st" - if char in u"ϘϞ": - return u"Q" - if char in u"ϙϟ": - return u"q" - if char == u"Ϻ": - return u"S" - if char == u"ϻ": - return u"s" - if char == u"Ϡ": - if prev.lower() == 
prev: - return u"Ss" - else: - return u"SS" - if char == u"ϡ": - return u"ss" - if char == u"Ϸ": - if prev.lower() == prev: - return u"Sh" - else: - return u"SH" - if char == u"ϸ": - return u"sh" - if char == u"·": - return u":" - # Greek: Accented characters - if char == u"Ά": - return u"Á" - if char == u"ά": - return u"á" - if char in u"ΈΉ": - return u"É" - if char in u"έή": - return u"é" - if char == u"Ί": - return u"Í" - if char == u"ί": - return u"í" - if char == u"Ϊ": - return u"Ï" - if char in u"ϊΐ": - return u"ï" - if char == u"Ό": - return u"Ó" - if char == u"ό": - return u"ó" - if char == u"Ύ": - return u"Ý" - if char == u"ύ": - return u"ý" - if char == u"Ϋ": - return u"Y" - if char in u"ϋΰ": - return u"ÿ" - if char == u"Ώ": - return u"Ó" - if char == u"ώ": - return u"ó" + # Hebrew alphabet + for char in u"אע": + self.trans[char] = u"'" + self.trans[u"ב"] = u"b" + self.trans[u"ג"] = u"g" + self.trans[u"ד"] = u"d" + self.trans[u"ה"] = u"h" + self.trans[u"ו"] = u"v" + self.trans[u"ז"] = u"z" + self.trans[u"ח"] = u"kh" + self.trans[u"ט"] = u"t" + self.trans[u"י"] = u"y" + for char in u"ךכ": + self.trans[char] = u"k" + self.trans[u"ל"] = u"l" + for char in u"םמ": + self.trans[char] = u"m" + for char in u"ןנ": + self.trans[char] = u"n" + self.trans[u"ס"] = u"s" + for char in u"ףפ": + self.trans[char] = u"ph" + for char in u"ץצ": + self.trans[char] = u"ts" + self.trans[u"ק"] = u"q" + self.trans[u"ר"] = u"r" + self.trans[u"ש"] = u"sh" + self.trans[u"ת"] = u"th"
- # Japanese (katakana and hiragana) - if char in u"アァあ": - return u"a" - if char in u"イィい": - return u"i" - if char in u"ウう": - return u"u" - if char in u"エェえ": - return u"e" - if char in u"オォお": - return u"o" - if char in u"ャや": - return u"ya" - if char in u"ュゆ": - return u"yu" - if char in u"ョよ": - return u"yo" - if char in u"カか": - return u"ka" - if char in u"キき": - return u"ki" - if char in u"クく": - return u"ku" - if char in u"ケけ": - return u"ke" - if char in u"コこ": - return u"ko" - if char in u"サさ": - return u"sa" - if char in u"シし": - return u"shi" - if char in u"スす": - return u"su" - if char in u"セせ": - return u"se" - if char in u"ソそ": - return u"so" - if char in u"タた": - return u"ta" - if char in u"チち": - return u"chi" - if char in u"ツつ": - return u"tsu" - if char in u"テて": - return u"te" - if char in u"トと": - return u"to" - if char in u"ナな": - return u"na" - if char in u"ニに": - return u"ni" - if char in u"ヌぬ": - return u"nu" - if char in u"ネね": - return u"ne" - if char in u"ノの": - return u"no" - if char in u"ハは": - return u"ha" - if char in u"ヒひ": - return u"hi" - if char in u"フふ": - return u"fu" - if char in u"ヘへ": - return u"he" - if char in u"ホほ": - return u"ho" - if char in u"マま": - return u"ma" - if char in u"ミみ": - return u"mi" - if char in u"ムむ": - return u"mu" - if char in u"メめ": - return u"me" - if char in u"モも": - return u"mo" - if char in u"ラら": - return u"ra" - if char in u"リり": - return u"ri" - if char in u"ルる": - return u"ru" - if char in u"レれ": - return u"re" - if char in u"ロろ": - return u"ro" - if char in u"ワわ": - return u"wa" - if char in u"ヰゐ": - return u"wi" - if char in u"ヱゑ": - return u"we" - if char in u"ヲを": - return u"wo" - if char in u"ンん": - return u"n" - if char in u"ガが": - return u"ga" - if char in u"ギぎ": - return u"gi" - if char in u"グぐ": - return u"gu" - if char in u"ゲげ": - return u"ge" - if char in u"ゴご": - return u"go" - if char in u"ザざ": - return u"za" - if char in u"ジじ": - return u"ji" - if char in u"ズず": - return u"zu" - 
if char in u"ゼぜ": - return u"ze" - if char in u"ゾぞ": - return u"zo" - if char in u"ダだ": - return u"da" - if char in u"ヂぢ": - return u"dji" - if char in u"ヅづ": - return u"dzu" - if char in u"デで": - return u"de" - if char in u"ドど": - return u"do" - if char in u"バば": - return u"ba" - if char in u"ビび": - return u"bi" - if char in u"ブぶ": - return u"bu" - if char in u"ベべ": - return u"be" - if char in u"ボぼ": - return u"bo" - if char in u"パぱ": - return u"pa" - if char in u"ピぴ": - return u"pi" - if char in u"プぷ": - return u"pu" - if char in u"ペぺ": - return u"pe" - if char in u"ポぽ": - return u"po" - if char in u"ヴゔ": - return u"vu" - if char == u"ヷ": - return u"va" - if char == u"ヸ": - return u"vi" - if char == u"ヹ": - return u"ve" - if char == u"ヺ": - return u"vo" - if char == u"ッ": - return trans(next)[0] + # Arab alphabet + for char in u"اﺍﺎ": + self.trans[char] = u"a" + for char in u"بﺏﺐﺒﺑ": + self.trans[char] = u"b" + for char in u"تﺕﺖﺘﺗ": + self.trans[char] = u"t" + for char in u"ثﺙﺚﺜﺛ": + self.trans[char] = u"th" + for char in u"جﺝﺞﺠﺟ": + self.trans[char] = u"g" + for char in u"حﺡﺢﺤﺣ": + self.trans[char] = u"h" + for char in u"خﺥﺦﺨﺧ": + self.trans[char] = u"kh" + for char in u"دﺩﺪ": + self.trans[char] = u"d" + for char in u"ذﺫﺬ": + self.trans[char] = u"dh" + for char in u"رﺭﺮ": + self.trans[char] = u"r" + for char in u"زﺯﺰ": + self.trans[char] = u"z" + for char in u"سﺱﺲﺴﺳ": + self.trans[char] = u"s" + for char in u"شﺵﺶﺸﺷ": + self.trans[char] = u"sh" + for char in u"صﺹﺺﺼﺻ": + self.trans[char] = u"s" + for char in u"ضﺽﺾﻀﺿ": + self.trans[char] = u"d" + for char in u"طﻁﻂﻄﻃ": + self.trans[char] = u"t" + for char in u"ظﻅﻆﻈﻇ": + self.trans[char] = u"z" + for char in u"عﻉﻊﻌﻋ": + self.trans[char] = u"'" + for char in u"غﻍﻎﻐﻏ": + self.trans[char] = u"gh" + for char in u"فﻑﻒﻔﻓ": + self.trans[char] = u"f" + for char in u"قﻕﻖﻘﻗ": + self.trans[char] = u"q" + for char in u"كﻙﻚﻜﻛک": + self.trans[char] = u"k" + for char in u"لﻝﻞﻠﻟ": + self.trans[char] = u"l" + for char in u"مﻡﻢﻤﻣ": + 
self.trans[char] = u"m" + for char in u"نﻥﻦﻨﻧ": + self.trans[char] = u"n" + for char in u"هﻩﻪﻬﻫ": + self.trans[char] = u"h" + for char in u"وﻭﻮ": + self.trans[char] = u"w" + for char in u"یيﻱﻲﻴﻳ": + self.trans[char] = u"y" + # Arabic - additional letters, modified letters and ligatures + self.trans[u"ﺀ"] = u"'" + for char in u"آﺁﺂ": + self.trans[char] = u"'a" + for char in u"ةﺓﺔ": + self.trans[char] = u"th" + for char in u"ىﻯﻰ": + self.trans[char] = u"á" + for char in u"یﯼﯽﯿﯾ": + self.trans[char] = u"y" + self.trans[u"؟"] = u"?" + # Arabic - ligatures + for char in u"ﻻﻼ": + self.trans[char] = u"la" + self.trans[u"ﷲ"] = u"llah" + for char in u"إأ": + self.trans[char] = u"a'" + self.trans[u"ؤ"] = u"w'" + self.trans[u"ئ"] = u"y'" + for char in u"◌◌": + self.trans[char] = u"" # indicates absence of vowels + # Arabic vowels + self.trans[u"◌"] = u"a" + self.trans[u"◌"] = u"u" + self.trans[u"◌"] = u"i" + self.trans[u"◌"] = u"a" + self.trans[u"◌"] = u"ay" + self.trans[u"◌"] = u"ay" + self.trans[u"◌"] = u"u" + self.trans[u"◌"] = u"iy" + # Arab numerals + for char in u"٠۰": + self.trans[char] = u"0" + for char in u"١۱": + self.trans[char] = u"1" + for char in u"٢۲": + self.trans[char] = u"2" + for char in u"٣۳": + self.trans[char] = u"3" + for char in u"٤۴": + self.trans[char] = u"4" + for char in u"٥۵": + self.trans[char] = u"5" + for char in u"٦۶": + self.trans[char] = u"6" + for char in u"٧۷": + self.trans[char] = u"7" + for char in u"٨۸": + self.trans[char] = u"8" + for char in u"٩۹": + self.trans[char] = u"9" + # Perso-Arabic + for char in u"پﭙﭙپ": + self.trans[char] = u"p" + for char in u"چچچچ": + self.trans[char] = u"ch" + for char in u"ژژ": + self.trans[char] = u"zh" + for char in u"گﮔﮕﮓ": + self.trans[char] = u"g"
- # Japanese and Chinese punctuation and typography - if char == u"・·": - return u" " - if char == u"々仝ヽヾゝゞ〱〲〳〵〴〵": - return prev - if char in u"〃『』《》": - return u'"' - if char in u"「」〈〉〘〙〚〛": - return u"'" - if char in u"(〔": - return u"(" - if char in u")〕": - return u")" - if char in u"[【〖": - return u"[" - if char in u"]】〗": - return u"]" - if char == u"{": - return u"{" - if char == u"}": - return u"}" - if char == u"っ": - return u":" - if char == u"ー": - return u"h" - if char == u"゛": - return u"'" - if char == u"゜": - return u"p" - if char == u"。": - return u". " - if char == u"、": - return u", " - if char == u"・": - return u" " - if char == u"〆": - return u"shime" - if char == u"〜": - return u"-" - if char == u"…": - return u"..." - if char == u"‥": - return u".." - if char == u"ヶ": - return u"months" - if char in u"•◦": - return u"_" - if char in u"※*": - return u"*" - if char == u"Ⓧ": - return u"(X)" - if char == u"Ⓨ": - return u"(Y)" - if char == u"!": - return u"!" - if char == u"?": - return u"?" - if char == u";": - return u";" - if char == u":": - return u":" - if char == u"。": - return u"." 
- if char in u",、": - return u"," + # Greek + self.trans.update({u"Α": u"A", u"α": u"a", u"Β": u"B", u"β": u"b", u"Γ": u"G", + u"γ": u"g", u"Δ": u"D", u"δ": u"d", u"Ε": u"E", u"ε": u"e", + u"Ζ": u"Z", u"ζ": u"z", u"Η": u"I", u"η": u"i", u"θ": u"th", + u"Θ": u"Th", u"Ι": u"I", u"ι": u"i", u"Κ": u"K", u"κ": u"k", + u"Λ": u"L", u"λ": u"l", u"Μ": u"M", u"μ": u"m", u"Ν": u"N", + u"ν": u"n", u"Ξ": u"X", u"ξ": u"x", u"Ο": u"O", u"ο": u"o", + u"Π": u"P", u"π": u"p", u"Ρ": u"R", u"ρ": u"r", u"Σ": u"S", + u"σ": u"s", u"ς": u"s", u"Τ": u"T", u"τ": u"t", u"Υ": u"Y", + u"υ": u"y", u"Φ": u"F", u"φ": u"f", u"Ψ": u"Ps", u"ψ": u"ps", + u"Ω": u"O", u"ω": u"o", u"ϗ": u"&", u"Ϛ": u"St", u"ϛ": u"st", + u"Ϙ": u"Q", u"Ϟ": u"Q", u"ϙ": u"q", u"ϟ": u"q", u"Ϻ": u"S", + u"ϻ": u"s", u"Ϡ": u"Ss", u"ϡ": u"ss", u"Ϸ": u"Sh", u"ϸ": u"sh", + u"·": u":", u"Ά": u"Á", u"ά": u"á", u"Έ": u"É", u"Ή": u"É", + u"έ": u"é", u"ή": u"é", u"Ί": u"Í", u"ί": u"í", u"Ϊ": u"Ï", + u"ϊ": u"ï", u"ΐ": u"ï", u"Ό": u"Ó", u"ό": u"ó", u"Ύ": u"Ý", + u"ύ": u"ý", u"Ϋ": u"Y", u"ϋ": u"ÿ", u"ΰ": u"ÿ", u"Ώ": u"Ó", + u"ώ": u"ó"})
- # Georgian - if char == u"ა": - return u"a" - if char == u"ბ": - return u"b" - if char == u"გ": - return u"g" - if char == u"დ": - return u"d" - if char in u"ეჱ": - return u"e" - if char == u"ვ": - return u"v" - if char == u"ზ": - return u"z" - if char == u"თ":# - return u"th" - if char == u"ი": - return u"i" - if char == u"კ":# - return u"k" - if char == u"ლ": - return u"l" - if char == u"მ": - return u"m" - if char == u"ნ": - return u"n" - if char == u"ო": - return u"o" - if char == u"პ":# - return u"p" - if char == u"ჟ":# - return u"zh" - if char == u"რ": - return u"r" - if char == u"ს": - return u"s" - if char == u"ტ":# - return u"t" - if char == u"უ": - return u"u" - if char == u"ფ":# - return u"ph" - if char == u"ქ":# - return u"q" - if char == u"ღ":# - return u"gh" - if char == u"ყ":# - return u"q'" - if char == u"შ": - return u"sh" - if char == u"ჩ": - return u"ch" - if char == u"ც": - return u"ts" - if char == u"ძ": - return u"dz" - if char == u"წ":# - return u"ts'" - if char == u"ჭ":# - return u"ch'" - if char == u"ხ": - return u"kh" - if char == u"ჯ":# - return u"j" - if char == u"ჰ": - return u"h" - if char == u"ჳ": - return u"w" - if char == u"ჵ": - return u"o" - if char == u"ჶ": - return u"f" + # Japanese (katakana and hiragana) + for char in u"アァあ": + self.trans[char] = u"a" + for char in u"イィい": + self.trans[char] = u"i" + for char in u"ウう": + self.trans[char] = u"u" + for char in u"エェえ": + self.trans[char] = u"e" + for char in u"オォお": + self.trans[char] = u"o" + for char in u"ャや": + self.trans[char] = u"ya" + for char in u"ュゆ": + self.trans[char] = u"yu" + for char in u"ョよ": + self.trans[char] = u"yo" + for char in u"カか": + self.trans[char] = u"ka" + for char in u"キき": + self.trans[char] = u"ki" + for char in u"クく": + self.trans[char] = u"ku" + for char in u"ケけ": + self.trans[char] = u"ke" + for char in u"コこ": + self.trans[char] = u"ko" + for char in u"サさ": + self.trans[char] = u"sa" + for char in u"シし": + self.trans[char] = u"shi" + for char in 
u"スす": + self.trans[char] = u"su" + for char in u"セせ": + self.trans[char] = u"se" + for char in u"ソそ": + self.trans[char] = u"so" + for char in u"タた": + self.trans[char] = u"ta" + for char in u"チち": + self.trans[char] = u"chi" + for char in u"ツつ": + self.trans[char] = u"tsu" + for char in u"テて": + self.trans[char] = u"te" + for char in u"トと": + self.trans[char] = u"to" + for char in u"ナな": + self.trans[char] = u"na" + for char in u"ニに": + self.trans[char] = u"ni" + for char in u"ヌぬ": + self.trans[char] = u"nu" + for char in u"ネね": + self.trans[char] = u"ne" + for char in u"ノの": + self.trans[char] = u"no" + for char in u"ハは": + self.trans[char] = u"ha" + for char in u"ヒひ": + self.trans[char] = u"hi" + for char in u"フふ": + self.trans[char] = u"fu" + for char in u"ヘへ": + self.trans[char] = u"he" + for char in u"ホほ": + self.trans[char] = u"ho" + for char in u"マま": + self.trans[char] = u"ma" + for char in u"ミみ": + self.trans[char] = u"mi" + for char in u"ムむ": + self.trans[char] = u"mu" + for char in u"メめ": + self.trans[char] = u"me" + for char in u"モも": + self.trans[char] = u"mo" + for char in u"ラら": + self.trans[char] = u"ra" + for char in u"リり": + self.trans[char] = u"ri" + for char in u"ルる": + self.trans[char] = u"ru" + for char in u"レれ": + self.trans[char] = u"re" + for char in u"ロろ": + self.trans[char] = u"ro" + for char in u"ワわ": + self.trans[char] = u"wa" + for char in u"ヰゐ": + self.trans[char] = u"wi" + for char in u"ヱゑ": + self.trans[char] = u"we" + for char in u"ヲを": + self.trans[char] = u"wo" + for char in u"ンん": + self.trans[char] = u"n" + for char in u"ガが": + self.trans[char] = u"ga" + for char in u"ギぎ": + self.trans[char] = u"gi" + for char in u"グぐ": + self.trans[char] = u"gu" + for char in u"ゲげ": + self.trans[char] = u"ge" + for char in u"ゴご": + self.trans[char] = u"go" + for char in u"ザざ": + self.trans[char] = u"za" + for char in u"ジじ": + self.trans[char] = u"ji" + for char in u"ズず": + self.trans[char] = u"zu" + for char in u"ゼぜ": + self.trans[char] = 
u"ze" + for char in u"ゾぞ": + self.trans[char] = u"zo" + for char in u"ダだ": + self.trans[char] = u"da" + for char in u"ヂぢ": + self.trans[char] = u"dji" + for char in u"ヅづ": + self.trans[char] = u"dzu" + for char in u"デで": + self.trans[char] = u"de" + for char in u"ドど": + self.trans[char] = u"do" + for char in u"バば": + self.trans[char] = u"ba" + for char in u"ビび": + self.trans[char] = u"bi" + for char in u"ブぶ": + self.trans[char] = u"bu" + for char in u"ベべ": + self.trans[char] = u"be" + for char in u"ボぼ": + self.trans[char] = u"bo" + for char in u"パぱ": + self.trans[char] = u"pa" + for char in u"ピぴ": + self.trans[char] = u"pi" + for char in u"プぷ": + self.trans[char] = u"pu" + for char in u"ペぺ": + self.trans[char] = u"pe" + for char in u"ポぽ": + self.trans[char] = u"po" + for char in u"ヴゔ": + self.trans[char] = u"vu" + self.trans[u"ヷ"] = u"va" + self.trans[u"ヸ"] = u"vi" + self.trans[u"ヹ"] = u"ve" + self.trans[u"ヺ"] = u"vo"
- # Devanagari - if char in u"पप": - return u"p" - if char in u"अ": - return u"a" - if char in u"आा": - return u"aa" - if char == u"प": - return u"pa" - if char in u"इि": - return u"i" - if char in u"ईी": - return u"ii" - if char in u"उु": - return u"u" - if char in u"ऊू": - return u"uu" - if char in u"एे": - return u"e" - if char in u"ऐै": - return u"ai" - if char in u"ओो": - return u"o" - if char in u"औौ": - return u"au" - if char in u"ऋृर": - return u"r" - if char in u"ॠॄ": - return u"rr" - if char in u"ऌॢल": - return u"l" - if char in u"ॡॣ": - return u"ll" - if char == u"क": - return u"k" - if char == u"ख": - return u"kh" - if char == u"ग": - return u"g" - if char == u"घ": - return u"gh" - if char == u"ङ": - return u"ng" - if char == u"च": - return u"c" - if char == u"छ": - return u"ch" - if char == u"ज": - return u"j" - if char == u"झ": - return u"jh" - if char == u"ञ": - return u"ñ" - if char in u"टत": - return u"t" - if char in u"ठथ": - return u"th" - if char in u"डद": - return u"d" - if char in u"ढध": - return u"dh" - if char in u"णन": - return u"n" - if char == u"फ": - return u"ph" - if char == u"ब": - return u"b" - if char == u"भ": - return u"bh" - if char == u"म": - return u"m" - if char == u"य": - return u"y" - if char == u"व": - return u"v" - if char == u"श": - return u"sh" - if char in u"षस": - return u"s" - if char == u"ह": - return u"h" - if char == u"क": - return u"x" - if char == u"त": - return u"tr" - if char == u"ज": - return u"gj" - if char == u"क़": - return u"q" - if char == u"फ": - return u"f" - if char == u"ख": - return u"hh" - if char == u"H": - return u"gh" - if char == u"ज": - return u"z" - if char in u"डढ": - return u"r" - # Devanagari ligatures (possibly incomplete and/or incorrect) - if char == u"ख्": - return u"khn" - if char == u"त": - return u"tn" - if char == u"द्": - return u"dn" - if char == u"श": - return u"cn" - if char == u"ह्": - return u"fn" - if char in u"अँ": - return u"m" - if char in u"॒॑": - return u"" - if char == 
u"०": - return u"0" - if char == u"१": - return u"1" - if char == u"२": - return u"2" - if char == u"३": - return u"3" - if char == u"४": - return u"4" - if char == u"५": - return u"5" - if char == u"६": - return u"6" - if char == u"७": - return u"7" - if char == u"८": - return u"8" - if char == u"९": - return u"9" + # Japanese and Chinese punctuation and typography + for char in u"・·": + self.trans[char] = u" " + for char in u"〃『』《》": + self.trans[char] = u'"' + for char in u"「」〈〉〘〙〚〛": + self.trans[char] = u"'" + for char in u"(〔": + self.trans[char] = u"(" + for char in u")〕": + self.trans[char] = u")" + for char in u"[【〖": + self.trans[char] = u"[" + for char in u"]】〗": + self.trans[char] = u"]" + for char in u"{": + self.trans[char] = u"{" + for char in u"}": + self.trans[char] = u"}" + for char in u"っ": + self.trans[char] = u":" + for char in u"ー": + self.trans[char] = u"h" + for char in u"゛": + self.trans[char] = u"'" + for char in u"゜": + self.trans[char] = u"p" + for char in u"。": + self.trans[char] = u". " + for char in u"、": + self.trans[char] = u", " + for char in u"・": + self.trans[char] = u" " + for char in u"〆": + self.trans[char] = u"shime" + for char in u"〜": + self.trans[char] = u"-" + for char in u"…": + self.trans[char] = u"..." + for char in u"‥": + self.trans[char] = u".." + for char in u"ヶ": + self.trans[char] = u"months" + for char in u"•◦": + self.trans[char] = u"_" + for char in u"※*": + self.trans[char] = u"*" + for char in u"Ⓧ": + self.trans[char] = u"(X)" + for char in u"Ⓨ": + self.trans[char] = u"(Y)" + for char in u"!": + self.trans[char] = u"!" + for char in u"?": + self.trans[char] = u"?" + for char in u";": + self.trans[char] = u";" + for char in u":": + self.trans[char] = u":" + for char in u"。": + self.trans[char] = u"." + for char in u",、": + self.trans[char] = u","
- # Armenian - if char == u"Ա": - return u"A" - if char == u"ա": - return u"a" - if char == u"Բ": - return u"B" - if char == u"բ": - return u"b" - if char == u"Գ": - return u"G" - if char == u"գ": - return u"g" - if char == u"Դ": - return u"D" - if char == u"դ": - return u"d" - if char == u"Ե": - return u"Je" - if char == u"ե": - return u"e" - if char == u"Զ": - return u"Z" - if char == u"զ": - return u"z" - if char == u"Է": - return u"É" - if char == u"է": - return u"é" - if char == u"Ը": - return u"Ë" - if char == u"ը": - return u"ë" - if char == u"Թ": - return u"Th" - if char == u"թ": - return u"th" - if char == u"Ժ": - return u"Zh" - if char == u"ժ": - return u"zh" - if char == u"Ի": - return u"I" - if char == u"ի": - return u"i" - if char == u"Լ": - return u"L" - if char == u"լ": - return u"l" - if char == u"Խ": - return u"Ch" - if char == u"խ": - return u"ch" - if char == u"Ծ": - return u"Ts" - if char == u"ծ": - return u"ts" - if char == u"Կ": - return u"K" - if char == u"կ": - return u"k" - if char == u"Հ": - return u"H" - if char == u"հ": - return u"h" - if char == u"Ձ": - return u"Dz" - if char == u"ձ": - return u"dz" - if char == u"Ղ": - return u"R" - if char == u"ղ": - return u"r" - if char == u"Ճ": - return u"Cz" - if char == u"ճ": - return u"cz" - if char == u"Մ": - return u"M" - if char == u"մ": - return u"m" - if char == u"Յ": - return u"J" - if char == u"յ": - return u"j" - if char == u"Ն": - return u"N" - if char == u"ն": - return u"n" - if char == u"Շ": - return u"S" - if char == u"շ": - return u"s" - if char == u"Շ": - return u"Vo" - if char == u"շ": - return u"o" - if char == u"Չ": - return u"Tsh" - if char == u"չ": - return u"tsh" - if char == u"Պ": - return u"P" - if char == u"պ": - return u"p" - if char == u"Ջ": - return u"Dz" - if char == u"ջ": - return u"dz" - if char == u"Ռ": - return u"R" - if char == u"ռ": - return u"r" - if char == u"Ս": - return u"S" - if char == u"ս": - return u"s" - if char == u"Վ": - return u"V" - if char == u"վ": 
- return u"v" - if char == u"Տ": - return u"T'" - if char == u"տ": - return u"t'" - if char == u"Ր": - return u"R" - if char == u"ր": - return u"r" - if char == u"Ց": - return u"Tsh" - if char == u"ց": - return u"tsh" - if char == u"Ւ": - return u"V" - if char == u"ւ": - return u"v" - if char == u"Փ": - return u"Ph" - if char == u"փ": - return u"ph" - if char == u"Ք": - return u"Kh" - if char == u"ք": - return u"kh" - if char == u"Օ": - return u"O" - if char == u"օ": - return u"o" - if char == u"Ֆ": - return u"F" - if char == u"ֆ": - return u"f" - if char == u"և": - return u"&" - if char == u"՟": - return u"." - if char == u"՞": - return u"?" - if char == u"՝": - return u";" - if char == u"՛": - return u"" + # Georgian + for char in u"ა": + self.trans[char] = u"a" + for char in u"ბ": + self.trans[char] = u"b" + for char in u"გ": + self.trans[char] = u"g" + for char in u"დ": + self.trans[char] = u"d" + for char in u"ეჱ": + self.trans[char] = u"e" + for char in u"ვ": + self.trans[char] = u"v" + for char in u"ზ": + self.trans[char] = u"z" + for char in u"თ":# + self.trans[char] = u"th" + for char in u"ი": + self.trans[char] = u"i" + for char in u"კ":# + self.trans[char] = u"k" + for char in u"ლ": + self.trans[char] = u"l" + for char in u"მ": + self.trans[char] = u"m" + for char in u"ნ": + self.trans[char] = u"n" + for char in u"ო": + self.trans[char] = u"o" + for char in u"პ":# + self.trans[char] = u"p" + for char in u"ჟ":# + self.trans[char] = u"zh" + for char in u"რ": + self.trans[char] = u"r" + for char in u"ს": + self.trans[char] = u"s" + for char in u"ტ":# + self.trans[char] = u"t" + for char in u"უ": + self.trans[char] = u"u" + for char in u"ფ":# + self.trans[char] = u"ph" + for char in u"ქ":# + self.trans[char] = u"q" + for char in u"ღ":# + self.trans[char] = u"gh" + for char in u"ყ":# + self.trans[char] = u"q'" + for char in u"შ": + self.trans[char] = u"sh" + for char in u"ჩ": + self.trans[char] = u"ch" + for char in u"ც": + self.trans[char] = u"ts" + for char 
in u"ძ": + self.trans[char] = u"dz" + for char in u"წ":# + self.trans[char] = u"ts'" + for char in u"ჭ":# + self.trans[char] = u"ch'" + for char in u"ხ": + self.trans[char] = u"kh" + for char in u"ჯ":# + self.trans[char] = u"j" + for char in u"ჰ": + self.trans[char] = u"h" + for char in u"ჳ": + self.trans[char] = u"w" + for char in u"ჵ": + self.trans[char] = u"o" + for char in u"ჶ": + self.trans[char] = u"f"
- # Tamil - if char == u"க்": - return u"k" - if char in u"ஙண்ந்ன்": - return u"n" - if char == u"ச": - return u"c" - if char == u"ஞ்": - return u"ñ" - if char == u"ட்": - return u"th" - if char == u"த": - return u"t" - if char == u"ப": - return u"p" - if char == u"ம்": - return u"m" - if char == u"ய்": - return u"y" - if char in u"ர்ழ்ற": - return u"r" - if char in u"ல்ள": - return u"l" - if char == u"வ்": - return u"v" - if char == u"ஜ": - return u"j" - if char == u"ஷ": - return u"sh" - if char == u"ஸ": - return u"s" - if char == u"ஹ": - return u"h" - if char == u"க்ஷ": - return u"x" - if char == u"அ": - return u"a" - if char == u"ஆ": - return u"aa" - if char == u"இ": - return u"i" - if char == u"ஈ": - return u"ii" - if char == u"உ": - return u"u" - if char == u"ஊ": - return u"uu" - if char == u"எ": - return u"e" - if char == u"ஏ": - return u"ee" - if char == u"ஐ": - return u"ai" - if char == u"ஒ": - return u"o" - if char == u"ஓ": - return u"oo" - if char == u"ஔ": - return u"au" - if char == u"ஃ": - return "" + # Devanagari + for char in u"पप": + self.trans[char] = u"p" + for char in u"अ": + self.trans[char] = u"a" + for char in u"आा": + self.trans[char] = u"aa" + for char in u"प": + self.trans[char] = u"pa" + for char in u"इि": + self.trans[char] = u"i" + for char in u"ईी": + self.trans[char] = u"ii" + for char in u"उु": + self.trans[char] = u"u" + for char in u"ऊू": + self.trans[char] = u"uu" + for char in u"एे": + self.trans[char] = u"e" + for char in u"ऐै": + self.trans[char] = u"ai" + for char in u"ओो": + self.trans[char] = u"o" + for char in u"औौ": + self.trans[char] = u"au" + for char in u"ऋृर": + self.trans[char] = u"r" + for char in u"ॠॄ": + self.trans[char] = u"rr" + for char in u"ऌॢल": + self.trans[char] = u"l" + for char in u"ॡॣ": + self.trans[char] = u"ll" + for char in u"क": + self.trans[char] = u"k" + for char in u"ख": + self.trans[char] = u"kh" + for char in u"ग": + self.trans[char] = u"g" + for char in u"घ": + self.trans[char] = u"gh" + for char 
in u"ङ": + self.trans[char] = u"ng" + for char in u"च": + self.trans[char] = u"c" + for char in u"छ": + self.trans[char] = u"ch" + for char in u"ज": + self.trans[char] = u"j" + for char in u"झ": + self.trans[char] = u"jh" + for char in u"ञ": + self.trans[char] = u"ñ" + for char in u"टत": + self.trans[char] = u"t" + for char in u"ठथ": + self.trans[char] = u"th" + for char in u"डद": + self.trans[char] = u"d" + for char in u"ढध": + self.trans[char] = u"dh" + for char in u"णन": + self.trans[char] = u"n" + for char in u"फ": + self.trans[char] = u"ph" + for char in u"ब": + self.trans[char] = u"b" + for char in u"भ": + self.trans[char] = u"bh" + for char in u"म": + self.trans[char] = u"m" + for char in u"य": + self.trans[char] = u"y" + for char in u"व": + self.trans[char] = u"v" + for char in u"श": + self.trans[char] = u"sh" + for char in u"षस": + self.trans[char] = u"s" + for char in u"ह": + self.trans[char] = u"h" + for char in u"क": + self.trans[char] = u"x" + for char in u"त": + self.trans[char] = u"tr" + for char in u"ज": + self.trans[char] = u"gj" + for char in u"क़": + self.trans[char] = u"q" + for char in u"फ": + self.trans[char] = u"f" + for char in u"ख": + self.trans[char] = u"hh" + for char in u"H": + self.trans[char] = u"gh" + for char in u"ज": + self.trans[char] = u"z" + for char in u"डढ": + self.trans[char] = u"r" + # Devanagari ligatures (possibly incomplete and/or incorrect) + for char in u"ख्": + self.trans[char] = u"khn" + for char in u"त": + self.trans[char] = u"tn" + for char in u"द्": + self.trans[char] = u"dn" + for char in u"श": + self.trans[char] = u"cn" + for char in u"ह्": + self.trans[char] = u"fn" + for char in u"अँ": + self.trans[char] = u"m" + for char in u"॒॑": + self.trans[char] = u"" + for char in u"०": + self.trans[char] = u"0" + for char in u"१": + self.trans[char] = u"1" + for char in u"२": + self.trans[char] = u"2" + for char in u"३": + self.trans[char] = u"3" + for char in u"४": + self.trans[char] = u"4" + for char in u"५": + 
self.trans[char] = u"5" + for char in u"६": + self.trans[char] = u"6" + for char in u"७": + self.trans[char] = u"7" + for char in u"८": + self.trans[char] = u"8" + for char in u"९": + self.trans[char] = u"9"
- # Bengali - if char == u"অ": - return u"ô" - if char in u"আা": - return u"a" - if char in u"ইিঈী": - return u"i" - if char in u"উুঊূ": - return u"u" - if char in u"ঋৃ": - return u"ri" - if char in u"এেয়": - return u"e" - if char in u"ঐৈ": - return u"oi" - if char in u"ওো": - return u"o" - if char in u"ঔৌ": - return "ou" - if char == u"্": - return u"" - if char == u"ৎ": - return u"t" - if char == u"ং": - return u"n" - if char == u"ঃ": - return u"h" - if char == u"ঁ": - return u"ñ" - if char == u"ক": - return u"k" - if char == u"খ": - return u"kh" - if char == u"গ": - return u"g" - if char == u"ঘ": - return u"gh" - if char == u"ঙ": - return u"ng" - if char == u"চ": - return u"ch" - if char == u"ছ": - return u"chh" - if char in u"জ": - return u"j" - if char == u"ঝ": - return u"jh" - if char == u"ঞ": - return u"n" - if char in u"টত": - return u"t" - if char in u"ঠথ": - return u"th" - if char in u"ডদ": - return u"d" - if char in u"ঢধ": - return u"dh" - if char in u"ণন": - return u"n" - if char == u"প": - return u"p" - if char == u"ফ": - return u"ph" - if char == u"ব": - return u"b" - if char == u"ভ": - return u"bh" - if char == u"ম": - return u"m" - if char == u"য": - return u"dzh" - if char == u"র": - return u"r" - if char == u"ল": - return u"l" - if char == u"শ": - return u"s" - if char == u"হ": - return u"h" - if char == u"য়": - return u"-" - if char == u"ড়": - return u"r" - if char == u"ঢ": - return u"rh" - if char == u"০": - return u"0" - if char == u"১": - return u"1" - if char == u"২": - return u"2" - if char == u"৩": - return u"3" - if char == u"৪": - return u"4" - if char == u"৫": - return u"5" - if char == u"৬": - return u"6" - if char == u"৭": - return u"7" - if char == u"৮": - return u"8" - if char == u"৯": - return u"9" + # Armenian + for char in u"Ա": + self.trans[char] = u"A" + for char in u"ա": + self.trans[char] = u"a" + for char in u"Բ": + self.trans[char] = u"B" + for char in u"բ": + self.trans[char] = u"b" + for char in u"Գ": + self.trans[char] 
= u"G" + for char in u"գ": + self.trans[char] = u"g" + for char in u"Դ": + self.trans[char] = u"D" + for char in u"դ": + self.trans[char] = u"d" + for char in u"Ե": + self.trans[char] = u"Je" + for char in u"ե": + self.trans[char] = u"e" + for char in u"Զ": + self.trans[char] = u"Z" + for char in u"զ": + self.trans[char] = u"z" + for char in u"Է": + self.trans[char] = u"É" + for char in u"է": + self.trans[char] = u"é" + for char in u"Ը": + self.trans[char] = u"Ë" + for char in u"ը": + self.trans[char] = u"ë" + for char in u"Թ": + self.trans[char] = u"Th" + for char in u"թ": + self.trans[char] = u"th" + for char in u"Ժ": + self.trans[char] = u"Zh" + for char in u"ժ": + self.trans[char] = u"zh" + for char in u"Ի": + self.trans[char] = u"I" + for char in u"ի": + self.trans[char] = u"i" + for char in u"Լ": + self.trans[char] = u"L" + for char in u"լ": + self.trans[char] = u"l" + for char in u"Խ": + self.trans[char] = u"Ch" + for char in u"խ": + self.trans[char] = u"ch" + for char in u"Ծ": + self.trans[char] = u"Ts" + for char in u"ծ": + self.trans[char] = u"ts" + for char in u"Կ": + self.trans[char] = u"K" + for char in u"կ": + self.trans[char] = u"k" + for char in u"Հ": + self.trans[char] = u"H" + for char in u"հ": + self.trans[char] = u"h" + for char in u"Ձ": + self.trans[char] = u"Dz" + for char in u"ձ": + self.trans[char] = u"dz" + for char in u"Ղ": + self.trans[char] = u"R" + for char in u"ղ": + self.trans[char] = u"r" + for char in u"Ճ": + self.trans[char] = u"Cz" + for char in u"ճ": + self.trans[char] = u"cz" + for char in u"Մ": + self.trans[char] = u"M" + for char in u"մ": + self.trans[char] = u"m" + for char in u"Յ": + self.trans[char] = u"J" + for char in u"յ": + self.trans[char] = u"j" + for char in u"Ն": + self.trans[char] = u"N" + for char in u"ն": + self.trans[char] = u"n" + for char in u"Շ": + self.trans[char] = u"S" + for char in u"շ": + self.trans[char] = u"s" + for char in u"Շ": + self.trans[char] = u"Vo" + for char in u"շ": + self.trans[char] = u"o" 
+ for char in u"Չ": + self.trans[char] = u"Tsh" + for char in u"չ": + self.trans[char] = u"tsh" + for char in u"Պ": + self.trans[char] = u"P" + for char in u"պ": + self.trans[char] = u"p" + for char in u"Ջ": + self.trans[char] = u"Dz" + for char in u"ջ": + self.trans[char] = u"dz" + for char in u"Ռ": + self.trans[char] = u"R" + for char in u"ռ": + self.trans[char] = u"r" + for char in u"Ս": + self.trans[char] = u"S" + for char in u"ս": + self.trans[char] = u"s" + for char in u"Վ": + self.trans[char] = u"V" + for char in u"վ": + self.trans[char] = u"v" + for char in u"Տ": + self.trans[char] = u"T'" + for char in u"տ": + self.trans[char] = u"t'" + for char in u"Ր": + self.trans[char] = u"R" + for char in u"ր": + self.trans[char] = u"r" + for char in u"Ց": + self.trans[char] = u"Tsh" + for char in u"ց": + self.trans[char] = u"tsh" + for char in u"Ւ": + self.trans[char] = u"V" + for char in u"ւ": + self.trans[char] = u"v" + for char in u"Փ": + self.trans[char] = u"Ph" + for char in u"փ": + self.trans[char] = u"ph" + for char in u"Ք": + self.trans[char] = u"Kh" + for char in u"ք": + self.trans[char] = u"kh" + for char in u"Օ": + self.trans[char] = u"O" + for char in u"օ": + self.trans[char] = u"o" + for char in u"Ֆ": + self.trans[char] = u"F" + for char in u"ֆ": + self.trans[char] = u"f" + for char in u"և": + self.trans[char] = u"&" + for char in u"՟": + self.trans[char] = u"." + for char in u"՞": + self.trans[char] = u"?" + for char in u"՝": + self.trans[char] = u";" + for char in u"՛": + self.trans[char] = u""
- # Thai (because of complications of the alphabet, transliterations - # are very imprecise here) - if char == u"ก": - return u"k" - if char in u"ขฃคฅฆ": - return u"kh" - if char == u"ง": - return u"ng" - if char in u"จฉชฌ": - return u"ch" - if char in u"ซศษส": - return u"s" - if char in u"ญย": - return u"y" - if char in u"ฎด": - return u"d" - if char in u"ฏต": - return u"t" - if char in u"ฐฑฒถทธ": - return u"th" - if char in u"ณน": - return u"n" - if char == u"บ": - return u"b" - if char == u"ป": - return u"p" - if char in u"ผพภ": - return u"ph" - if char in u"ฝฟ": - return u"f" - if char in u"ม": - return u"m" - if char == u"ร": - return u"r" - if char == u"ฤ": - return u"rue" - if char in u"ๅ": - return u":" - if char in u"ลฬ": - return u"l" - if char == u"ฦ": - return u"lue" - if char == u"ว": - return u"w" - if char in u"หฮ": - return u"h" - if char == u"อ": - return u"" - if char == u"ร": - return u"ü" - if char == u"ว": - return u"ua" - if char in u"อว–โิ": - return u"o" - if char in u"ะัา": - return u"a" - if char in u"ว": - return u"u" - if char == u"ำ": - return u"am" - if char == u"ิ": - return u"i" - if char == u"ี": - return u"i:" - if char == u"ึ": - return u"ue" - if char == u"ื": - return u"ue:" - if char == u"ุ": - return u"u" - if char == u"ู": - return u"u:" - if char in u"เ็": - return u"e" - if char == u"แ": - return u"ae" - if char in u"ใไ": - return u"ai" - if char in u"่้๊๋็์": - return u"" - if char in u"ฯ": - return u"." 
- if char in u"ๆ": - return u"(2)" + # Tamil + for char in u"க்": + self.trans[char] = u"k" + for char in u"ஙண்ந்ன்": + self.trans[char] = u"n" + for char in u"ச": + self.trans[char] = u"c" + for char in u"ஞ்": + self.trans[char] = u"ñ" + for char in u"ட்": + self.trans[char] = u"th" + for char in u"த": + self.trans[char] = u"t" + for char in u"ப": + self.trans[char] = u"p" + for char in u"ம்": + self.trans[char] = u"m" + for char in u"ய்": + self.trans[char] = u"y" + for char in u"ர்ழ்ற": + self.trans[char] = u"r" + for char in u"ல்ள": + self.trans[char] = u"l" + for char in u"வ்": + self.trans[char] = u"v" + for char in u"ஜ": + self.trans[char] = u"j" + for char in u"ஷ": + self.trans[char] = u"sh" + for char in u"ஸ": + self.trans[char] = u"s" + for char in u"ஹ": + self.trans[char] = u"h" + for char in u"க்ஷ": + self.trans[char] = u"x" + for char in u"அ": + self.trans[char] = u"a" + for char in u"ஆ": + self.trans[char] = u"aa" + for char in u"இ": + self.trans[char] = u"i" + for char in u"ஈ": + self.trans[char] = u"ii" + for char in u"உ": + self.trans[char] = u"u" + for char in u"ஊ": + self.trans[char] = u"uu" + for char in u"எ": + self.trans[char] = u"e" + for char in u"ஏ": + self.trans[char] = u"ee" + for char in u"ஐ": + self.trans[char] = u"ai" + for char in u"ஒ": + self.trans[char] = u"o" + for char in u"ஓ": + self.trans[char] = u"oo" + for char in u"ஔ": + self.trans[char] = u"au" + for char in u"ஃ": + self.trans[char] = ""
- return default + # Bengali + for char in u"অ": + self.trans[char] = u"ô" + for char in u"আা": + self.trans[char] = u"a" + for char in u"ইিঈী": + self.trans[char] = u"i" + for char in u"উুঊূ": + self.trans[char] = u"u" + for char in u"ঋৃ": + self.trans[char] = u"ri" + for char in u"এেয়": + self.trans[char] = u"e" + for char in u"ঐৈ": + self.trans[char] = u"oi" + for char in u"ওো": + self.trans[char] = u"o" + for char in u"ঔৌ": + self.trans[char] = "ou" + for char in u"্": + self.trans[char] = u"" + for char in u"ৎ": + self.trans[char] = u"t" + for char in u"ং": + self.trans[char] = u"n" + for char in u"ঃ": + self.trans[char] = u"h" + for char in u"ঁ": + self.trans[char] = u"ñ" + for char in u"ক": + self.trans[char] = u"k" + for char in u"খ": + self.trans[char] = u"kh" + for char in u"গ": + self.trans[char] = u"g" + for char in u"ঘ": + self.trans[char] = u"gh" + for char in u"ঙ": + self.trans[char] = u"ng" + for char in u"চ": + self.trans[char] = u"ch" + for char in u"ছ": + self.trans[char] = u"chh" + for char in u"জ": + self.trans[char] = u"j" + for char in u"ঝ": + self.trans[char] = u"jh" + for char in u"ঞ": + self.trans[char] = u"n" + for char in u"টত": + self.trans[char] = u"t" + for char in u"ঠথ": + self.trans[char] = u"th" + for char in u"ডদ": + self.trans[char] = u"d" + for char in u"ঢধ": + self.trans[char] = u"dh" + for char in u"ণন": + self.trans[char] = u"n" + for char in u"প": + self.trans[char] = u"p" + for char in u"ফ": + self.trans[char] = u"ph" + for char in u"ব": + self.trans[char] = u"b" + for char in u"ভ": + self.trans[char] = u"bh" + for char in u"ম": + self.trans[char] = u"m" + for char in u"য": + self.trans[char] = u"dzh" + for char in u"র": + self.trans[char] = u"r" + for char in u"ল": + self.trans[char] = u"l" + for char in u"শ": + self.trans[char] = u"s" + for char in u"হ": + self.trans[char] = u"h" + for char in u"য়": + self.trans[char] = u"-" + for char in u"ড়": + self.trans[char] = u"r" + for char in u"ঢ": + self.trans[char] = u"rh" + 
for char in u"০": + self.trans[char] = u"0" + for char in u"১": + self.trans[char] = u"1" + for char in u"২": + self.trans[char] = u"2" + for char in u"৩": + self.trans[char] = u"3" + for char in u"৪": + self.trans[char] = u"4" + for char in u"৫": + self.trans[char] = u"5" + for char in u"৬": + self.trans[char] = u"6" + for char in u"৭": + self.trans[char] = u"7" + for char in u"৮": + self.trans[char] = u"8" + for char in u"৯": + self.trans[char] = u"9" + + # Thai (because of complications of the alphabet, self.transliterations + # are very imprecise here) + for char in u"ก": + self.trans[char] = u"k" + for char in u"ขฃคฅฆ": + self.trans[char] = u"kh" + for char in u"ง": + self.trans[char] = u"ng" + for char in u"จฉชฌ": + self.trans[char] = u"ch" + for char in u"ซศษส": + self.trans[char] = u"s" + for char in u"ญย": + self.trans[char] = u"y" + for char in u"ฎด": + self.trans[char] = u"d" + for char in u"ฏต": + self.trans[char] = u"t" + for char in u"ฐฑฒถทธ": + self.trans[char] = u"th" + for char in u"ณน": + self.trans[char] = u"n" + for char in u"บ": + self.trans[char] = u"b" + for char in u"ป": + self.trans[char] = u"p" + for char in u"ผพภ": + self.trans[char] = u"ph" + for char in u"ฝฟ": + self.trans[char] = u"f" + for char in u"ม": + self.trans[char] = u"m" + for char in u"ร": + self.trans[char] = u"r" + for char in u"ฤ": + self.trans[char] = u"rue" + for char in u"ๅ": + self.trans[char] = u":" + for char in u"ลฬ": + self.trans[char] = u"l" + for char in u"ฦ": + self.trans[char] = u"lue" + for char in u"ว": + self.trans[char] = u"w" + for char in u"หฮ": + self.trans[char] = u"h" + for char in u"อ": + self.trans[char] = u"" + for char in u"ร": + self.trans[char] = u"ü" + for char in u"ว": + self.trans[char] = u"ua" + for char in u"อวโิ": + self.trans[char] = u"o" + for char in u"ะัา": + self.trans[char] = u"a" + for char in u"ว": + self.trans[char] = u"u" + for char in u"ำ": + self.trans[char] = u"am" + for char in u"ิ": + self.trans[char] = u"i" + for char in 
u"ี": + self.trans[char] = u"i:" + for char in u"ึ": + self.trans[char] = u"ue" + for char in u"ื": + self.trans[char] = u"ue:" + for char in u"ุ": + self.trans[char] = u"u" + for char in u"ู": + self.trans[char] = u"u:" + for char in u"เ็": + self.trans[char] = u"e" + for char in u"แ": + self.trans[char] = u"ae" + for char in u"ใไ": + self.trans[char] = u"ai" + for char in u"่้๊๋็์": + self.trans[char] = u"" + for char in u"ฯ": + self.trans[char] = u"." + for char in u"ๆ": + self.trans[char] = u"(2)" + + # Korean (Revised Romanization system within possible, incomplete) + for char in u"국": + self.trans[char] = u"guk" + for char in u"명": + self.trans[char] = u"myeong" + for char in u"검": + self.trans[char] = u"geom" + for char in u"타": + self.trans[char] = u"ta" + for char in u"분": + self.trans[char] = u"bun" + for char in u"사": + self.trans[char] = u"sa" + for char in u"류": + self.trans[char] = u"ryu" + for char in u"포": + self.trans[char] = u"po" + for char in u"르": + self.trans[char] = u"reu" + for char in u"투": + self.trans[char] = u"tu" + for char in u"갈": + self.trans[char] = u"gal" + for char in u"어": + self.trans[char] = u"eo" + for char in u"노": + self.trans[char] = u"no" + for char in u"웨": + self.trans[char] = u"we" + for char in u"이": + self.trans[char] = u"i" + for char in u"라": + self.trans[char] = u"ra" + for char in u"틴": + self.trans[char] = u"tin" + for char in u"루": + self.trans[char] = u"ru" + for char in u"마": + self.trans[char] = u"ma" + for char in u"니": + self.trans[char] = u"ni" + for char in u"아": + self.trans[char] = u"a" + for char in u"독": + self.trans[char] = u"dok" + for char in u"일": + self.trans[char] = u"il" + for char in u"모": + self.trans[char] = u"mo" + for char in u"크": + self.trans[char] = u"keu" + for char in u"샤": + self.trans[char] = u"sya" + for char in u"영": + self.trans[char] = u"yeong" + for char in u"불": + self.trans[char] = u"bul" + for char in u"가": + self.trans[char] = u"ga" + for char in u"리": + self.trans[char] = 
u"ri" + for char in u"그": + self.trans[char] = u"geu" + for char in u"지": + self.trans[char] = u"ji" + for char in u"야": + self.trans[char] = u"ya" + for char in u"바": + self.trans[char] = u"ba" + for char in u"슈": + self.trans[char] = u"syu" + for char in u"키": + self.trans[char] = u"ki" + for char in u"프": + self.trans[char] = u"peu" + for char in u"랑": + self.trans[char] = u"rang" + for char in u"스": + self.trans[char] = u"seu" + for char in u"로": + self.trans[char] = u"ro" + for char in u"메": + self.trans[char] = u"me" + for char in u"역": + self.trans[char] = u"yeok" + for char in u"도": + self.trans[char] = u"do" + + # Kannada + self.trans[u"ಅ"] = u"a" + for char in u"ಆಾ": + self.trans[char] = u"aa" + for char in u"ಇಿ": + self.trans[char] = u"i" + for char in u"ಈೀ": + self.trans[char] = u"ii" + for char in u"ಉು": + self.trans[char] = u"u" + for char in u"ಊೂ": + self.trans[char] = u"uu" + for char in u"ಋೂ": + self.trans[char] = u"r'" + for char in u"ಎೆ": + self.trans[char] = u"e" + for char in u"ಏೇ": + self.trans[char] = u"ee" + for char in u"ಐೈ": + self.trans[char] = u"ai" + for char in u"ಒೊ": + self.trans[char] = u"o" + for char in u"ಓೋ": + self.trans[char] = u"oo" + for char in u"ಔೌ": + self.trans[char] = u"au" + self.trans[u"ಂ"] = u"m'" + self.trans[u"ಃ"] = u"h'" + self.trans[u"ಕ"] = u"k" + self.trans[u"ಖ"] = u"kh" + self.trans[u"ಗ"] = u"g" + self.trans[u"ಘ"] = u"gh" + self.trans[u"ಙ"] = u"ng" + self.trans[u"ಚ"] = u"c" + self.trans[u"ಛ"] = u"ch" + self.trans[u"ಜ"] = u"j" + self.trans[u"ಝ"] = u"ny" + self.trans[u"ಟ"] = u"tt" + self.trans[u"ಠ"] = u"tth" + self.trans[u"ಡ"] = u"dd" + self.trans[u"ಢ"] = u"ddh" + self.trans[u"ಣ"] = u"nn" + self.trans[u"ತ"] = u"t" + self.trans[u"ಥ"] = u"th" + self.trans[u"ದ"] = u"d" + self.trans[u"ಧ"] = u"dh" + self.trans[u"ನ"] = u"n" + self.trans[u"ಪ"] = u"p" + self.trans[u"ಫ"] = u"ph" + self.trans[u"ಬ"] = u"b" + self.trans[u"ಭ"] = u"bh" + self.trans[u"ಮ"] = u"m" + self.trans[u"ಯ"] = u"y" + self.trans[u"ರ"] = u"r" + 
self.trans[u"ಲ"] = u"l" + self.trans[u"ವ"] = u"v" + self.trans[u"ಶ"] = u"sh" + self.trans[u"ಷ"] = u"ss" + self.trans[u"ಸ"] = u"s" + self.trans[u"ಹ"] = u"h" + self.trans[u"ಳ"] = u"ll" + self.trans[u"೦"] = u"0" + self.trans[u"೧"] = u"1" + self.trans[u"೨"] = u"2" + self.trans[u"೩"] = u"3" + self.trans[u"೪"] = u"4" + self.trans[u"೫"] = u"5" + self.trans[u"೬"] = u"6" + self.trans[u"೭"] = u"7" + self.trans[u"೮"] = u"8" + self.trans[u"೯"] = u"9" + # Telugu + for char in u"అ": + self.trans[char] = u"a" + for char in u"ఆా": + self.trans[char] = u"aa" + for char in u"ఇి": + self.trans[char] = u"i" + for char in u"ఈీ": + self.trans[char] = u"ii" + for char in u"ఉు": + self.trans[char] = u"u" + for char in u"ఊూ": + self.trans[char] = u"uu" + for char in u"ఋృ": + self.trans[char] = u"r'" + for char in u"ౠౄ": + self.trans[char] = u'r"' + self.trans[u"ఌ"] = u"l'" + self.trans[u"ౡ"] = u'l"' + for char in u"ఎె": + self.trans[char] = u"e" + for char in u"ఏే": + self.trans[char] = u"ee" + for char in u"ఐై": + self.trans[char] = u"ai" + for char in u"ఒొ": + self.trans[char] = u"o" + for char in u"ఓో": + self.trans[char] = u"oo" + for char in u"ఔౌ": + self.trans[char] = u"au" + self.trans[u"ం"] = u"'" + self.trans[u"ః"] = u'"' + self.trans[u"క"] = u"k" + self.trans[u"ఖ"] = u"kh" + self.trans[u"గ"] = u"g" + self.trans[u"ఘ"] = u"gh" + self.trans[u"ఙ"] = u"ng" + self.trans[u"చ"] = u"ts" + self.trans[u"ఛ"] = u"tsh" + self.trans[u"జ"] = u"j" + self.trans[u"ఝ"] = u"jh" + self.trans[u"ఞ"] = u"ñ" + for char in u"టత": + self.trans[char] = u"t" + for char in u"ఠథ": + self.trans[char] = u"th" + for char in u"డద": + self.trans[char] = u"d" + for char in u"ఢధ": + self.trans[char] = u"dh" + for char in u"ణన": + self.trans[char] = u"n" + self.trans[u"ప"] = u"p" + self.trans[u"ఫ"] = u"ph" + self.trans[u"బ"] = u"b" + self.trans[u"భ"] = u"bh" + self.trans[u"మ"] = u"m" + self.trans[u"య"] = u"y" + for char in u"రఱ": + self.trans[char] = u"r" + for char in u"లళ": + self.trans[char] = u"l" + 
self.trans[u"వ"] = u"v" + self.trans[u"శ"] = u"sh" + for char in u"షస": + self.trans[char] = u"s" + self.trans[u"హ"] = u"h" + self.trans[u"్"] = "" + for char in u"ంఁ": + self.trans[char] = u"^" + self.trans[u"ః"] = u"-" + self.trans[u"౦"] = u"0" + self.trans[u"౧"] = u"1" + self.trans[u"౨"] = u"2" + self.trans[u"౩"] = u"3" + self.trans[u"౪"] = u"4" + self.trans[u"౫"] = u"5" + self.trans[u"౬"] = u"6" + self.trans[u"౭"] = u"7" + self.trans[u"౮"] = u"8" + self.trans[u"౯"] = u"9" + self.trans[u"౹"] = u"1/4" + self.trans[u"౺"] = u"1/2" + self.trans[u"౻"] = u"3/4" + self.trans[u"౼"] = u"1/16" + self.trans[u"౽"] = u"1/8" + self.trans[u"౾"] = u"3/16" + # Lao - note: pronounciation in initial position is used; + # different pronounciation in final position is ignored + self.trans[u"ກ"] = "k" + for char in u"ຂຄ": + self.trans[char] = "kh" + self.trans[u"ງ"] = "ng" + self.trans[u"ຈ"] = "ch" + for char in u"ສຊ": + self.trans[char] = "s" + self.trans[u"ຍ"] = "ny" + self.trans[u"ດ"] = "d" + self.trans[u"ຕ"] = "t" + for char in u"ຖທ": + self.trans[char] = "th" + self.trans[u"ນ"] = "n" + self.trans[u"ບ"] = "b" + self.trans[u"ປ"] = "p" + for char in u"ຜພ": + self.trans[char] = "ph" + for char in u"ຝຟ": + self.trans[char] = "f" + for char in u"ມໝ": + self.trans[char] = "m" + self.trans[u"ຢ"] = "y" + for char in u"ຣຼ": + self.trans[char] = "r" + for char in u"ລຼ": + self.trans[char] = "l" + self.trans[u"ວ"] = "v" + for char in u"ຮ": + self.trans[char] = "h" + self.trans[u"ອ"] = "'" + for char in u"ະັ": + self.trans[char] = "a" + self.trans[u"ິ"] = "i" + self.trans[u"ຶ"] = "ue" + self.trans[u"ຸ"] = "u" + self.trans[u"ເ"] = u"é" + self.trans[u"ແ"] = u"è" + for char in u"ໂົາໍ": + self.trans[char] = "o" + self.trans[u"ຽ"] = "ia" + self.trans[u"ເຶ"] = "uea" + self.trans[u"ຍ"] = "i" + for char in u"ໄໃ": + self.trans[char] = "ai" + self.trans[u"ຳ"] = "am" + self.trans[u"າ"] = "aa" + self.trans[u"ີ"] = "ii" + self.trans[u"ື"] = "yy" + self.trans[u"ູ"] = "uu" + self.trans[u"ເ"] = "e" + 
self.trans[u"ແ"] = "ei" + self.trans[u"໐"] = "0" + self.trans[u"໑"] = "1" + self.trans[u"໒"] = "2" + self.trans[u"໓"] = "3" + self.trans[u"໔"] = "4" + self.trans[u"໕"] = "5" + self.trans[u"໖"] = "6" + self.trans[u"໗"] = "7" + self.trans[u"໘"] = "8" + self.trans[u"໙"] = "9" + for char in self.trans: + value = self.trans[char] + if value == "?": continue + while value.encode(encoding, 'replace').decode(encoding) == "?" and value in self.trans: + assert value != self.trans[value], "%r == self.trans[%r]!" % (value, value) + value = self.trans[value] + self.trans[char] = value + + def transliterate(self, char, default="?", prev="-", next="-"): + if char in self.trans: + return self.trans[char] + #Arabic + if char == u"◌": + return prev + #Japanese + if char == u"ッ": + return self.transliterate(next)[0] + if char in u"々仝ヽヾゝゞ〱〲〳〵〴〵": + return prev + #Lao + if char == u"ຫ": + if next in u"ງຍນຣລຼຼວ": + return "" + else: + return "h" + return default +
# Added: branches/rewrite/pywikibot/userinterfaces/win32_unicode.py
# (rev 0 -> rev 11646, 2013-06-12 21:24:42 UTC)
#
# Stdout, stderr and argv support:
##############################################
# Support for unicode in windows cmd.exe
# Posted on Stack Overflow [1], available under CC-BY-SA [2]
#
# Question: "Windows cmd encoding change causes Python crash" [3] by Alex [4],
# Answered [5] by David-Sarah Hopwood [6].
#
# [1] http://stackoverflow.com
# [2] http://creativecommons.org/licenses/by-sa/3.0/
# [3] http://stackoverflow.com/questions/878972
# [4] http://stackoverflow.com/users/85185
# [5] http://stackoverflow.com/a/3259271/118671
# [6] http://stackoverflow.com/users/393146
#
################################################
#
# stdin support added by Merlijn van Deen valhallasw@gmail.com, march 2012
# Licensed under both CC-BY-SA as the MIT license.
#
################################################

import sys

# Defaults. On platforms other than win32 - or when the console set-up below
# fails - these stay plain aliases of the interpreter's own objects.
stdin = sys.stdin
stdout = sys.stdout
stderr = sys.stderr
argv = sys.argv


def _strip_interpreter_args(args):
    """Return ``args`` without the interpreter and its leading options.

    ``args`` is the complete process command line as a list of native
    ``str`` objects whose first element is the Python interpreter. The
    result starts with the script path, with the module name after ``-m``,
    or with ``'-c'`` in place of the command string after ``-c``, followed
    by the remaining arguments.

    Only native ``str`` literals are compared against the arguments, so
    byte strings that hold non-ASCII UTF-8 data are never implicitly
    decoded.

    NOTE: only leading tokens that start with "-" are skipped; the value of
    an interpreter option that is passed as a separate token is therefore
    taken to be the script.
    """
    args = list(args[1:])
    while args:
        arg = args[0]
        if not arg.startswith("-") or arg == "-":
            break
        del args[0]
        if arg == "-m":
            # sys.argv[0] should really be the absolute path of the module
            # source, but never mind
            break
        if arg == "-c":
            # Put '-c' where the command string was. Slice assignment also
            # copes with '-c' being the very last token.
            args[:1] = ["-c"]
            break
    return args


if sys.platform == "win32":
    import codecs
    from ctypes import (WINFUNCTYPE, windll, POINTER, byref, c_int,
                        create_unicode_buffer)
    from ctypes.wintypes import BOOL, HANDLE, DWORD, LPWSTR, LPCWSTR
    try:
        from ctypes.wintypes import LPVOID
    except ImportError:
        from ctypes import c_void_p as LPVOID

    original_stderr = sys.stderr

    # If any exception occurs in this code, we'll probably try to print it on
    # stderr, which makes for frustrating debugging if stderr is directed to
    # our wrapper. So be paranoid about catching errors and reporting them to
    # original_stderr, so that we can at least see them.
    def _complain(message):
        """Print ``message`` (its repr if it is not a str) on the original stderr."""
        if not isinstance(message, str):
            message = repr(message)
        print >>original_stderr, message

    # Work around http://bugs.python.org/issue6058.
    codecs.register(lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None)

    # Make Unicode console output work independently of the current code page.
    # This also fixes http://bugs.python.org/issue1602.
    # Credit to Michael Kaplan
    # http://blogs.msdn.com/b/michkap/archive/2010/04/07/9989346.aspx
    # and TZOmegaTZIOY
    # http://stackoverflow.com/questions/878972/windows-cmd-encoding-change-causes-python-crash/1432462#1432462.
    try:
        # http://msdn.microsoft.com/en-us/library/ms683231(VS.85).aspx
        # HANDLE WINAPI GetStdHandle(DWORD nStdHandle);
        # returns INVALID_HANDLE_VALUE, NULL, or a valid handle
        #
        # http://msdn.microsoft.com/en-us/library/aa364960(VS.85).aspx
        # DWORD WINAPI GetFileType(DWORD hFile);
        #
        # http://msdn.microsoft.com/en-us/library/ms683167(VS.85).aspx
        # BOOL WINAPI GetConsoleMode(HANDLE hConsole, LPDWORD lpMode);

        GetStdHandle = WINFUNCTYPE(HANDLE, DWORD)(("GetStdHandle", windll.kernel32))
        STD_INPUT_HANDLE = DWORD(-10)
        STD_OUTPUT_HANDLE = DWORD(-11)
        STD_ERROR_HANDLE = DWORD(-12)
        GetFileType = WINFUNCTYPE(DWORD, DWORD)(("GetFileType", windll.kernel32))
        FILE_TYPE_CHAR = 0x0002
        FILE_TYPE_REMOTE = 0x8000
        GetConsoleMode = WINFUNCTYPE(BOOL, HANDLE, POINTER(DWORD))(
            ("GetConsoleMode", windll.kernel32))
        INVALID_HANDLE_VALUE = DWORD(-1).value

        def not_a_console(handle):
            """Return True unless ``handle`` refers to a console.

            A handle is rejected when it is invalid or None, when its file
            type (ignoring the REMOTE flag) is not CHAR, or when
            GetConsoleMode reports failure for it.
            """
            if handle == INVALID_HANDLE_VALUE or handle is None:
                return True
            return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                    or GetConsoleMode(handle, byref(DWORD())) == 0)

        old_stdin_fileno = None
        old_stdout_fileno = None
        old_stderr_fileno = None

        if hasattr(sys.stdin, 'fileno'):
            old_stdin_fileno = sys.stdin.fileno()
        if hasattr(sys.stdout, 'fileno'):
            old_stdout_fileno = sys.stdout.fileno()
        if hasattr(sys.stderr, 'fileno'):
            old_stderr_fileno = sys.stderr.fileno()

        STDIN_FILENO = 0
        STDOUT_FILENO = 1
        STDERR_FILENO = 2
        # A stream is "real" when it still uses its standard descriptor ...
        real_stdin = (old_stdin_fileno == STDIN_FILENO)
        real_stdout = (old_stdout_fileno == STDOUT_FILENO)
        real_stderr = (old_stderr_fileno == STDERR_FILENO)

        # ... and the matching standard handle is an actual console.
        if real_stdin:
            hStdin = GetStdHandle(STD_INPUT_HANDLE)
            if not_a_console(hStdin):
                real_stdin = False

        if real_stdout:
            hStdout = GetStdHandle(STD_OUTPUT_HANDLE)
            if not_a_console(hStdout):
                real_stdout = False

        if real_stderr:
            hStderr = GetStdHandle(STD_ERROR_HANDLE)
            if not_a_console(hStderr):
                real_stderr = False

        if real_stdin:
            ReadConsoleW = WINFUNCTYPE(BOOL, HANDLE, LPVOID, DWORD, POINTER(DWORD),
                                       LPVOID)(("ReadConsoleW", windll.kernel32))

            class UnicodeInput:
                """Console reader built on ReadConsoleW."""

                def __init__(self, hConsole, name, bufsize=1024):
                    self._hConsole = hConsole
                    self.bufsize = bufsize
                    self.buffer = create_unicode_buffer(bufsize)
                    self.name = name
                    self.encoding = 'utf-8'

                def readline(self):
                    """Read once from the console; return the text encoded with self.encoding.

                    At most ``bufsize - 1`` characters are requested per call.
                    Raises Exception when ReadConsoleW reports failure.
                    """
                    maxnum = DWORD(self.bufsize - 1)
                    numrecv = DWORD(0)
                    result = ReadConsoleW(self._hConsole, self.buffer, maxnum,
                                          byref(numrecv), None)
                    if not result:
                        raise Exception("stdin failure")
                    return self.buffer.value[:numrecv.value].encode(self.encoding)

        if real_stdout or real_stderr:
            # BOOL WINAPI WriteConsoleW(HANDLE hOutput, LPWSTR lpBuffer, DWORD nChars,
            #                           LPDWORD lpCharsWritten, LPVOID lpReserved);

            WriteConsoleW = WINFUNCTYPE(BOOL, HANDLE, LPWSTR, DWORD, POINTER(DWORD),
                                        LPVOID)(("WriteConsoleW", windll.kernel32))

            class UnicodeOutput:
                """File-like writer for a console handle or a redirected stream.

                With a console handle, text is written through WriteConsoleW.
                With ``hConsole`` set to None, text is encoded as UTF-8 and
                passed to ``stream``.
                """

                def __init__(self, hConsole, stream, fileno, name):
                    self._hConsole = hConsole
                    self._stream = stream
                    self._fileno = fileno
                    self.closed = False
                    self.softspace = False
                    self.mode = 'w'
                    self.encoding = 'utf-8'
                    self.name = name
                    self.flush()

                def isatty(self):
                    return False

                def close(self):
                    # don't really close the handle, that would only cause problems
                    self.closed = True

                def fileno(self):
                    return self._fileno

                def flush(self):
                    """Flush the wrapped stream; a console handle needs no flushing."""
                    if self._hConsole is None:
                        try:
                            self._stream.flush()
                        except Exception:
                            # sys.exc_info() rather than a named except
                            # clause: this spelling parses on every Python
                            # version.
                            e = sys.exc_info()[1]
                            _complain("%s.flush: %r from %r"
                                      % (self.name, e, self._stream))
                            raise

                def write(self, text):
                    """Write ``text``; failures are reported on the original stderr and re-raised."""
                    try:
                        if self._hConsole is None:
                            if isinstance(text, unicode):
                                text = text.encode('utf-8')
                            self._stream.write(text)
                        else:
                            if not isinstance(text, unicode):
                                text = str(text).decode('utf-8')
                            remaining = len(text)
                            while remaining > 0:
                                n = DWORD(0)
                                # There is a shorter-than-documented limitation on the
                                # length of the string passed to WriteConsoleW (see
                                # http://tahoe-lafs.org/trac/tahoe-lafs/ticket/1232).
                                retval = WriteConsoleW(self._hConsole, text,
                                                       min(remaining, 10000),
                                                       byref(n), None)
                                if retval == 0 or n.value == 0:
                                    raise IOError("WriteConsoleW returned %r, n.value = %r"
                                                  % (retval, n.value))
                                remaining -= n.value
                                if remaining == 0:
                                    break
                                # Drop what was written and go round again.
                                text = text[n.value:]
                    except Exception:
                        e = sys.exc_info()[1]
                        _complain("%s.write: %r" % (self.name, e))
                        raise

                def writelines(self, lines):
                    """Write every item of ``lines`` with write()."""
                    try:
                        for line in lines:
                            self.write(line)
                    except Exception:
                        e = sys.exc_info()[1]
                        _complain("%s.writelines: %r" % (self.name, e))
                        raise

        # Console input is replaced whenever stdin is a real console,
        # whatever the state of the output streams.
        if real_stdin:
            stdin = UnicodeInput(hStdin, name='<Unicode console stdin>')

        # UnicodeOutput exists only when at least one output stream is a
        # console; when both are redirected they are left untouched.
        if real_stdout or real_stderr:
            if real_stdout:
                stdout = UnicodeOutput(hStdout, None, STDOUT_FILENO,
                                       '<Unicode console stdout>')
            else:
                stdout = UnicodeOutput(None, sys.stdout, old_stdout_fileno,
                                       '<Unicode redirected stdout>')

            if real_stderr:
                stderr = UnicodeOutput(hStderr, None, STDERR_FILENO,
                                       '<Unicode console stderr>')
            else:
                stderr = UnicodeOutput(None, sys.stderr, old_stderr_fileno,
                                       '<Unicode redirected stderr>')
    except Exception:
        e = sys.exc_info()[1]
        _complain("exception %r while fixing up sys.stdout and sys.stderr" % (e,))

    # While we're at it, let's unmangle the command-line arguments:

    # This works around http://bugs.python.org/issue2128.
    GetCommandLineW = WINFUNCTYPE(LPWSTR)(("GetCommandLineW", windll.kernel32))
    CommandLineToArgvW = WINFUNCTYPE(POINTER(LPWSTR), LPCWSTR, POINTER(c_int))(
        ("CommandLineToArgvW", windll.shell32))

    argc = c_int(0)
    argv_unicode = CommandLineToArgvW(GetCommandLineW(), byref(argc))

    argv = [argv_unicode[i].encode('utf-8') for i in xrange(0, argc.value)]

    if not hasattr(sys, 'frozen'):
        # If this is an executable produced by py2exe or bbfreeze, then it will
        # have been invoked directly. Otherwise, argv_unicode[0] is the Python
        # interpreter, so skip that, together with the option arguments to
        # the Python interpreter.
        argv = _strip_interpreter_args(argv)
Property changes on: branches/rewrite/pywikibot/userinterfaces/win32_unicode.py ___________________________________________________________________ Added: svn:keywords + Author Date Id Revision Added: svn:executable + *