[Pywikipedia-l] SVN: [6275] trunk/pywikipedia/userinterfaces
a_engels at svn.wikimedia.org
a_engels at svn.wikimedia.org
Wed Jan 21 20:21:19 UTC 2009
Revision: 6275
Author: a_engels
Date: 2009-01-21 20:21:19 +0000 (Wed, 21 Jan 2009)
Log Message:
-----------
speeding up transliteration by using a dictionary instead of a sequence of elifs
Modified Paths:
--------------
trunk/pywikipedia/userinterfaces/terminal_interface.py
trunk/pywikipedia/userinterfaces/transliteration.py
Modified: trunk/pywikipedia/userinterfaces/terminal_interface.py
===================================================================
--- trunk/pywikipedia/userinterfaces/terminal_interface.py 2009-01-21 19:19:12 UTC (rev 6274)
+++ trunk/pywikipedia/userinterfaces/terminal_interface.py 2009-01-21 20:21:19 UTC (rev 6275)
@@ -16,6 +16,8 @@
except ImportError:
ctypes_found = False
+transliterator = transliteration.transliterator()
+
def getDefaultTextColorInWindows():
"""
This method determines the default text color and saves its color
@@ -195,9 +197,9 @@
# original question marks.
if codecedText[i] == '?' and text[i] != u'?':
try:
- transliterated = transliteration.trans(text[i], default = '?', prev = prev, next = text[i+1])
+ transliterated = transliterator.transliterate(text[i], default = '?', prev = prev, next = text[i+1])
except IndexError:
- transliterated = transliteration.trans(text[i], default = '?', prev = prev, next = ' ')
+ transliterated = transliterator.transliterate(text[i], default = '?', prev = prev, next = ' ')
# transliteration was successful. The replacement
# could consist of multiple letters.
# mark the transliterated letters in yellow.
Modified: trunk/pywikipedia/userinterfaces/transliteration.py
===================================================================
--- trunk/pywikipedia/userinterfaces/transliteration.py 2009-01-21 19:19:12 UTC (rev 6274)
+++ trunk/pywikipedia/userinterfaces/transliteration.py 2009-01-21 20:21:19 UTC (rev 6275)
@@ -1,1902 +1,1357 @@
-# -*- coding: utf-8 -*-
-def trans(char, default = '?', prev = '-', next = '-'):
- # Give a transliteration for char, or default if none is known
- # Accented etc. Latin characters
- if char in u"ÀÁÂẦẤẪẨẬÃĀĂẰẮẴẶẲȦǠẠḀȂĄǍẢ":
- return u"A"
- if char in u"ȀǞ":
- return u"Ä"
- if char == u"Ǻ":
- return u"Å"
- if char == u"Ä":
- return u"Ae"
- if char == u"Å":
- return u"Aa"
- if char in u"àáâầấẫẩậãāăằắẵặẳȧǡạḁȃąǎảẚ":
- return u"a"
- if char in u"ȁǟ":
- return u"ä"
- if char == u"ǻ":
- return u"å"
- if char == u"ä":
- return u"ae"
- if char == u"å":
- return u"aa"
- if char in u"ḂḄḆƁƂ":
- return u"B"
- if char in u"ḃḅḇƀɓƃ":
- return u"b"
- if char in u"ĆĈĊÇČƇ":
- return u"C"
- if char in u"ćĉċçčƈȼ":
- return u"c"
- if char == u"Ḉ":
- return u"Ç"
- if char == u"ḉ":
- return u"ç"
- if char == u"Ð":
- return u"Dh"
- if char == u"ð":
- return u"dh"
- if char in u"ĎḊḌḎḐḒĐƉƊƋ":
- return u"D"
- if char in u"ďḋḍḏḑḓđɖɗƌ":
- return u"d"
- if char in u"ÈȄÉÊḚËĒḔḖĔĖẸE̩ȆȨḜĘĚẼḘẺ":
- return u"E"
- if char in u"ỀẾỄỆỂ":
- return u"Ê"
- if char in u"èȅéêḛëēḕḗĕėẹe̩ȇȩḝęěẽḙẻ":
- return u"e"
- if char in u"ềếễệể":
- return u"ê"
- if char in u"ḞƑ":
- return u"F"
- if char in u"ḟƒ":
- return u"f"
- if char in u"ǴḠĞĠĢǦǤƓ":
- return u"G"
- if char in u"ǵḡğġģǧǥɠ":
- return u"g"
- if char == u"Ĝ":
- return u"Gx"
- if char == u"ĝ":
- return u"gx"
- if char in u"ḢḤḦȞḨḪH̱ĦǶ":
- return u"H"
- if char in u"ḣḥḧȟḩḫ̱ẖħƕ":
- return u"h"
- if char in u"IÌȈÍÎĨḬÏḮĪĬȊĮǏİỊỈƗ":
- return u"I"
- if char in u"ıìȉíîĩḭïḯīĭȋįǐiịỉɨ":
- return u"i"
- if char in u"ĴJ":
- return u"J"
- if char in u"ɟĵ̌ǰ":
- return u"j"
- if char in u"ḰǨĶḲḴƘ":
- return u"K"
- if char in u"ḱǩķḳḵƙ":
- return u"k"
- if char in u"ĹĻĽḶḸḺḼȽŁ":
- return u"L"
- if char in u"ĺļľḷḹḻḽƚłɫ":
- return u"l"
- if char in u"ḾṀṂ":
- return u"M"
- if char in u"ḿṁṃɱ":
- return u"m"
- if char in u"ǸŃÑŅŇṄṆṈṊŊƝɲȠ":
- return u"N"
- if char in u"ǹńñņňṅṇṉṋŋɲƞ":
- return u"n"
- if char in u"ÒÓÔÕṌṎȬÖŌṐṒŎǑȮȰỌǪǬƠỜỚỠỢỞỎƟØǾ":
- return u"O"
- if char in u"òóôõṍṏȭöōṑṓŏǒȯȱọǫǭơờớỡợởỏɵøǿ":
- return u"o"
- if char in u"ȌŐȪ":
- return u"Ö"
- if char in u"ȍőȫ":
- return u"ö"
- if char in u"ỒỐỖỘỔȎ":
- return u"Ô"
- if char in u"ồốỗộổȏ":
- return u"ô"
- if char in u"ṔṖƤ":
- return u"P"
- if char in u"ṕṗƥ":
- return u"p"
- if char == u"ᵽ":
- return u"q"
- if char in u"ȐŔŖŘȒṘṚṜṞ":
- return u"R"
- if char in u"ȑŕŗřȓṙṛṝṟɽ":
- return u"r"
- if char in u"ŚṤŞȘŠṦṠṢṨ":
- return u"S"
- if char in u"śṥşșšṧṡṣṩȿ":
- return u"s"
- if char == u"Ŝ":
- return u"Sx"
- if char == u"ŝ":
- return u"sx"
- if char in u"ŢȚŤṪṬṮṰŦƬƮ":
- return u"T"
- if char in u"ţțťṫṭṯṱŧȾƭʈ":
- return u"t"
- if char in u"ÙÚŨṸṴÜṲŪṺŬỤŮŲǓṶỦƯỮỰỬ":
- return u"U"
- if char in u"ùúũṹṵüṳūṻŭụůųǔṷủưữựửʉ":
- return u"u"
- if char in u"ȔŰǛǗǕǙ":
- return u"Ü"
- if char in u"ȕűǜǘǖǚ":
- return u"ü"
- if char == u"Û":
- return u"Ux"
- if char == u"û":
- return u"ux"
- if char == u"Ȗ":
- return u"Û"
- if char == u"ȗ":
- return u"û"
- if char == u"Ừ":
- return u"Ù"
- if char == u"ừ":
- return u"ù"
- if char == u"Ứ":
- return u"Ú"
- if char == u"ứ":
- return u"ú"
- if char in u"ṼṾ":
- return u"V"
- if char in u"ṽṿ":
- return u"v"
- if char in u"ẀẂŴẄẆẈ":
- return u"W"
- if char in u"ẁẃŵẅẇẉ":
- return u"w"
- if char in u"ẊẌ":
- return u"X"
- if char in u"ẋẍ":
- return u"x"
- if char in u"ỲÝŶŸỸȲẎỴỶƳ":
- return u"Y"
- if char in u"ỳýŷÿỹȳẏỵỷƴ":
- return u"y"
- if char in u"ŹẐŻẒŽẔƵȤ":
- return u"Z"
- if char in u"źẑżẓžẕƶȥ":
- return u"z"
- if char == u"ɀ":
- return u"zv"
-
- # Latin: extended Latin alphabet
- if char == u"ɑ":
- return u"a"
- if char in u"ÆǼǢ":
- return u"AE"
- if char in u"æǽǣ":
- return u"ae"
- if char == u"Ð":
- return u"Dh"
- if char == u"ð":
- return u"dh"
- if char in u"ƎƏƐ":
- return u"E"
- if char in u"ǝəɛ":
- return u"e"
- if char in u"ƔƢ":
- return u"G"
- if char in u"ᵷɣƣᵹ":
- return u"g"
- if char == u"Ƅ":
- return u"H"
- if char == u"ƅ":
- return u"h"
- if char == u"Ƕ":
- return u"Wh"
- if char == u"ƕ":
- return u"wh"
- if char == u"Ɩ":
- return u"I"
- if char == u"ɩ":
- return u"i"
- if char == u"Ŋ":
- return u"Ng"
- if char == u"ŋ":
- return u"ng"
- if char == u"Œ":
- return u"OE"
- if char == u"œ":
- return u"oe"
- if char == u"Ɔ":
- return u"O"
- if char == u"ɔ":
- return u"o"
- if char == u"Ȣ":
- return u"Ou"
- if char == u"ȣ":
- return u"ou"
- if char == u"Ƽ":
- return u"Q"
- if char in u"ĸƽ":
- return u"q"
- if char == u"ȹ":
- return u"qp"
- if char == u"":
- return u"r"
- if char == u"ſ":
- return u"s"
- if char == u"ß":
- return u"ss"
- if char == u"Ʃ":
- return u"Sh"
- if char == u"ʃᶋ":
- return u"sh"
- if char == u"Ʉ":
- return u"U"
- if char == u"ʉ":
- return u"u"
- if char == u"Ʌ":
- return u"V"
- if char == u"ʌ":
- return u"v"
- if char in u"ƜǷ":
- return u"W"
- if char in u"ɯƿ":
- return u"w"
- if char == u"Ȝ":
- return u"Y"
- if char == u"ȝ":
- return u"y"
- if char == u"IJ":
- return u"IJ"
- if char == u"ij":
- return u"ij"
- if char == u"Ƨ":
- return u"Z"
- if char in u"ʮƨ":
- return u"z"
- if char == u"Ʒ":
- return u"Zh"
- if char == u"ʒ":
- return u"zh"
- if char == u"Ǯ":
- return u"Dzh"
- if char == u"ǯ":
- return u"dzh"
- if char in u"ƸƹʔˀɁɂ":
- return u"'"
- if char in u"Þ":
- return u"Th"
- if char in u"þ":
- return u"th"
- if char in u"Cʗǃ":
- return u"!"
+# -*- coding: utf-8 -*-
+class transliterator(object):
+ def __init__(self):
+ self.trans = {}
+ for char in u"ÀÁÂẦẤẪẨẬÃĀĂẰẮẴẶẲȦǠẠḀȂĄǍẢ":
+ self.trans[char] = u"A"
+ for char in u"ȀǞ":
+ self.trans[char] = u"Ä"
+ self.trans[u"Ǻ"] = u"Å"
+ self.trans[u"Ä"] = u"Ae"
+ self.trans[u"Å"] = u"Aa"
+ for char in u"àáâầấẫẩậãāăằắẵặẳȧǡạḁȃąǎảẚ":
+ self.trans[char] = u"a"
+ for char in u"ȁǟ":
+ self.trans[char] = u"ä"
+ self.trans[u"ǻ"] = u"å"
+ self.trans[u"ä"] = u"ae"
+ self.trans[u"å"] = u"aa"
+ for char in u"ḂḄḆƁƂ":
+ self.trans[char] = u"B"
+ for char in u"ḃḅḇƀɓƃ":
+ self.trans[char] = u"b"
+ for char in u"ĆĈĊÇČƇ":
+ self.trans[char] = u"C"
+ for char in u"ćĉċçčƈȼ":
+ self.trans[char] = u"c"
+ self.trans[u"Ḉ"] = u"Ç"
+ self.trans[u"ḉ"] = u"ç"
+ self.trans[u"Ð"] = u"Dh"
+ self.trans[u"ð"] = u"dh"
+ for char in u"ĎḊḌḎḐḒĐƉƊƋ":
+ self.trans[char] = u"D"
+ for char in u"ďḋḍḏḑḓđɖɗƌ":
+ self.trans[char] = u"d"
+ for char in u"ÈȄÉÊḚËĒḔḖĔĖẸE̩ȆȨḜĘĚẼḘẺ":
+ self.trans[char] = u"E"
+ for char in u"ỀẾỄỆỂ":
+ self.trans[char] = u"Ê"
+ for char in u"èȅéêḛëēḕḗĕėẹe̩ȇȩḝęěẽḙẻ":
+ self.trans[char] = u"e"
+ for char in u"ềếễệể":
+ self.trans[char] = u"ê"
+ for char in u"ḞƑ":
+ self.trans[char] = u"F"
+ for char in u"ḟƒ":
+ self.trans[char] = u"f"
+ for char in u"ǴḠĞĠĢǦǤƓ":
+ self.trans[char] = u"G"
+ for char in u"ǵḡğġģǧǥɠ":
+ self.trans[char] = u"g"
+ self.trans[u"Ĝ"] = u"Gx"
+ self.trans[u"ĝ"] = u"gx"
+ for char in u"ḢḤḦȞḨḪH̱ĦǶ":
+ self.trans[char] = u"H"
+ for char in u"ḣḥḧȟḩḫ̱ẖħƕ":
+ self.trans[char] = u"h"
+ for char in u"IÌȈÍÎĨḬÏḮĪĬȊĮǏİỊỈƗ":
+ self.trans[char] = u"I"
+ for char in u"ıìȉíîĩḭïḯīĭȋįǐiịỉɨ":
+ self.trans[char] = u"i"
+ for char in u"ĴJ":
+ self.trans[char] = u"J"
+ for char in u"ɟĵ̌ǰ":
+ self.trans[char] = u"j"
+ for char in u"ḰǨĶḲḴƘ":
+ self.trans[char] = u"K"
+ for char in u"ḱǩķḳḵƙ":
+ self.trans[char] = u"k"
+ for char in u"ĹĻĽḶḸḺḼȽŁ":
+ self.trans[char] = u"L"
+ for char in u"ĺļľḷḹḻḽƚłɫ":
+ self.trans[char] = u"l"
+ for char in u"ḾṀṂ":
+ self.trans[char] = u"M"
+ for char in u"ḿṁṃɱ":
+ self.trans[char] = u"m"
+ for char in u"ǸŃÑŅŇṄṆṈṊŊƝɲȠ":
+ self.trans[char] = u"N"
+ for char in u"ǹńñņňṅṇṉṋŋɲƞ":
+ self.trans[char] = u"n"
+ for char in u"ÒÓÔÕṌṎȬÖŌṐṒŎǑȮȰỌǪǬƠỜỚỠỢỞỎƟØǾ":
+ self.trans[char] = u"O"
+ for char in u"òóôõṍṏȭöōṑṓŏǒȯȱọǫǭơờớỡợởỏɵøǿ":
+ self.trans[char] = u"o"
+ for char in u"ȌŐȪ":
+ self.trans[char] = u"Ö"
+ for char in u"ȍőȫ":
+ self.trans[char] = u"ö"
+ for char in u"ỒỐỖỘỔȎ":
+ self.trans[char] = u"Ô"
+ for char in u"ồốỗộổȏ":
+ self.trans[char] = u"ô"
+ for char in u"ṔṖƤ":
+ self.trans[char] = u"P"
+ for char in u"ṕṗƥ":
+ self.trans[char] = u"p"
+ self.trans[u"ᵽ"] = u"q"
+ for char in u"ȐŔŖŘȒṘṚṜṞ":
+ self.trans[char] = u"R"
+ for char in u"ȑŕŗřȓṙṛṝṟɽ":
+ self.trans[char] = u"r"
+ for char in u"ŚṤŞȘŠṦṠṢṨ":
+ self.trans[char] = u"S"
+ for char in u"śṥşșšṧṡṣṩȿ":
+ self.trans[char] = u"s"
+ self.trans[u"Ŝ"] = u"Sx"
+ self.trans[u"ŝ"] = u"sx"
+ for char in u"ŢȚŤṪṬṮṰŦƬƮ":
+ self.trans[char] = u"T"
+ for char in u"ţțťṫṭṯṱŧȾƭʈ":
+ self.trans[char] = u"t"
+ for char in u"ÙÚŨṸṴÜṲŪṺŬỤŮŲǓṶỦƯỮỰỬ":
+ self.trans[char] = u"U"
+ for char in u"ùúũṹṵüṳūṻŭụůųǔṷủưữựửʉ":
+ self.trans[char] = u"u"
+ for char in u"ȔŰǛǗǕǙ":
+ self.trans[char] = u"Ü"
+ for char in u"ȕűǜǘǖǚ":
+ self.trans[char] = u"ü"
+ self.trans[u"Û"] = u"Ux"
+ self.trans[u"û"] = u"ux"
+ self.trans[u"Ȗ"] = u"Û"
+ self.trans[u"ȗ"] = u"û"
+ self.trans[u"Ừ"] = u"Ù"
+ self.trans[u"ừ"] = u"ù"
+ self.trans[u"Ứ"] = u"Ú"
+ self.trans[u"ứ"] = u"ú"
+ for char in u"ṼṾ":
+ self.trans[char] = u"V"
+ for char in u"ṽṿ":
+ self.trans[char] = u"v"
+ for char in u"ẀẂŴẄẆẈ":
+ self.trans[char] = u"W"
+ for char in u"ẁẃŵẅẇẉ":
+ self.trans[char] = u"w"
+ for char in u"ẊẌ":
+ self.trans[char] = u"X"
+ for char in u"ẋẍ":
+ self.trans[char] = u"x"
+ for char in u"ỲÝŶŸỸȲẎỴỶƳ":
+ self.trans[char] = u"Y"
+ for char in u"ỳýŷÿỹȳẏỵỷƴ":
+ self.trans[char] = u"y"
+ for char in u"ŹẐŻẒŽẔƵȤ":
+ self.trans[char] = u"Z"
+ for char in u"źẑżẓžẕƶȥ":
+ self.trans[char] = u"z"
+ self.trans[u"ɀ"] = u"zv"
+
+ # Latin: extended Latin alphabet
+ self.trans[u"ɑ"] = u"a"
+ for char in u"ÆǼǢ":
+ self.trans[char] = u"AE"
+ for char in u"æǽǣ":
+ self.trans[char] = u"ae"
+ self.trans[u"Ð"] = u"Dh"
+ self.trans[u"ð"] = u"dh"
+ for char in u"ƎƏƐ":
+ self.trans[char] = u"E"
+ for char in u"ǝəɛ":
+ self.trans[char] = u"e"
+ for char in u"ƔƢ":
+ self.trans[char] = u"G"
+ for char in u"ᵷɣƣᵹ":
+ self.trans[char] = u"g"
+ self.trans[u"Ƅ"] = u"H"
+ self.trans[u"ƅ"] = u"h"
+ self.trans[u"Ƕ"] = u"Wh"
+ self.trans[u"ƕ"] = u"wh"
+ self.trans[u"Ɩ"] = u"I"
+ self.trans[u"ɩ"] = u"i"
+ self.trans[u"Ŋ"] = u"Ng"
+ self.trans[u"ŋ"] = u"ng"
+ self.trans[u"Œ"] = u"OE"
+ self.trans[u"œ"] = u"oe"
+ self.trans[u"Ɔ"] = u"O"
+ self.trans[u"ɔ"] = u"o"
+ self.trans[u"Ȣ"] = u"Ou"
+ self.trans[u"ȣ"] = u"ou"
+ self.trans[u"Ƽ"] = u"Q"
+ for char in u"ĸƽ":
+ self.trans[char] = u"q"
+ self.trans[u"ȹ"] = u"qp"
+ self.trans[u""] = u"r"
+ self.trans[u"ſ"] = u"s"
+ self.trans[u"ß"] = u"ss"
+ self.trans[u"Ʃ"] = u"Sh"
+ for char in u"ʃᶋ":
+ self.trans[char] = u"sh"
+ self.trans[u"Ʉ"] = u"U"
+ self.trans[u"ʉ"] = u"u"
+ self.trans[u"Ʌ"] = u"V"
+ self.trans[u"ʌ"] = u"v"
+ for char in u"ƜǷ":
+ self.trans[char] = u"W"
+ for char in u"ɯƿ":
+ self.trans[char] = u"w"
+ self.trans[u"Ȝ"] = u"Y"
+ self.trans[u"ȝ"] = u"y"
+ self.trans[u"IJ"] = u"IJ"
+ self.trans[u"ij"] = u"ij"
+ self.trans[u"Ƨ"] = u"Z"
+ for char in u"ʮƨ":
+ self.trans[char] = u"z"
+ self.trans[u"Ʒ"] = u"Zh"
+ self.trans[u"ʒ"] = u"zh"
+ self.trans[u"Ǯ"] = u"Dzh"
+ self.trans[u"ǯ"] = u"dzh"
+ for char in u"ƸƹʔˀɁɂ":
+ self.trans[char] = u"'"
+ for char in u"Þ":
+ self.trans[char] = u"Th"
+ for char in u"þ":
+ self.trans[char] = u"th"
+ for char in u"Cʗǃ":
+ self.trans[char] = u"!"
- #Punctuation and typography
- if char in u"«»“”„¨":
- return u'"'
- if char in u"‘’′":
- return u"'"
- if char == u"•":
- return u"*"
- if char == u"@":
- return u"(at)"
- if char == u"¤":
- return u"$"
- if char == u"¢":
- return u"c"
- if char == u"€":
- return u"E"
- if char == u"£":
- return u"L"
- if char == u"¥":
- return u"yen"
- if char == u"†":
- return u"+"
- if char == u"‡":
- return u"++"
- if char == u"°":
- return u":"
- if char == u"¡":
- return u"!"
- if char == u"¿":
- return u"?"
- if char == u"‰":
- return u"o/oo"
- if char == u"‱":
- return u"o/ooo"
- if char in u"¶§":
- return u">"
- if char in u"…":
- return u"..."
- if char in u"‒–—―":
- return u"-"
- if char in u"·":
- return u" "
- if char == u"¦":
- return u"|"
- if char == u"⁂":
- return u"***"
- if char == u"◊":
- return u"<>"
- if char == u"‽":
- return u"?!"
- if char == u"؟":
- return u";-)"
-
+ #Punctuation and typography
+ for char in u"«»“”„¨":
+ self.trans[char] = u'"'
+ for char in u"‘’′":
+ self.trans[char] = u"'"
+ self.trans[u"•"] = u"*"
+ self.trans[u"@"] = u"(at)"
+ self.trans[u"¤"] = u"$"
+ self.trans[u"¢"] = u"c"
+ self.trans[u"€"] = u"E"
+ self.trans[u"£"] = u"L"
+ self.trans[u"¥"] = u"yen"
+ self.trans[u"†"] = u"+"
+ self.trans[u"‡"] = u"++"
+ self.trans[u"°"] = u":"
+ self.trans[u"¡"] = u"!"
+ self.trans[u"¿"] = u"?"
+ self.trans[u"‰"] = u"o/oo"
+ self.trans[u"‱"] = u"o/ooo"
+ for char in u"¶§":
+ self.trans[char] = u">"
+ for char in u"…":
+ self.trans[char] = u"..."
+ for char in u"‒–—―":
+ self.trans[char] = u"-"
+ for char in u"·":
+ self.trans[char] = u" "
+ self.trans[u"¦"] = u"|"
+ self.trans[u"⁂"] = u"***"
+ self.trans[u"◊"] = u"<>"
+ self.trans[u"‽"] = u"?!"
+ self.trans[u"؟"] = u";-)"
- # Cyrillic
- if char == u"А":
- return u"A"
- if char == u"а":
- return u"a"
- if char == u"Б":
- return u"B"
- if char == u"б":
- return u"b"
- if char == u"В":
- return u"V"
- if char == u"в":
- return u"v"
- if char == u"Г":
- return u"G"
- if char == u"г":
- return u"g"
- if char == u"Д":
- return u"D"
- if char == u"д":
- return u"d"
- if char == u"Е":
- return u"E"
- if char == u"е":
- return u"e"
- if char == u"Ж":
- return u"Zh"
- if char == u"ж":
- return u"zh"
- if char == u"З":
- return u"Z"
- if char == u"з":
- return u"z"
- if char == u"И":
- return u"I"
- if char == u"и":
- return u"i"
- if char == u"Й":
- return u"J"
- if char == u"й":
- return u"j"
- if char == u"К":
- return u"K"
- if char == u"к":
- return u"k"
- if char == u"Л":
- return u"L"
- if char == u"л":
- return u"l"
- if char == u"М":
- return u"M"
- if char == u"м":
- return u"m"
- if char == u"Н":
- return u"N"
- if char == u"н":
- return u"n"
- if char == u"О":
- return u"O"
- if char == u"о":
- return u"o"
- if char == u"П":
- return u"P"
- if char == u"п":
- return u"p"
- if char == u"Р":
- return u"R"
- if char == u"р":
- return u"r"
- if char == u"С":
- return u"S"
- if char == u"с":
- return u"s"
- if char == u"Т":
- return u"T"
- if char == u"т":
- return u"t"
- if char in u"У":
- return u"U"
- if char == u"у":
- return u"u"
- if char == u"Ф":
- return u"F"
- if char == u"ф":
- return u"f"
- if char in u"ХΧ":
- if prev.lower() == prev:
- return u"Kh"
- else:
- return u"KH"
- if char == u"х":
- return u"kh"
- if char == u"Ц":
- return u"C"
- if char == u"ц":
- return u"c"
- if char == u"Ч":
- return u"Ch"
- if char == u"ч":
- return u"ch"
- if char == u"Ш":
- return u"Sh"
- if char == u"ш":
- return u"sh"
- if char == u"Щ":
- return u"Shch"
- if char == u"щ":
- return u"shch"
- if char in u"Ьь":
- return u"'"
- if char in u"Ъъ":
- return '"'
- if char == u"Ю":
- return u"Yu"
- if char == u"ю":
- return u"yu"
- if char == u"Я":
- return u"Ya"
- if char == u"я":
- return u"ya"
- # Additional Cyrillic letters, most occuring in only one or a few languages
- if char == u"Ы":
- return u"Y"
- if char == u"ы":
- return u"y"
- if char == u"Ё":
- return u"Ë"
- if char == u"ё":
- return u"ë"
- if char in u"ЭЀ":
- return u"È"
- if char in u"эѐ":
- return u"è"
- if char == u"І":
- return u"I"
- if char == u"і":
- return u"i"
- if char == u"Ї":
- return u"Ji"
- if char == u"ї":
- return u"ji"
- if char == u"Є":
- return u"Je"
- if char == u"є":
- return u"je"
- if char in u"ҐҜ":
- return u"G"
- if char in u"ґҝ":
- return u"g"
- if char == u"Ђ":
- return u"Dj"
- if char == u"ђ":
- return u"dj"
- if char in u"ЈӤҊ":
- return u"J"
- if char in u"јӥҋ":
- return u"j"
- if char == u"Ӣ":
- return u"Y"
- if char == u"ӣ":
- return u"y"
- if char == u"Љ":
- return u"Lj"
- if char == u"љ":
- return u"lj"
- if char == u"Њ":
- return u"Nj"
- if char == u"њ":
- return u"nj"
- if char == u"Ћ":
- return u"Cj"
- if char == u"ћ":
- return u"cj"
- if char in u"ЏӁӜҶ":
- return u"Dzh"
- if char in u"џӂӝҷ":
- return u"dzh"
- if char == u"Җ":
- return u"Zhj"
- if char == u"җ":
- return u"zhj"
- if char in u"ЅӞӠӋҸ":
- return u"Dz"
- if char in u"ѕӟӡӌҹ":
- return u"dz"
- if char == u"Ѓ":
- return u"Gj"
- if char == u"ѓ":
- return u"gj"
- if char == u"Ќ":
- return u"Kj"
- if char == u"ќ":
- return u"kj"
- if char in u"ҒӶҔ":
- return u"G"
- if char in u"ғӷҕ":
- return u"g"
- if char == u"Ӣ":
- return u"Ii"
- if char == u"ӣ":
- return u"ii"
- if char in u"ҚҞҠӃ":
- return u"Q"
- if char == u"қҟҡӄ":
- return u"q"
- if char == u"Ӯ":
- return u"U"
- if char == u"ӯ":
- return u"u"
- if char == u"Ҳ":
- return u"H"
- if char == u"ҳ":
- return u"h"
- if char == u"Ҷ":
- return u"Dz"
- if char == u"ҷ":
- return u"dz"
- if char in u"ӨӪ":
- return u"Ô"
- if char in u"өӫ":
- return u"ô"
- if char == u"Ү":
- return u"Y"
- if char == u"ү":
- return u"y"
- if char == u"Һ":
- return u"H"
- if char == u"һ":
- return u"h"
- if char in u"ӘӔ":
- return u"AE"
- if char == u"ә":
- return u"ae"
- if char == u"ӚӬ":
- return u"Ë"
- if char == u"ӛӭ":
- return u"ë"
- if char == u"Җ":
- return u"Zhj"
- if char == u"җ":
- return u"zhj"
- if char == u"ҢҤӉӇ":
- return u"Ng"
- if char == u"ңҥӊӈ":
- return u"ng"
- if char == u"Ұ":
- return u"U"
- if char == u"ұ":
- return u"u"
- if char == u"ў":
- return u"ù"
- if char == u"Ў":
- return u"Ù"
- if char == u"ѝ":
- return u"ì"
- if char == u"Ѝ":
- return u"Ì"
- if char == u"Ӑ":
- return u"A"
- if char == u"ă":
- return u"a"
- if char == u"Ӓ":
- return u"Ä"
- if char == u"ä":
- return u"ä"
- if char in u"ӖѢҌ":
- return u"E"
- if char in u"ӗѣҍ":
- return u"e"
- if char == u"ҼҾ":
- return u"Ts"
- if char == u"ҽҿ":
- return u"ts"
- if char == u"Ҙ":
- return u"Dh"
- if char == u"ҙ":
- return u"dh"
- if char in u"Ӏӏ":
- return u""
- if char == u"Ӆ":
- return u"L"
- if char == u"ӆ":
- return u"l"
- if char == u"Ӎ":
- return u"M"
- if char == u"ӎ":
- return u"m"
- if char == u"Ӧ":
- return u"Ö"
- if char == u"ӧ":
- return u"ö"
- if char == u"Ҩ":
- return u"u"
- if char == u"ҩ":
- return u"u"
- if char == u"Ҧ":
- return u"Ph"
- if char == u"ҧ":
- return u"ph"
- if char == u"Ҏ":
- return u"R"
- if char == u"ҏ":
- return u"r"
- if char == u"Ҫ":
- return u"Th"
- if char == u"ҫ":
- return u"th"
- if char == u"Ҭ":
- return u"T"
- if char == u"ҭ":
- return u"t"
- if char in u"ӲӰҮ":
- return u"Ü"
- if char in u"ӳӱү":
- return u"ü"
- if char == u"Ӯ":
- return u"Û"
- if char == u"ӯ":
- return u"û"
- if char == u"ҰӸ":
- return u"U"
- if char == u"ұӹ":
- return u"u"
- if char == u"Ҵ":
- return u"Tts"
- if char == u"ҵ":
- return u"tts"
- if char == u"Ӵ":
- return u"Ch"
- if char == u"ӵ":
- return u"ch"
+ # Cyrillic
+ self.trans.update({u"А" : u"A", u"а" : u"a", u"Б" : u"B", u"б" : u"b",
+ u"В" : u"V", u"в" : u"v", u"Г" : u"G", u"г" : u"g",
+ u"Д" : u"D", u"д" : u"d", u"Е" : u"E", u"е" : u"e",
+ u"Ж" : u"Zh", u"ж" : u"zh", u"З" : u"Z", u"з" : u"z",
+ u"И" : u"I", u"и" : u"i", u"Й" : u"J", u"й" : u"j",
+ u"К" : u"K", u"к" : u"k", u"Л" : u"L", u"л" : u"l",
+ u"М" : u"M", u"м" : u"m", u"Н" : u"N", u"н" : u"n",
+ u"О" : u"O", u"о" : u"o", u"П" : u"P", u"п" : u"p",
+ u"Р" : u"R", u"р" : u"r", u"С" : u"S", u"с" : u"s",
+ u"Т" : u"T", u"т" : u"t", u"У" : u"U", u"у" : u"u",
+ u"Ф" : u"F", u"ф" : u"f", u"х" : u"kh", u"Ц" : u"C",
+ u"ц" : u"c", u"Ч" : u"Ch", u"ч" : u"ch", u"Ш" : u"Sh",
+ u"ш" : u"sh", u"Щ" : u"Shch", u"щ" : u"shch", u"Ь" : u"'",
+ u"ь" : "'", u"Ъ" : u'"', u"ъ" : '"', u"Ю" : u"Yu",
+ u"ю" : u"yu", u"Я" : u"Ya", u"я" : u"ya", u"Х" : u"Kh",
+ u"Χ" : u"Kh"})
- # Archaic Cyrillic letters
- if char == u"Ѹ":
- return u"Ou"
- if char == u"ѹ":
- return u"ou"
- if char in u"ѠѺ":
- return u"O"
- if char in u"ѡѻ":
- return u"o"
- if char == u"Ѿ":
- return u"Ot"
- if char == u"ѿ":
- return u"ot"
- if char == u"Ѣ":
- return u"E"
- if char == u"ѣ":
- return u"e"
- if char in u"ѤѦ":
- return u"Ei"
- if char in u"ѥѧ":
- return u"ei"
- if char == u"Ѫ":
- return u"Ai"
- if char == u"ѫ":
- return u"ai"
- if char == u"Ѯ":
- return u"X"
- if char == u"ѯ":
- return u"x"
- if char == u"Ѱ":
- return u"Ps"
- if char == u"ѱ":
- return u"ps"
- if char == u"Ѳ":
- return u"Th"
- if char == u"ѳ":
- return u"th"
- if char in u"ѴѶ":
- return u"Ü"
- if char == u"ѵ":
- return u"ü"
-
+ # Additional Cyrillic letters, most occurring in only one or a few languages
+ self.trans.update({u"Ы" : u"Y", u"ы" : u"y", u"Ё" : u"Ë", u"ё" : u"ë",
+ u"Э" : u"È", u"Ѐ" : u"È", u"э" : u"è", u"ѐ" : u"è",
+ u"І" : u"I", u"і" : u"i", u"Ї" : u"Ji", u"ї" : u"ji",
+ u"Є" : u"Je", u"є" : u"je", u"Ґ" : u"G", u"Ҝ" : u"G",
+ u"ґ" : u"g", u"ҝ" : u"g", u"Ђ" : u"Dj", u"ђ" : u"dj",
+ u"Ӣ" : u"Y", u"ӣ" : u"y", u"Љ" : u"Lj", u"љ" : u"lj",
+ u"Њ" : u"Nj", u"њ" : u"nj", u"Ћ" : u"Cj", u"ћ" : u"cj",
+ u"Җ" : u"Zhj", u"җ" : u"zhj", u"Ѓ" : u"Gj", u"ѓ" : u"gj",
+ u"Ќ" : u"Kj", u"ќ" : u"kj", u"Ӣ" : u"Ii", u"ӣ" : u"ii",
+ u"Ӯ" : u"U", u"ӯ" : u"u", u"Ҳ" : u"H", u"ҳ" : u"h",
+ u"Ҷ" : u"Dz",u"ҷ" : u"dz", u"Ө" :u"Ô", u"Ӫ" : u"Ô",
+ u"ө" : u"ô", u"ӫ" : u"ô", u"Ү": u"Y", u"ү": u"y", u"Һ": u"H",
+ u"һ": u"h", u"Ә": u"AE", u"Ӕ": u"AE", u"ә": u"ae",
+ u"Ӛ": u"Ë", u"Ӭ": u"Ë", u"ӛ": u"ë", u"ӭ": u"ë", u"Җ": u"Zhj",
+ u"җ": u"zhj", u"Ұ": u"U", u"ұ": u"u", u"ў": u"ù", u"Ў": u"Ù",
+ u"ѝ": u"ì", u"Ѝ": u"Ì", u"Ӑ": u"A", u"ă": u"a", u"Ӓ": u"Ä",
+ u"ä": u"ä", u"Ҽ" : u"Ts", u"Ҿ": u"Ts", u"ҽ": u"ts", u"ҿ": u"ts",
+ u"Ҙ": u"Dh", u"ҙ": u"dh", u"Ӏ": u"", u"ӏ": u"", u"Ӆ": u"L",
+ u"ӆ": u"l", u"Ӎ": u"M", u"ӎ": u"m", u"Ӧ": u"Ö", u"ӧ": u"ö",
+ u"Ҩ": u"u", u"ҩ": u"u", u"Ҧ": u"Ph", u"ҧ": u"ph", u"Ҏ": u"R",
+ u"ҏ": u"r", u"Ҫ": u"Th", u"ҫ": u"th", u"Ҭ": u"T", u"ҭ": u"t",
+ u"Ӯ": u"Û", u"ӯ": u"û", u"Ұ": u"U", u"Ӹ": u"U", u"ұ": u"u",
+ u"ӹ": u"u", u"Ҵ": u"Tts", u"ҵ": u"tts", u"Ӵ": u"Ch", u"ӵ": u"ch"})
+
+ for char in u"ЈӤҊ":
+ self.trans[char] = u"J"
+ for char in u"јӥҋ":
+ self.trans[char] = u"j"
+ for char in u"ЏӁӜҶ":
+ self.trans[char] = u"Dzh"
+ for char in u"џӂӝҷ":
+ self.trans[char] = u"dzh"
+ for char in u"ЅӞӠӋҸ":
+ self.trans[char] = u"Dz"
+ for char in u"ѕӟӡӌҹ":
+ self.trans[char] = u"dz"
+ for char in u"ҒӶҔ":
+ self.trans[char] = u"G"
+ for char in u"ғӷҕ":
+ self.trans[char] = u"g"
+ for char in u"ҚҞҠӃ":
+ self.trans[char] = u"Q"
+ for char in u"қҟҡӄ":
+ self.trans[char] = u"q"
+ for char in u"ҢҤӉӇ":
+ self.trans[char] = u"Ng"
+ for char in u"ңҥӊӈ":
+ self.trans[char] = u"ng"
+ for char in u"ӖѢҌ":
+ self.trans[char] = u"E"
+ for char in u"ӗѣҍ":
+ self.trans[char] = u"e"
+ for char in u"ӲӰҮ":
+ self.trans[char] = u"Ü"
+ for char in u"ӳӱү":
+ self.trans[char] = u"ü"
- # Hebrew alphabet
- if char in u"אע":
- return u"'"
- if char == u"ב":
- return u"b"
- if char == u"ג":
- return u"g"
- if char == u"ד":
- return u"d"
- if char == u"ה":
- return u"h"
- if char == u"ו":
- return u"v"
- if char == u"ז":
- return u"z"
- if char == u"ח":
- return u"kh"
- if char == u"ט":
- return u"t"
- if char == u"י":
- return u"y"
- if char in u"ךכ":
- return u"k"
- if char == u"ל":
- return u"l"
- if char in u"םמ":
- return u"m"
- if char in u"ןנ":
- return u"n"
- if char == u"ס":
- return u"s"
- if char in u"ףפ":
- return u"ph"
- if char in u"ץצ":
- return u"ts"
- if char == u"ק":
- return u"q"
- if char == u"ר":
- return u"r"
- if char == u"ש":
- return u"sh"
- if char == u"ת":
- return u"th"
-
- # Arab alphabet
- if char in u"اﺍﺎ":
- return u"a"
- if char in u"بﺏﺐﺒﺑ":
- return u"b"
- if char in u"تﺕﺖﺘﺗ":
- return u"t"
- if char in u"ثﺙﺚﺜﺛ":
- return u"th"
- if char in u"جﺝﺞﺠﺟ":
- return u"g"
- if char in u"حﺡﺢﺤﺣ":
- return u"h"
- if char in u"خﺥﺦﺨﺧ":
- return u"kh"
- if char in u"دﺩﺪ":
- return u"d"
- if char in u"ذﺫﺬ":
- return u"dh"
- if char in u"رﺭﺮ":
- return u"r"
- if char in u"زﺯﺰ":
- return u"z"
- if char in u"سﺱﺲﺴﺳ":
- return u"s"
- if char in u"شﺵﺶﺸﺷ":
- return u"sh"
- if char in u"صﺹﺺﺼﺻ":
- return u"s"
- if char in u"ضﺽﺾﻀﺿ":
- return u"d"
- if char in u"طﻁﻂﻄﻃ":
- return u"t"
- if char in u"ظﻅﻆﻈﻇ":
- return u"z"
- if char in u"عﻉﻊﻌﻋ":
- return u"'"
- if char in u"غﻍﻎﻐﻏ":
- return u"gh"
- if char in u"فﻑﻒﻔﻓ":
- return u"f"
- if char in u"قﻕﻖﻘﻗ":
- return u"q"
- if char in u"كﻙﻚﻜﻛک":
- return u"k"
- if char in u"لﻝﻞﻠﻟ":
- return u"l"
- if char in u"مﻡﻢﻤﻣ":
- return u"m"
- if char in u"نﻥﻦﻨﻧ":
- return u"n"
- if char in u"هﻩﻪﻬﻫ":
- return u"h"
- if char in u"وﻭﻮ":
- return u"w"
- if char in u"یيﻱﻲﻴﻳ":
- return u"y"
- # Arabic - additional letters, modified letters and ligatures
- if char == u"ﺀ":
- return u"'"
- if char in u"آﺁﺂ":
- return u"'a"
- if char in u"ةﺓﺔ":
- return u"th"
- if char in u"ىﻯﻰ":
- return u"á"
- if char in u"یﯼﯽﯿﯾ":
- return u"y"
- if char == u"؟":
- return u"?"
- # Arabic - ligatures
- if char in u"ﻻﻼ":
- return u"la"
- if char == u"ﷲ":
- return u"llah"
- if char in u"إأ":
- return u"a'"
- if char == u"ؤ":
- return u"w'"
- if char == u"ئ":
- return u"y'"
- if char == u"◌":
- return prev
- if char in u"◌◌":
- return u"" # indicates absence of vowels
- # Arabic vowels
- if char == u"◌":
- return u"a"
- if char == u"◌":
- return u"u"
- if char == u"◌":
- return u"i"
- if char == u"◌":
- return u"a"
- if char == u"◌":
- return u"ay"
- if char == u"◌":
- return u"ay"
- if char == u"◌":
- return u"u"
- if char == u"◌":
- return u"iy"
- # Arab numerals
- if char in u"٠۰":
- return u"0"
- if char in u"١۱":
- return u"1"
- if char in u"٢۲":
- return u"2"
- if char in u"٣۳":
- return u"3"
- if char in u"٤۴":
- return u"4"
- if char in u"٥۵":
- return u"5"
- if char in u"٦۶":
- return u"6"
- if char in u"٧۷":
- return u"7"
- if char in u"٨۸":
- return u"8"
- if char in u"٩۹":
- return u"9"
- # Perso-Arabic
- if char in u"پﭙﭙپ":
- return u"p"
- if char in u"چچچچ":
- return u"ch"
- if char in u"ژژ":
- return u"zh"
- if char in u"گﮔﮕﮓ":
- return u"g"
+ # Archaic Cyrillic letters
+ self.trans.update({u"Ѹ": u"Ou", u"ѹ": u"ou", u"Ѡ": u"O", u"Ѻ": u"O", u"ѡ": u"o",
+ u"ѻ": u"o", u"Ѿ": u"Ot", u"ѿ": u"ot", u"Ѣ": u"E", u"ѣ": u"e",
+ u"Ѥ": u"Ei", u"Ѧ": u"Ei", u"ѥ": u"ei", u"ѧ": u"ei", u"Ѫ": u"Ai",
+ u"ѫ": u"ai", u"Ѯ": u"X", u"ѯ": u"x", u"Ѱ": u"Ps", u"ѱ": u"ps",
+ u"Ѳ": u"Th", u"ѳ": u"th", u"Ѵ": u"Ü", u"Ѷ": u"Ü", u"ѵ": u"ü"})
- # Greek
- if char == u"Α":
- return u"A"
- if char == u"α":
- return u"a"
- if char == u"Β":
- return u"B"
- if char == u"β":
- return u"b"
- if char == u"Γ":
- return u"G"
- if char == u"γ":
- return u"g"
- if char == u"Δ":
- return u"D"
- if char == u"δ":
- return u"d"
- if char == u"Ε":
- return u"E"
- if char == u"ε":
- return u"e"
- if char == u"Ζ":
- return u"Z"
- if char == u"ζ":
- return u"z"
- if char == u"Η":
- return u"I"
- if char == u"η":
- return u"i"
- if char == u"Θ":
- if prev.lower() == prev:
- return u"Th"
- else:
- return u"TH"
- if char == u"θ":
- return u"th"
- if char == u"Ι":
- return u"I"
- if char == u"ι":
- return u"i"
- if char == u"Κ":
- return u"K"
- if char == u"κ":
- return u"k"
- if char == u"Λ":
- return u"L"
- if char == u"λ":
- return u"l"
- if char == u"Μ":
- return u"M"
- if char == u"μ":
- return u"m"
- if char == u"Ν":
- return u"N"
- if char == u"ν":
- return u"n"
- if char == u"Ξ":
- return u"X"
- if char == u"ξ":
- return u"x"
- if char == u"Ο":
- return u"O"
- if char == u"ο":
- return u"o"
- if char == u"Π":
- return u"P"
- if char == u"π":
- return u"p"
- if char == u"Ρ":
- return u"R"
- if char == u"ρ":
- return u"r"
- if char == u"Σ":
- return u"S"
- if char in u"σς":
- return u"s"
- if char == u"Τ":
- return u"T"
- if char == u"τ":
- return u"t"
- if char == u"Υ":
- return u"Y"
- if char == u"υ":
- return u"y"
- if char == u"Φ":
- return u"F"
- if char == u"φ":
- return u"f"
- if char == u"Ψ":
- if prev.lower() == prev:
- return u"Ps"
- else:
- return u"PS"
- if char == u"ψ":
- return u"ps"
- if char == u"Ω":
- return u"O"
- if char == u"ω":
- return u"o"
- # Greek: Special and old characters
- if char == u"ϗ":
- return u"&"
- if char == u"Ϛ":
- if prev.lower() == prev:
- return u"St"
- else:
- return u"ST"
- if char == u"ϛ":
- return u"st"
- if char in u"ϘϞ":
- return u"Q"
- if char in u"ϙϟ":
- return u"q"
- if char == u"Ϻ":
- return u"S"
- if char == u"ϻ":
- return u"s"
- if char == u"Ϡ":
- if prev.lower() == prev:
- return u"Ss"
- else:
- return u"SS"
- if char == u"ϡ":
- return u"ss"
- if char == u"Ϸ":
- if prev.lower() == prev:
- return u"Sh"
- else:
- return u"SH"
- if char == u"ϸ":
- return u"sh"
- if char == u"·":
- return u":"
- # Greek: Accented characters
- if char == u"Ά":
- return u"Á"
- if char == u"ά":
- return u"á"
- if char in u"ΈΉ":
- return u"É"
- if char in u"έή":
- return u"é"
- if char == u"Ί":
- return u"Í"
- if char == u"ί":
- return u"í"
- if char == u"Ϊ":
- return u"Ï"
- if char in u"ϊΐ":
- return u"ï"
- if char == u"Ό":
- return u"Ó"
- if char == u"ό":
- return u"ó"
- if char == u"Ύ":
- return u"Ý"
- if char == u"ύ":
- return u"ý"
- if char == u"Ϋ":
- return u"Y"
- if char in u"ϋΰ":
- return u"ÿ"
- if char == u"Ώ":
- return u"Ó"
- if char == u"ώ":
- return u"ó"
+ # Hebrew alphabet
+ for char in u"אע":
+ self.trans[char] = u"'"
+ self.trans[u"ב"] = u"b"
+ self.trans[u"ג"] = u"g"
+ self.trans[u"ד"] = u"d"
+ self.trans[u"ה"] = u"h"
+ self.trans[u"ו"] = u"v"
+ self.trans[u"ז"] = u"z"
+ self.trans[u"ח"] = u"kh"
+ self.trans[u"ט"] = u"t"
+ self.trans[u"י"] = u"y"
+ for char in u"ךכ":
+ self.trans[char] = u"k"
+ self.trans[u"ל"] = u"l"
+ for char in u"םמ":
+ self.trans[char] = u"m"
+ for char in u"ןנ":
+ self.trans[char] = u"n"
+ self.trans[u"ס"] = u"s"
+ for char in u"ףפ":
+ self.trans[char] = u"ph"
+ for char in u"ץצ":
+ self.trans[char] = u"ts"
+ self.trans[u"ק"] = u"q"
+ self.trans[u"ר"] = u"r"
+ self.trans[u"ש"] = u"sh"
+ self.trans[u"ת"] = u"th"
+
+ # Arab alphabet
+ for char in u"اﺍﺎ":
+ self.trans[char] = u"a"
+ for char in u"بﺏﺐﺒﺑ":
+ self.trans[char] = u"b"
+ for char in u"تﺕﺖﺘﺗ":
+ self.trans[char] = u"t"
+ for char in u"ثﺙﺚﺜﺛ":
+ self.trans[char] = u"th"
+ for char in u"جﺝﺞﺠﺟ":
+ self.trans[char] = u"g"
+ for char in u"حﺡﺢﺤﺣ":
+ self.trans[char] = u"h"
+ for char in u"خﺥﺦﺨﺧ":
+ self.trans[char] = u"kh"
+ for char in u"دﺩﺪ":
+ self.trans[char] = u"d"
+ for char in u"ذﺫﺬ":
+ self.trans[char] = u"dh"
+ for char in u"رﺭﺮ":
+ self.trans[char] = u"r"
+ for char in u"زﺯﺰ":
+ self.trans[char] = u"z"
+ for char in u"سﺱﺲﺴﺳ":
+ self.trans[char] = u"s"
+ for char in u"شﺵﺶﺸﺷ":
+ self.trans[char] = u"sh"
+ for char in u"صﺹﺺﺼﺻ":
+ self.trans[char] = u"s"
+ for char in u"ضﺽﺾﻀﺿ":
+ self.trans[char] = u"d"
+ for char in u"طﻁﻂﻄﻃ":
+ self.trans[char] = u"t"
+ for char in u"ظﻅﻆﻈﻇ":
+ self.trans[char] = u"z"
+ for char in u"عﻉﻊﻌﻋ":
+ self.trans[char] = u"'"
+ for char in u"غﻍﻎﻐﻏ":
+ self.trans[char] = u"gh"
+ for char in u"فﻑﻒﻔﻓ":
+ self.trans[char] = u"f"
+ for char in u"قﻕﻖﻘﻗ":
+ self.trans[char] = u"q"
+ for char in u"كﻙﻚﻜﻛک":
+ self.trans[char] = u"k"
+ for char in u"لﻝﻞﻠﻟ":
+ self.trans[char] = u"l"
+ for char in u"مﻡﻢﻤﻣ":
+ self.trans[char] = u"m"
+ for char in u"نﻥﻦﻨﻧ":
+ self.trans[char] = u"n"
+ for char in u"هﻩﻪﻬﻫ":
+ self.trans[char] = u"h"
+ for char in u"وﻭﻮ":
+ self.trans[char] = u"w"
+ for char in u"یيﻱﻲﻴﻳ":
+ self.trans[char] = u"y"
+ # Arabic - additional letters, modified letters and ligatures
+ self.trans[u"ﺀ"] = u"'"
+ for char in u"آﺁﺂ":
+ self.trans[char] = u"'a"
+ for char in u"ةﺓﺔ":
+ self.trans[char] = u"th"
+ for char in u"ىﻯﻰ":
+ self.trans[char] = u"á"
+ for char in u"یﯼﯽﯿﯾ":
+ self.trans[char] = u"y"
+ self.trans[u"؟"] = u"?"
+ # Arabic - ligatures
+ for char in u"ﻻﻼ":
+ self.trans[char] = u"la"
+ self.trans[u"ﷲ"] = u"llah"
+ for char in u"إأ":
+ self.trans[char] = u"a'"
+ self.trans[u"ؤ"] = u"w'"
+ self.trans[u"ئ"] = u"y'"
+ for char in u"◌◌":
+ self.trans[char] = u"" # indicates absence of vowels
+ # Arabic vowels
+ self.trans[u"◌"] = u"a"
+ self.trans[u"◌"] = u"u"
+ self.trans[u"◌"] = u"i"
+ self.trans[u"◌"] = u"a"
+ self.trans[u"◌"] = u"ay"
+ self.trans[u"◌"] = u"ay"
+ self.trans[u"◌"] = u"u"
+ self.trans[u"◌"] = u"iy"
+ # Arabic-Indic numerals (Arabic and Persian forms)
+ for char in u"٠۰":
+ self.trans[char] = u"0"
+ for char in u"١۱":
+ self.trans[char] = u"1"
+ for char in u"٢۲":
+ self.trans[char] = u"2"
+ for char in u"٣۳":
+ self.trans[char] = u"3"
+ for char in u"٤۴":
+ self.trans[char] = u"4"
+ for char in u"٥۵":
+ self.trans[char] = u"5"
+ for char in u"٦۶":
+ self.trans[char] = u"6"
+ for char in u"٧۷":
+ self.trans[char] = u"7"
+ for char in u"٨۸":
+ self.trans[char] = u"8"
+ for char in u"٩۹":
+ self.trans[char] = u"9"
+ # Perso-Arabic
+ for char in u"پﭙﭙپ":
+ self.trans[char] = u"p"
+ for char in u"چچچچ":
+ self.trans[char] = u"ch"
+ for char in u"ژژ":
+ self.trans[char] = u"zh"
+ for char in u"گﮔﮕﮓ":
+ self.trans[char] = u"g"
- # Japanese (katakana and hiragana)
- if char in u"アァあ":
- return u"a"
- if char in u"イィい":
- return u"i"
- if char in u"ウう":
- return u"u"
- if char in u"エェえ":
- return u"e"
- if char in u"オォお":
- return u"o"
- if char in u"ャや":
- return u"ya"
- if char in u"ュゆ":
- return u"yu"
- if char in u"ョよ":
- return u"yo"
- if char in u"カか":
- return u"ka"
- if char in u"キき":
- return u"ki"
- if char in u"クく":
- return u"ku"
- if char in u"ケけ":
- return u"ke"
- if char in u"コこ":
- return u"ko"
- if char in u"サさ":
- return u"sa"
- if char in u"シし":
- return u"shi"
- if char in u"スす":
- return u"su"
- if char in u"セせ":
- return u"se"
- if char in u"ソそ":
- return u"so"
- if char in u"タた":
- return u"ta"
- if char in u"チち":
- return u"chi"
- if char in u"ツつ":
- return u"tsu"
- if char in u"テて":
- return u"te"
- if char in u"トと":
- return u"to"
- if char in u"ナな":
- return u"na"
- if char in u"ニに":
- return u"ni"
- if char in u"ヌぬ":
- return u"nu"
- if char in u"ネね":
- return u"ne"
- if char in u"ノの":
- return u"no"
- if char in u"ハは":
- return u"ha"
- if char in u"ヒひ":
- return u"hi"
- if char in u"フふ":
- return u"fu"
- if char in u"ヘへ":
- return u"he"
- if char in u"ホほ":
- return u"ho"
- if char in u"マま":
- return u"ma"
- if char in u"ミみ":
- return u"mi"
- if char in u"ムむ":
- return u"mu"
- if char in u"メめ":
- return u"me"
- if char in u"モも":
- return u"mo"
- if char in u"ラら":
- return u"ra"
- if char in u"リり":
- return u"ri"
- if char in u"ルる":
- return u"ru"
- if char in u"レれ":
- return u"re"
- if char in u"ロろ":
- return u"ro"
- if char in u"ワわ":
- return u"wa"
- if char in u"ヰゐ":
- return u"wi"
- if char in u"ヱゑ":
- return u"we"
- if char in u"ヲを":
- return u"wo"
- if char in u"ンん":
- return u"n"
- if char in u"ガが":
- return u"ga"
- if char in u"ギぎ":
- return u"gi"
- if char in u"グぐ":
- return u"gu"
- if char in u"ゲげ":
- return u"ge"
- if char in u"ゴご":
- return u"go"
- if char in u"ザざ":
- return u"za"
- if char in u"ジじ":
- return u"ji"
- if char in u"ズず":
- return u"zu"
- if char in u"ゼぜ":
- return u"ze"
- if char in u"ゾぞ":
- return u"zo"
- if char in u"ダだ":
- return u"da"
- if char in u"ヂぢ":
- return u"dji"
- if char in u"ヅづ":
- return u"dzu"
- if char in u"デで":
- return u"de"
- if char in u"ドど":
- return u"do"
- if char in u"バば":
- return u"ba"
- if char in u"ビび":
- return u"bi"
- if char in u"ブぶ":
- return u"bu"
- if char in u"ベべ":
- return u"be"
- if char in u"ボぼ":
- return u"bo"
- if char in u"パぱ":
- return u"pa"
- if char in u"ピぴ":
- return u"pi"
- if char in u"プぷ":
- return u"pu"
- if char in u"ペぺ":
- return u"pe"
- if char in u"ポぽ":
- return u"po"
- if char in u"ヴゔ":
- return u"vu"
- if char == u"ヷ":
- return u"va"
- if char == u"ヸ":
- return u"vi"
- if char == u"ヹ":
- return u"ve"
- if char == u"ヺ":
- return u"vo"
- if char == u"ッ":
- return trans(next)[0]
+ # Greek
+ self.trans.update({u"Α": u"A", u"α": u"a", u"Β": u"B", u"β": u"b", u"Γ": u"G",
+ u"γ": u"g", u"Δ": u"D", u"δ": u"d", u"Ε": u"E", u"ε": u"e",
+ u"Ζ": u"Z", u"ζ": u"z", u"Η": u"I", u"η": u"i", u"θ": u"th",
+ u"Θ": u"Th", u"Ι": u"I", u"ι": u"i", u"Κ": u"K", u"κ": u"k",
+ u"Λ": u"L", u"λ": u"l", u"Μ": u"M", u"μ": u"m", u"Ν": u"N",
+ u"ν": u"n", u"Ξ": u"X", u"ξ": u"x", u"Ο": u"O", u"ο": u"o",
+ u"Π": u"P", u"π": u"p", u"Ρ": u"R", u"ρ": u"r", u"Σ": u"S",
+ u"σ": u"s", u"ς": u"s", u"Τ": u"T", u"τ": u"t", u"Υ": u"Y",
+ u"υ": u"y", u"Φ": u"F", u"φ": u"f", u"Ψ": u"Ps", u"ψ": u"ps",
+ u"Ω": u"O", u"ω": u"o", u"ϗ": u"&", u"Ϛ": u"St", u"ϛ": u"st",
+ u"Ϙ": u"Q", u"Ϟ": u"Q", u"ϙ": u"q", u"ϟ": u"q", u"Ϻ": u"S",
+ u"ϻ": u"s", u"Ϡ": u"Ss", u"ϡ": u"ss", u"Ϸ": u"Sh", u"ϸ": u"sh",
+ u"·": u":", u"Ά": u"Á", u"ά": u"á", u"Έ": u"É", u"Ή": u"É",
+ u"έ": u"é", u"ή": u"é", u"Ί": u"Í", u"ί": u"í", u"Ϊ": u"Ï",
+ u"ϊ": u"ï", u"ΐ": u"ï", u"Ό": u"Ó", u"ό": u"ó", u"Ύ": u"Ý",
+ u"ύ": u"ý", u"Ϋ": u"Y", u"ϋ": u"ÿ", u"ΰ": u"ÿ", u"Ώ": u"Ó",
+ u"ώ": u"ó"})
- # Japanese and Chinese punctuation and typography
- if char == u"・·":
- return u" "
- if char == u"々仝ヽヾゝゞ〱〲〳〵〴〵":
- return prev
- if char in u"〃『』《》":
- return u'"'
- if char in u"「」〈〉〘〙〚〛":
- return u"'"
- if char in u"(〔":
- return u"("
- if char in u")〕":
- return u")"
- if char in u"[【〖":
- return u"["
- if char in u"]】〗":
- return u"]"
- if char == u"{":
- return u"{"
- if char == u"}":
- return u"}"
- if char == u"っ":
- return u":"
- if char == u"ー":
- return u"h"
- if char == u"゛":
- return u"'"
- if char == u"゜":
- return u"p"
- if char == u"。":
- return u". "
- if char == u"、":
- return u", "
- if char == u"・":
- return u" "
- if char == u"〆":
- return u"shime"
- if char == u"〜":
- return u"-"
- if char == u"…":
- return u"..."
- if char == u"‥":
- return u".."
- if char == u"ヶ":
- return u"months"
- if char in u"•◦":
- return u"_"
- if char in u"※*":
- return u"*"
- if char == u"Ⓧ":
- return u"(X)"
- if char == u"Ⓨ":
- return u"(Y)"
- if char == u"!":
- return u"!"
- if char == u"?":
- return u"?"
- if char == u";":
- return u";"
- if char == u":":
- return u":"
- if char == u"。":
- return u"."
- if char in u",、":
- return u","
+ # Japanese (katakana and hiragana)
+ for char in u"アァあ":
+ self.trans[char] = u"a"
+ for char in u"イィい":
+ self.trans[char] = u"i"
+ for char in u"ウう":
+ self.trans[char] = u"u"
+ for char in u"エェえ":
+ self.trans[char] = u"e"
+ for char in u"オォお":
+ self.trans[char] = u"o"
+ for char in u"ャや":
+ self.trans[char] = u"ya"
+ for char in u"ュゆ":
+ self.trans[char] = u"yu"
+ for char in u"ョよ":
+ self.trans[char] = u"yo"
+ for char in u"カか":
+ self.trans[char] = u"ka"
+ for char in u"キき":
+ self.trans[char] = u"ki"
+ for char in u"クく":
+ self.trans[char] = u"ku"
+ for char in u"ケけ":
+ self.trans[char] = u"ke"
+ for char in u"コこ":
+ self.trans[char] = u"ko"
+ for char in u"サさ":
+ self.trans[char] = u"sa"
+ for char in u"シし":
+ self.trans[char] = u"shi"
+ for char in u"スす":
+ self.trans[char] = u"su"
+ for char in u"セせ":
+ self.trans[char] = u"se"
+ for char in u"ソそ":
+ self.trans[char] = u"so"
+ for char in u"タた":
+ self.trans[char] = u"ta"
+ for char in u"チち":
+ self.trans[char] = u"chi"
+ for char in u"ツつ":
+ self.trans[char] = u"tsu"
+ for char in u"テて":
+ self.trans[char] = u"te"
+ for char in u"トと":
+ self.trans[char] = u"to"
+ for char in u"ナな":
+ self.trans[char] = u"na"
+ for char in u"ニに":
+ self.trans[char] = u"ni"
+ for char in u"ヌぬ":
+ self.trans[char] = u"nu"
+ for char in u"ネね":
+ self.trans[char] = u"ne"
+ for char in u"ノの":
+ self.trans[char] = u"no"
+ for char in u"ハは":
+ self.trans[char] = u"ha"
+ for char in u"ヒひ":
+ self.trans[char] = u"hi"
+ for char in u"フふ":
+ self.trans[char] = u"fu"
+ for char in u"ヘへ":
+ self.trans[char] = u"he"
+ for char in u"ホほ":
+ self.trans[char] = u"ho"
+ for char in u"マま":
+ self.trans[char] = u"ma"
+ for char in u"ミみ":
+ self.trans[char] = u"mi"
+ for char in u"ムむ":
+ self.trans[char] = u"mu"
+ for char in u"メめ":
+ self.trans[char] = u"me"
+ for char in u"モも":
+ self.trans[char] = u"mo"
+ for char in u"ラら":
+ self.trans[char] = u"ra"
+ for char in u"リり":
+ self.trans[char] = u"ri"
+ for char in u"ルる":
+ self.trans[char] = u"ru"
+ for char in u"レれ":
+ self.trans[char] = u"re"
+ for char in u"ロろ":
+ self.trans[char] = u"ro"
+ for char in u"ワわ":
+ self.trans[char] = u"wa"
+ for char in u"ヰゐ":
+ self.trans[char] = u"wi"
+ for char in u"ヱゑ":
+ self.trans[char] = u"we"
+ for char in u"ヲを":
+ self.trans[char] = u"wo"
+ for char in u"ンん":
+ self.trans[char] = u"n"
+ for char in u"ガが":
+ self.trans[char] = u"ga"
+ for char in u"ギぎ":
+ self.trans[char] = u"gi"
+ for char in u"グぐ":
+ self.trans[char] = u"gu"
+ for char in u"ゲげ":
+ self.trans[char] = u"ge"
+ for char in u"ゴご":
+ self.trans[char] = u"go"
+ for char in u"ザざ":
+ self.trans[char] = u"za"
+ for char in u"ジじ":
+ self.trans[char] = u"ji"
+ for char in u"ズず":
+ self.trans[char] = u"zu"
+ for char in u"ゼぜ":
+ self.trans[char] = u"ze"
+ for char in u"ゾぞ":
+ self.trans[char] = u"zo"
+ for char in u"ダだ":
+ self.trans[char] = u"da"
+ for char in u"ヂぢ":
+ self.trans[char] = u"dji"
+ for char in u"ヅづ":
+ self.trans[char] = u"dzu"
+ for char in u"デで":
+ self.trans[char] = u"de"
+ for char in u"ドど":
+ self.trans[char] = u"do"
+ for char in u"バば":
+ self.trans[char] = u"ba"
+ for char in u"ビび":
+ self.trans[char] = u"bi"
+ for char in u"ブぶ":
+ self.trans[char] = u"bu"
+ for char in u"ベべ":
+ self.trans[char] = u"be"
+ for char in u"ボぼ":
+ self.trans[char] = u"bo"
+ for char in u"パぱ":
+ self.trans[char] = u"pa"
+ for char in u"ピぴ":
+ self.trans[char] = u"pi"
+ for char in u"プぷ":
+ self.trans[char] = u"pu"
+ for char in u"ペぺ":
+ self.trans[char] = u"pe"
+ for char in u"ポぽ":
+ self.trans[char] = u"po"
+ for char in u"ヴゔ":
+ self.trans[char] = u"vu"
+ self.trans[u"ヷ"] = u"va"
+ self.trans[u"ヸ"] = u"vi"
+ self.trans[u"ヹ"] = u"ve"
+ self.trans[u"ヺ"] = u"vo"
- # Georgian
- if char == u"ა":
- return u"a"
- if char == u"ბ":
- return u"b"
- if char == u"გ":
- return u"g"
- if char == u"დ":
- return u"d"
- if char in u"ეჱ":
- return u"e"
- if char == u"ვ":
- return u"v"
- if char == u"ზ":
- return u"z"
- if char == u"თ":#
- return u"th"
- if char == u"ი":
- return u"i"
- if char == u"კ":#
- return u"k"
- if char == u"ლ":
- return u"l"
- if char == u"მ":
- return u"m"
- if char == u"ნ":
- return u"n"
- if char == u"ო":
- return u"o"
- if char == u"პ":#
- return u"p"
- if char == u"ჟ":#
- return u"zh"
- if char == u"რ":
- return u"r"
- if char == u"ს":
- return u"s"
- if char == u"ტ":#
- return u"t"
- if char == u"უ":
- return u"u"
- if char == u"ფ":#
- return u"ph"
- if char == u"ქ":#
- return u"q"
- if char == u"ღ":#
- return u"gh"
- if char == u"ყ":#
- return u"q'"
- if char == u"შ":
- return u"sh"
- if char == u"ჩ":
- return u"ch"
- if char == u"ც":
- return u"ts"
- if char == u"ძ":
- return u"dz"
- if char == u"წ":#
- return u"ts'"
- if char == u"ჭ":#
- return u"ch'"
- if char == u"ხ":
- return u"kh"
- if char == u"ჯ":#
- return u"j"
- if char == u"ჰ":
- return u"h"
- if char == u"ჳ":
- return u"w"
- if char == u"ჵ":
- return u"o"
- if char == u"ჶ":
- return u"f"
+ # Japanese and Chinese punctuation and typography
+ for char in u"・·":
+ self.trans[char] = u" "
+ for char in u"〃『』《》":
+ self.trans[char] = u'"'
+ for char in u"「」〈〉〘〙〚〛":
+ self.trans[char] = u"'"
+ for char in u"(〔":
+ self.trans[char] = u"("
+ for char in u")〕":
+ self.trans[char] = u")"
+ for char in u"[【〖":
+ self.trans[char] = u"["
+ for char in u"]】〗":
+ self.trans[char] = u"]"
+ for char in u"{":
+ self.trans[char] = u"{"
+ for char in u"}":
+ self.trans[char] = u"}"
+ for char in u"っ":
+ self.trans[char] = u":"
+ for char in u"ー":
+ self.trans[char] = u"h"
+ for char in u"゛":
+ self.trans[char] = u"'"
+ for char in u"゜":
+ self.trans[char] = u"p"
+ for char in u"。":
+ self.trans[char] = u". "
+ for char in u"、":
+ self.trans[char] = u", "
+ for char in u"・":
+ self.trans[char] = u" "
+ for char in u"〆":
+ self.trans[char] = u"shime"
+ for char in u"〜":
+ self.trans[char] = u"-"
+ for char in u"…":
+ self.trans[char] = u"..."
+ for char in u"‥":
+ self.trans[char] = u".."
+ for char in u"ヶ":
+ self.trans[char] = u"months"
+ for char in u"•◦":
+ self.trans[char] = u"_"
+ for char in u"※*":
+ self.trans[char] = u"*"
+ for char in u"Ⓧ":
+ self.trans[char] = u"(X)"
+ for char in u"Ⓨ":
+ self.trans[char] = u"(Y)"
+ for char in u"!":
+ self.trans[char] = u"!"
+ for char in u"?":
+ self.trans[char] = u"?"
+ for char in u";":
+ self.trans[char] = u";"
+ for char in u":":
+ self.trans[char] = u":"
+ for char in u"。":
+ self.trans[char] = u"."
+ for char in u",、":
+ self.trans[char] = u","
- # Devanagari
- if char in u"पप":
- return u"p"
- if char in u"अ":
- return u"a"
- if char in u"आा":
- return u"aa"
- if char == u"प":
- return u"pa"
- if char in u"इि":
- return u"i"
- if char in u"ईी":
- return u"ii"
- if char in u"उु":
- return u"u"
- if char in u"ऊू":
- return u"uu"
- if char in u"एे":
- return u"e"
- if char in u"ऐै":
- return u"ai"
- if char in u"ओो":
- return u"o"
- if char in u"औौ":
- return u"au"
- if char in u"ऋृर":
- return u"r"
- if char in u"ॠॄ":
- return u"rr"
- if char in u"ऌॢल":
- return u"l"
- if char in u"ॡॣ":
- return u"ll"
- if char == u"क":
- return u"k"
- if char == u"ख":
- return u"kh"
- if char == u"ग":
- return u"g"
- if char == u"घ":
- return u"gh"
- if char == u"ङ":
- return u"ng"
- if char == u"च":
- return u"c"
- if char == u"छ":
- return u"ch"
- if char == u"ज":
- return u"j"
- if char == u"झ":
- return u"jh"
- if char == u"ञ":
- return u"ñ"
- if char in u"टत":
- return u"t"
- if char in u"ठथ":
- return u"th"
- if char in u"डद":
- return u"d"
- if char in u"ढध":
- return u"dh"
- if char in u"णन":
- return u"n"
- if char == u"फ":
- return u"ph"
- if char == u"ब":
- return u"b"
- if char == u"भ":
- return u"bh"
- if char == u"म":
- return u"m"
- if char == u"य":
- return u"y"
- if char == u"व":
- return u"v"
- if char == u"श":
- return u"sh"
- if char in u"षस":
- return u"s"
- if char == u"ह":
- return u"h"
- if char == u"क":
- return u"x"
- if char == u"त":
- return u"tr"
- if char == u"ज":
- return u"gj"
- if char == u"क़":
- return u"q"
- if char == u"फ":
- return u"f"
- if char == u"ख":
- return u"hh"
- if char == u"H":
- return u"gh"
- if char == u"ज":
- return u"z"
- if char in u"डढ":
- return u"r"
- # Devanagari ligatures (possibly incomplete and/or incorrect)
- if char == u"ख्":
- return u"khn"
- if char == u"त":
- return u"tn"
- if char == u"द्":
- return u"dn"
- if char == u"श":
- return u"cn"
- if char == u"ह्":
- return u"fn"
- if char in u"अँ":
- return u"m"
- if char in u"॒॑":
- return u""
- if char == u"०":
- return u"0"
- if char == u"१":
- return u"1"
- if char == u"२":
- return u"2"
- if char == u"३":
- return u"3"
- if char == u"४":
- return u"4"
- if char == u"५":
- return u"5"
- if char == u"६":
- return u"6"
- if char == u"७":
- return u"7"
- if char == u"८":
- return u"8"
- if char == u"९":
- return u"9"
+ # Georgian
+ for char in u"ა":
+ self.trans[char] = u"a"
+ for char in u"ბ":
+ self.trans[char] = u"b"
+ for char in u"გ":
+ self.trans[char] = u"g"
+ for char in u"დ":
+ self.trans[char] = u"d"
+ for char in u"ეჱ":
+ self.trans[char] = u"e"
+ for char in u"ვ":
+ self.trans[char] = u"v"
+ for char in u"ზ":
+ self.trans[char] = u"z"
+ for char in u"თ":#
+ self.trans[char] = u"th"
+ for char in u"ი":
+ self.trans[char] = u"i"
+ for char in u"კ":#
+ self.trans[char] = u"k"
+ for char in u"ლ":
+ self.trans[char] = u"l"
+ for char in u"მ":
+ self.trans[char] = u"m"
+ for char in u"ნ":
+ self.trans[char] = u"n"
+ for char in u"ო":
+ self.trans[char] = u"o"
+ for char in u"პ":#
+ self.trans[char] = u"p"
+ for char in u"ჟ":#
+ self.trans[char] = u"zh"
+ for char in u"რ":
+ self.trans[char] = u"r"
+ for char in u"ს":
+ self.trans[char] = u"s"
+ for char in u"ტ":#
+ self.trans[char] = u"t"
+ for char in u"უ":
+ self.trans[char] = u"u"
+ for char in u"ფ":#
+ self.trans[char] = u"ph"
+ for char in u"ქ":#
+ self.trans[char] = u"q"
+ for char in u"ღ":#
+ self.trans[char] = u"gh"
+ for char in u"ყ":#
+ self.trans[char] = u"q'"
+ for char in u"შ":
+ self.trans[char] = u"sh"
+ for char in u"ჩ":
+ self.trans[char] = u"ch"
+ for char in u"ც":
+ self.trans[char] = u"ts"
+ for char in u"ძ":
+ self.trans[char] = u"dz"
+ for char in u"წ":#
+ self.trans[char] = u"ts'"
+ for char in u"ჭ":#
+ self.trans[char] = u"ch'"
+ for char in u"ხ":
+ self.trans[char] = u"kh"
+ for char in u"ჯ":#
+ self.trans[char] = u"j"
+ for char in u"ჰ":
+ self.trans[char] = u"h"
+ for char in u"ჳ":
+ self.trans[char] = u"w"
+ for char in u"ჵ":
+ self.trans[char] = u"o"
+ for char in u"ჶ":
+ self.trans[char] = u"f"
- # Armenian
- if char == u"Ա":
- return u"A"
- if char == u"ա":
- return u"a"
- if char == u"Բ":
- return u"B"
- if char == u"բ":
- return u"b"
- if char == u"Գ":
- return u"G"
- if char == u"գ":
- return u"g"
- if char == u"Դ":
- return u"D"
- if char == u"դ":
- return u"d"
- if char == u"Ե":
- return u"Je"
- if char == u"ե":
- return u"e"
- if char == u"Զ":
- return u"Z"
- if char == u"զ":
- return u"z"
- if char == u"Է":
- return u"É"
- if char == u"է":
- return u"é"
- if char == u"Ը":
- return u"Ë"
- if char == u"ը":
- return u"ë"
- if char == u"Թ":
- return u"Th"
- if char == u"թ":
- return u"th"
- if char == u"Ժ":
- return u"Zh"
- if char == u"ժ":
- return u"zh"
- if char == u"Ի":
- return u"I"
- if char == u"ի":
- return u"i"
- if char == u"Լ":
- return u"L"
- if char == u"լ":
- return u"l"
- if char == u"Խ":
- return u"Ch"
- if char == u"խ":
- return u"ch"
- if char == u"Ծ":
- return u"Ts"
- if char == u"ծ":
- return u"ts"
- if char == u"Կ":
- return u"K"
- if char == u"կ":
- return u"k"
- if char == u"Հ":
- return u"H"
- if char == u"հ":
- return u"h"
- if char == u"Ձ":
- return u"Dz"
- if char == u"ձ":
- return u"dz"
- if char == u"Ղ":
- return u"R"
- if char == u"ղ":
- return u"r"
- if char == u"Ճ":
- return u"Cz"
- if char == u"ճ":
- return u"cz"
- if char == u"Մ":
- return u"M"
- if char == u"մ":
- return u"m"
- if char == u"Յ":
- return u"J"
- if char == u"յ":
- return u"j"
- if char == u"Ն":
- return u"N"
- if char == u"ն":
- return u"n"
- if char == u"Շ":
- return u"S"
- if char == u"շ":
- return u"s"
- if char == u"Շ":
- return u"Vo"
- if char == u"շ":
- return u"o"
- if char == u"Չ":
- return u"Tsh"
- if char == u"չ":
- return u"tsh"
- if char == u"Պ":
- return u"P"
- if char == u"պ":
- return u"p"
- if char == u"Ջ":
- return u"Dz"
- if char == u"ջ":
- return u"dz"
- if char == u"Ռ":
- return u"R"
- if char == u"ռ":
- return u"r"
- if char == u"Ս":
- return u"S"
- if char == u"ս":
- return u"s"
- if char == u"Վ":
- return u"V"
- if char == u"վ":
- return u"v"
- if char == u"Տ":
- return u"T'"
- if char == u"տ":
- return u"t'"
- if char == u"Ր":
- return u"R"
- if char == u"ր":
- return u"r"
- if char == u"Ց":
- return u"Tsh"
- if char == u"ց":
- return u"tsh"
- if char == u"Ւ":
- return u"V"
- if char == u"ւ":
- return u"v"
- if char == u"Փ":
- return u"Ph"
- if char == u"փ":
- return u"ph"
- if char == u"Ք":
- return u"Kh"
- if char == u"ք":
- return u"kh"
- if char == u"Օ":
- return u"O"
- if char == u"օ":
- return u"o"
- if char == u"Ֆ":
- return u"F"
- if char == u"ֆ":
- return u"f"
- if char == u"և":
- return u"&"
- if char == u"՟":
- return u"."
- if char == u"՞":
- return u"?"
- if char == u"՝":
- return u";"
- if char == u"՛":
- return u""
+ # Devanagari
+ for char in u"पप":
+ self.trans[char] = u"p"
+ for char in u"अ":
+ self.trans[char] = u"a"
+ for char in u"आा":
+ self.trans[char] = u"aa"
+ for char in u"प":
+ self.trans[char] = u"pa"
+ for char in u"इि":
+ self.trans[char] = u"i"
+ for char in u"ईी":
+ self.trans[char] = u"ii"
+ for char in u"उु":
+ self.trans[char] = u"u"
+ for char in u"ऊू":
+ self.trans[char] = u"uu"
+ for char in u"एे":
+ self.trans[char] = u"e"
+ for char in u"ऐै":
+ self.trans[char] = u"ai"
+ for char in u"ओो":
+ self.trans[char] = u"o"
+ for char in u"औौ":
+ self.trans[char] = u"au"
+ for char in u"ऋृर":
+ self.trans[char] = u"r"
+ for char in u"ॠॄ":
+ self.trans[char] = u"rr"
+ for char in u"ऌॢल":
+ self.trans[char] = u"l"
+ for char in u"ॡॣ":
+ self.trans[char] = u"ll"
+ for char in u"क":
+ self.trans[char] = u"k"
+ for char in u"ख":
+ self.trans[char] = u"kh"
+ for char in u"ग":
+ self.trans[char] = u"g"
+ for char in u"घ":
+ self.trans[char] = u"gh"
+ for char in u"ङ":
+ self.trans[char] = u"ng"
+ for char in u"च":
+ self.trans[char] = u"c"
+ for char in u"छ":
+ self.trans[char] = u"ch"
+ for char in u"ज":
+ self.trans[char] = u"j"
+ for char in u"झ":
+ self.trans[char] = u"jh"
+ for char in u"ञ":
+ self.trans[char] = u"ñ"
+ for char in u"टत":
+ self.trans[char] = u"t"
+ for char in u"ठथ":
+ self.trans[char] = u"th"
+ for char in u"डद":
+ self.trans[char] = u"d"
+ for char in u"ढध":
+ self.trans[char] = u"dh"
+ for char in u"णन":
+ self.trans[char] = u"n"
+ for char in u"फ":
+ self.trans[char] = u"ph"
+ for char in u"ब":
+ self.trans[char] = u"b"
+ for char in u"भ":
+ self.trans[char] = u"bh"
+ for char in u"म":
+ self.trans[char] = u"m"
+ for char in u"य":
+ self.trans[char] = u"y"
+ for char in u"व":
+ self.trans[char] = u"v"
+ for char in u"श":
+ self.trans[char] = u"sh"
+ for char in u"षस":
+ self.trans[char] = u"s"
+ for char in u"ह":
+ self.trans[char] = u"h"
+ for char in u"क":
+ self.trans[char] = u"x"
+ for char in u"त":
+ self.trans[char] = u"tr"
+ for char in u"ज":
+ self.trans[char] = u"gj"
+ for char in u"क़":
+ self.trans[char] = u"q"
+ for char in u"फ":
+ self.trans[char] = u"f"
+ for char in u"ख":
+ self.trans[char] = u"hh"
+ for char in u"H":
+ self.trans[char] = u"gh"
+ for char in u"ज":
+ self.trans[char] = u"z"
+ for char in u"डढ":
+ self.trans[char] = u"r"
+ # Devanagari ligatures (possibly incomplete and/or incorrect)
+ for char in u"ख्":
+ self.trans[char] = u"khn"
+ for char in u"त":
+ self.trans[char] = u"tn"
+ for char in u"द्":
+ self.trans[char] = u"dn"
+ for char in u"श":
+ self.trans[char] = u"cn"
+ for char in u"ह्":
+ self.trans[char] = u"fn"
+ for char in u"अँ":
+ self.trans[char] = u"m"
+ for char in u"॒॑":
+ self.trans[char] = u""
+ for char in u"०":
+ self.trans[char] = u"0"
+ for char in u"१":
+ self.trans[char] = u"1"
+ for char in u"२":
+ self.trans[char] = u"2"
+ for char in u"३":
+ self.trans[char] = u"3"
+ for char in u"४":
+ self.trans[char] = u"4"
+ for char in u"५":
+ self.trans[char] = u"5"
+ for char in u"६":
+ self.trans[char] = u"6"
+ for char in u"७":
+ self.trans[char] = u"7"
+ for char in u"८":
+ self.trans[char] = u"8"
+ for char in u"९":
+ self.trans[char] = u"9"
- # Tamil
- if char == u"க்":
- return u"k"
- if char in u"ஙண்ந்ன்":
- return u"n"
- if char == u"ச":
- return u"c"
- if char == u"ஞ்":
- return u"ñ"
- if char == u"ட்":
- return u"th"
- if char == u"த":
- return u"t"
- if char == u"ப":
- return u"p"
- if char == u"ம்":
- return u"m"
- if char == u"ய்":
- return u"y"
- if char in u"ர்ழ்ற":
- return u"r"
- if char in u"ல்ள":
- return u"l"
- if char == u"வ்":
- return u"v"
- if char == u"ஜ":
- return u"j"
- if char == u"ஷ":
- return u"sh"
- if char == u"ஸ":
- return u"s"
- if char == u"ஹ":
- return u"h"
- if char == u"க்ஷ":
- return u"x"
- if char == u"அ":
- return u"a"
- if char == u"ஆ":
- return u"aa"
- if char == u"இ":
- return u"i"
- if char == u"ஈ":
- return u"ii"
- if char == u"உ":
- return u"u"
- if char == u"ஊ":
- return u"uu"
- if char == u"எ":
- return u"e"
- if char == u"ஏ":
- return u"ee"
- if char == u"ஐ":
- return u"ai"
- if char == u"ஒ":
- return u"o"
- if char == u"ஓ":
- return u"oo"
- if char == u"ஔ":
- return u"au"
- if char == u"ஃ":
- return ""
+ # Armenian
+ for char in u"Ա":
+ self.trans[char] = u"A"
+ for char in u"ա":
+ self.trans[char] = u"a"
+ for char in u"Բ":
+ self.trans[char] = u"B"
+ for char in u"բ":
+ self.trans[char] = u"b"
+ for char in u"Գ":
+ self.trans[char] = u"G"
+ for char in u"գ":
+ self.trans[char] = u"g"
+ for char in u"Դ":
+ self.trans[char] = u"D"
+ for char in u"դ":
+ self.trans[char] = u"d"
+ for char in u"Ե":
+ self.trans[char] = u"Je"
+ for char in u"ե":
+ self.trans[char] = u"e"
+ for char in u"Զ":
+ self.trans[char] = u"Z"
+ for char in u"զ":
+ self.trans[char] = u"z"
+ for char in u"Է":
+ self.trans[char] = u"É"
+ for char in u"է":
+ self.trans[char] = u"é"
+ for char in u"Ը":
+ self.trans[char] = u"Ë"
+ for char in u"ը":
+ self.trans[char] = u"ë"
+ for char in u"Թ":
+ self.trans[char] = u"Th"
+ for char in u"թ":
+ self.trans[char] = u"th"
+ for char in u"Ժ":
+ self.trans[char] = u"Zh"
+ for char in u"ժ":
+ self.trans[char] = u"zh"
+ for char in u"Ի":
+ self.trans[char] = u"I"
+ for char in u"ի":
+ self.trans[char] = u"i"
+ for char in u"Լ":
+ self.trans[char] = u"L"
+ for char in u"լ":
+ self.trans[char] = u"l"
+ for char in u"Խ":
+ self.trans[char] = u"Ch"
+ for char in u"խ":
+ self.trans[char] = u"ch"
+ for char in u"Ծ":
+ self.trans[char] = u"Ts"
+ for char in u"ծ":
+ self.trans[char] = u"ts"
+ for char in u"Կ":
+ self.trans[char] = u"K"
+ for char in u"կ":
+ self.trans[char] = u"k"
+ for char in u"Հ":
+ self.trans[char] = u"H"
+ for char in u"հ":
+ self.trans[char] = u"h"
+ for char in u"Ձ":
+ self.trans[char] = u"Dz"
+ for char in u"ձ":
+ self.trans[char] = u"dz"
+ for char in u"Ղ":
+ self.trans[char] = u"R"
+ for char in u"ղ":
+ self.trans[char] = u"r"
+ for char in u"Ճ":
+ self.trans[char] = u"Cz"
+ for char in u"ճ":
+ self.trans[char] = u"cz"
+ for char in u"Մ":
+ self.trans[char] = u"M"
+ for char in u"մ":
+ self.trans[char] = u"m"
+ for char in u"Յ":
+ self.trans[char] = u"J"
+ for char in u"յ":
+ self.trans[char] = u"j"
+ for char in u"Ն":
+ self.trans[char] = u"N"
+ for char in u"ն":
+ self.trans[char] = u"n"
+ for char in u"Շ":
+ self.trans[char] = u"S"
+ for char in u"շ":
+ self.trans[char] = u"s"
+ for char in u"Շ":
+ self.trans[char] = u"Vo"
+ for char in u"շ":
+ self.trans[char] = u"o"
+ for char in u"Չ":
+ self.trans[char] = u"Tsh"
+ for char in u"չ":
+ self.trans[char] = u"tsh"
+ for char in u"Պ":
+ self.trans[char] = u"P"
+ for char in u"պ":
+ self.trans[char] = u"p"
+ for char in u"Ջ":
+ self.trans[char] = u"Dz"
+ for char in u"ջ":
+ self.trans[char] = u"dz"
+ for char in u"Ռ":
+ self.trans[char] = u"R"
+ for char in u"ռ":
+ self.trans[char] = u"r"
+ for char in u"Ս":
+ self.trans[char] = u"S"
+ for char in u"ս":
+ self.trans[char] = u"s"
+ for char in u"Վ":
+ self.trans[char] = u"V"
+ for char in u"վ":
+ self.trans[char] = u"v"
+ for char in u"Տ":
+ self.trans[char] = u"T'"
+ for char in u"տ":
+ self.trans[char] = u"t'"
+ for char in u"Ր":
+ self.trans[char] = u"R"
+ for char in u"ր":
+ self.trans[char] = u"r"
+ for char in u"Ց":
+ self.trans[char] = u"Tsh"
+ for char in u"ց":
+ self.trans[char] = u"tsh"
+ for char in u"Ւ":
+ self.trans[char] = u"V"
+ for char in u"ւ":
+ self.trans[char] = u"v"
+ for char in u"Փ":
+ self.trans[char] = u"Ph"
+ for char in u"փ":
+ self.trans[char] = u"ph"
+ for char in u"Ք":
+ self.trans[char] = u"Kh"
+ for char in u"ք":
+ self.trans[char] = u"kh"
+ for char in u"Օ":
+ self.trans[char] = u"O"
+ for char in u"օ":
+ self.trans[char] = u"o"
+ for char in u"Ֆ":
+ self.trans[char] = u"F"
+ for char in u"ֆ":
+ self.trans[char] = u"f"
+ for char in u"և":
+ self.trans[char] = u"&"
+ for char in u"՟":
+ self.trans[char] = u"."
+ for char in u"՞":
+ self.trans[char] = u"?"
+ for char in u"՝":
+ self.trans[char] = u";"
+ for char in u"՛":
+ self.trans[char] = u""
- # Bengali
- if char == u"অ":
- return u"ô"
- if char in u"আা":
- return u"a"
- if char in u"ইিঈী":
- return u"i"
- if char in u"উুঊূ":
- return u"u"
- if char in u"ঋৃ":
- return u"ri"
- if char in u"এেয়":
- return u"e"
- if char in u"ঐৈ":
- return u"oi"
- if char in u"ওো":
- return u"o"
- if char in u"ঔৌ":
- return "ou"
- if char == u"্":
- return u""
- if char == u"ৎ":
- return u"t"
- if char == u"ং":
- return u"n"
- if char == u"ঃ":
- return u"h"
- if char == u"ঁ":
- return u"ñ"
- if char == u"ক":
- return u"k"
- if char == u"খ":
- return u"kh"
- if char == u"গ":
- return u"g"
- if char == u"ঘ":
- return u"gh"
- if char == u"ঙ":
- return u"ng"
- if char == u"চ":
- return u"ch"
- if char == u"ছ":
- return u"chh"
- if char in u"জ":
- return u"j"
- if char == u"ঝ":
- return u"jh"
- if char == u"ঞ":
- return u"n"
- if char in u"টত":
- return u"t"
- if char in u"ঠথ":
- return u"th"
- if char in u"ডদ":
- return u"d"
- if char in u"ঢধ":
- return u"dh"
- if char in u"ণন":
- return u"n"
- if char == u"প":
- return u"p"
- if char == u"ফ":
- return u"ph"
- if char == u"ব":
- return u"b"
- if char == u"ভ":
- return u"bh"
- if char == u"ম":
- return u"m"
- if char == u"য":
- return u"dzh"
- if char == u"র":
- return u"r"
- if char == u"ল":
- return u"l"
- if char == u"শ":
- return u"s"
- if char == u"হ":
- return u"h"
- if char == u"য়":
- return u"-"
- if char == u"ড়":
- return u"r"
- if char == u"ঢ":
- return u"rh"
- if char == u"০":
- return u"0"
- if char == u"১":
- return u"1"
- if char == u"২":
- return u"2"
- if char == u"৩":
- return u"3"
- if char == u"৪":
- return u"4"
- if char == u"৫":
- return u"5"
- if char == u"৬":
- return u"6"
- if char == u"৭":
- return u"7"
- if char == u"৮":
- return u"8"
- if char == u"৯":
- return u"9"
-
- # Thai (because of complications of the alphabet, transliterations
- # are very imprecise here)
- if char == u"ก":
- return u"k"
- if char in u"ขฃคฅฆ":
- return u"kh"
- if char == u"ง":
- return u"ng"
- if char in u"จฉชฌ":
- return u"ch"
- if char in u"ซศษส":
- return u"s"
- if char in u"ญย":
- return u"y"
- if char in u"ฎด":
- return u"d"
- if char in u"ฏต":
- return u"t"
- if char in u"ฐฑฒถทธ":
- return u"th"
- if char in u"ณน":
- return u"n"
- if char == u"บ":
- return u"b"
- if char == u"ป":
- return u"p"
- if char in u"ผพภ":
- return u"ph"
- if char in u"ฝฟ":
- return u"f"
- if char in u"ม":
- return u"m"
- if char == u"ร":
- return u"r"
- if char == u"ฤ":
- return u"rue"
- if char in u"ๅ":
- return u":"
- if char in u"ลฬ":
- return u"l"
- if char == u"ฦ":
- return u"lue"
- if char == u"ว":
- return u"w"
- if char in u"หฮ":
- return u"h"
- if char == u"อ":
- return u""
- if char == u"ร":
- return u"ü"
- if char == u"ว":
- return u"ua"
- if char in u"อว–โิ":
- return u"o"
- if char in u"ะัา":
- return u"a"
- if char in u"ว":
- return u"u"
- if char == u"ำ":
- return u"am"
- if char == u"ิ":
- return u"i"
- if char == u"ี":
- return u"i:"
- if char == u"ึ":
- return u"ue"
- if char == u"ื":
- return u"ue:"
- if char == u"ุ":
- return u"u"
- if char == u"ู":
- return u"u:"
- if char in u"เ็":
- return u"e"
- if char == u"แ":
- return u"ae"
- if char in u"ใไ":
- return u"ai"
- if char in u"่้๊๋็์":
- return u""
- if char in u"ฯ":
- return u"."
- if char in u"ๆ":
- return u"(2)"
-
- return default
+ # Tamil
+ for char in u"க்":
+ self.trans[char] = u"k"
+ for char in u"ஙண்ந்ன்":
+ self.trans[char] = u"n"
+ for char in u"ச":
+ self.trans[char] = u"c"
+ for char in u"ஞ்":
+ self.trans[char] = u"ñ"
+ for char in u"ட்":
+ self.trans[char] = u"th"
+ for char in u"த":
+ self.trans[char] = u"t"
+ for char in u"ப":
+ self.trans[char] = u"p"
+ for char in u"ம்":
+ self.trans[char] = u"m"
+ for char in u"ய்":
+ self.trans[char] = u"y"
+ for char in u"ர்ழ்ற":
+ self.trans[char] = u"r"
+ for char in u"ல்ள":
+ self.trans[char] = u"l"
+ for char in u"வ்":
+ self.trans[char] = u"v"
+ for char in u"ஜ":
+ self.trans[char] = u"j"
+ for char in u"ஷ":
+ self.trans[char] = u"sh"
+ for char in u"ஸ":
+ self.trans[char] = u"s"
+ for char in u"ஹ":
+ self.trans[char] = u"h"
+ for char in u"க்ஷ":
+ self.trans[char] = u"x"
+ for char in u"அ":
+ self.trans[char] = u"a"
+ for char in u"ஆ":
+ self.trans[char] = u"aa"
+ for char in u"இ":
+ self.trans[char] = u"i"
+ for char in u"ஈ":
+ self.trans[char] = u"ii"
+ for char in u"உ":
+ self.trans[char] = u"u"
+ for char in u"ஊ":
+ self.trans[char] = u"uu"
+ for char in u"எ":
+ self.trans[char] = u"e"
+ for char in u"ஏ":
+ self.trans[char] = u"ee"
+ for char in u"ஐ":
+ self.trans[char] = u"ai"
+ for char in u"ஒ":
+ self.trans[char] = u"o"
+ for char in u"ஓ":
+ self.trans[char] = u"oo"
+ for char in u"ஔ":
+ self.trans[char] = u"au"
+ for char in u"ஃ":
+ self.trans[char] = ""
+
+ # Bengali
+ for char in u"অ":
+ self.trans[char] = u"ô"
+ for char in u"আা":
+ self.trans[char] = u"a"
+ for char in u"ইিঈী":
+ self.trans[char] = u"i"
+ for char in u"উুঊূ":
+ self.trans[char] = u"u"
+ for char in u"ঋৃ":
+ self.trans[char] = u"ri"
+ for char in u"এেয়":
+ self.trans[char] = u"e"
+ for char in u"ঐৈ":
+ self.trans[char] = u"oi"
+ for char in u"ওো":
+ self.trans[char] = u"o"
+ for char in u"ঔৌ":
+ self.trans[char] = "ou"
+ for char in u"্":
+ self.trans[char] = u""
+ for char in u"ৎ":
+ self.trans[char] = u"t"
+ for char in u"ং":
+ self.trans[char] = u"n"
+ for char in u"ঃ":
+ self.trans[char] = u"h"
+ for char in u"ঁ":
+ self.trans[char] = u"ñ"
+ for char in u"ক":
+ self.trans[char] = u"k"
+ for char in u"খ":
+ self.trans[char] = u"kh"
+ for char in u"গ":
+ self.trans[char] = u"g"
+ for char in u"ঘ":
+ self.trans[char] = u"gh"
+ for char in u"ঙ":
+ self.trans[char] = u"ng"
+ for char in u"চ":
+ self.trans[char] = u"ch"
+ for char in u"ছ":
+ self.trans[char] = u"chh"
+ for char in u"জ":
+ self.trans[char] = u"j"
+ for char in u"ঝ":
+ self.trans[char] = u"jh"
+ for char in u"ঞ":
+ self.trans[char] = u"n"
+ for char in u"টত":
+ self.trans[char] = u"t"
+ for char in u"ঠথ":
+ self.trans[char] = u"th"
+ for char in u"ডদ":
+ self.trans[char] = u"d"
+ for char in u"ঢধ":
+ self.trans[char] = u"dh"
+ for char in u"ণন":
+ self.trans[char] = u"n"
+ for char in u"প":
+ self.trans[char] = u"p"
+ for char in u"ফ":
+ self.trans[char] = u"ph"
+ for char in u"ব":
+ self.trans[char] = u"b"
+ for char in u"ভ":
+ self.trans[char] = u"bh"
+ for char in u"ম":
+ self.trans[char] = u"m"
+ for char in u"য":
+ self.trans[char] = u"dzh"
+ for char in u"র":
+ self.trans[char] = u"r"
+ for char in u"ল":
+ self.trans[char] = u"l"
+ for char in u"শ":
+ self.trans[char] = u"s"
+ for char in u"হ":
+ self.trans[char] = u"h"
+ for char in u"য়":
+ self.trans[char] = u"-"
+ for char in u"ড়":
+ self.trans[char] = u"r"
+ for char in u"ঢ":
+ self.trans[char] = u"rh"
+ for char in u"০":
+ self.trans[char] = u"0"
+ for char in u"১":
+ self.trans[char] = u"1"
+ for char in u"২":
+ self.trans[char] = u"2"
+ for char in u"৩":
+ self.trans[char] = u"3"
+ for char in u"৪":
+ self.trans[char] = u"4"
+ for char in u"৫":
+ self.trans[char] = u"5"
+ for char in u"৬":
+ self.trans[char] = u"6"
+ for char in u"৭":
+ self.trans[char] = u"7"
+ for char in u"৮":
+ self.trans[char] = u"8"
+ for char in u"৯":
+ self.trans[char] = u"9"
+
+ # Thai (because of complications of the alphabet, transliterations
+ # are very imprecise here)
+ for char in u"ก":
+ self.trans[char] = u"k"
+ for char in u"ขฃคฅฆ":
+ self.trans[char] = u"kh"
+ for char in u"ง":
+ self.trans[char] = u"ng"
+ for char in u"จฉชฌ":
+ self.trans[char] = u"ch"
+ for char in u"ซศษส":
+ self.trans[char] = u"s"
+ for char in u"ญย":
+ self.trans[char] = u"y"
+ for char in u"ฎด":
+ self.trans[char] = u"d"
+ for char in u"ฏต":
+ self.trans[char] = u"t"
+ for char in u"ฐฑฒถทธ":
+ self.trans[char] = u"th"
+ for char in u"ณน":
+ self.trans[char] = u"n"
+ for char in u"บ":
+ self.trans[char] = u"b"
+ for char in u"ป":
+ self.trans[char] = u"p"
+ for char in u"ผพภ":
+ self.trans[char] = u"ph"
+ for char in u"ฝฟ":
+ self.trans[char] = u"f"
+ for char in u"ม":
+ self.trans[char] = u"m"
+ for char in u"ร":
+ self.trans[char] = u"r"
+ for char in u"ฤ":
+ self.trans[char] = u"rue"
+ for char in u"ๅ":
+ self.trans[char] = u":"
+ for char in u"ลฬ":
+ self.trans[char] = u"l"
+ for char in u"ฦ":
+ self.trans[char] = u"lue"
+ for char in u"ว":
+ self.trans[char] = u"w"
+ for char in u"หฮ":
+ self.trans[char] = u"h"
+ for char in u"อ":
+ self.trans[char] = u""
+ for char in u"ร":
+ self.trans[char] = u"ü"
+ for char in u"ว":
+ self.trans[char] = u"ua"
+ for char in u"อว–โิ":
+ self.trans[char] = u"o"
+ for char in u"ะัา":
+ self.trans[char] = u"a"
+ for char in u"ว":
+ self.trans[char] = u"u"
+ for char in u"ำ":
+ self.trans[char] = u"am"
+ for char in u"ิ":
+ self.trans[char] = u"i"
+ for char in u"ี":
+ self.trans[char] = u"i:"
+ for char in u"ึ":
+ self.trans[char] = u"ue"
+ for char in u"ื":
+ self.trans[char] = u"ue:"
+ for char in u"ุ":
+ self.trans[char] = u"u"
+ for char in u"ู":
+ self.trans[char] = u"u:"
+ for char in u"เ็":
+ self.trans[char] = u"e"
+ for char in u"แ":
+ self.trans[char] = u"ae"
+ for char in u"ใไ":
+ self.trans[char] = u"ai"
+ for char in u"่้๊๋็์":
+ self.trans[char] = u""
+ for char in u"ฯ":
+ self.trans[char] = u"."
+ for char in u"ๆ":
+ self.trans[char] = u"(2)"
+
+
+ def transliterate(self, char, default="?", prev="-", next="-"):
+     """Return the transliteration of a single character.
+
+     char    -- the character to transliterate
+     default -- returned when no transliteration is known for char
+     prev    -- value supplied by the caller for what precedes char; it is
+                returned unchanged for characters that repeat what came
+                before them
+     next    -- the character following char; consulted for characters
+                that double the sound that follows them
+     """
+     # Plain table lookup; self.trans maps a character to its replacement.
+     if char in self.trans:
+         return self.trans[char]
+     #Arabic
+     if char == u"◌":
+         return prev
+     #Japanese
+     if char == u"ッ":
+         # The small tsu doubles the first letter of the following sound.
+         # Some table entries are the empty string, so the following
+         # character may yield nothing to double; indexing [0] on that
+         # would raise IndexError, so fall back to default instead.
+         following = self.transliterate(next)
+         if following:
+             return following[0]
+         return default
+     # Iteration marks stand for a repetition of what preceded them.
+     if char in u"々仝ヽヾゝゞ〱〲〳〵〴〵":
+         return prev
+     return default
+
More information about the Pywikipedia-l
mailing list