Revision: 6275 Author: a_engels Date: 2009-01-21 20:21:19 +0000 (Wed, 21 Jan 2009)
Log Message: ----------- speeding up transliteration by using a dictionary instead of a sequence of elifs
Modified Paths: -------------- trunk/pywikipedia/userinterfaces/terminal_interface.py trunk/pywikipedia/userinterfaces/transliteration.py
Modified: trunk/pywikipedia/userinterfaces/terminal_interface.py =================================================================== --- trunk/pywikipedia/userinterfaces/terminal_interface.py 2009-01-21 19:19:12 UTC (rev 6274) +++ trunk/pywikipedia/userinterfaces/terminal_interface.py 2009-01-21 20:21:19 UTC (rev 6275) @@ -16,6 +16,8 @@ except ImportError: ctypes_found = False
+transliterator = transliteration.transliterator() + def getDefaultTextColorInWindows(): """ This method determines the default text color and saves its color @@ -195,9 +197,9 @@ # original question marks. if codecedText[i] == '?' and text[i] != u'?': try: - transliterated = transliteration.trans(text[i], default = '?', prev = prev, next = text[i+1]) + transliterated = transliterator.transliterate(text[i], default = '?', prev = prev, next = text[i+1]) except IndexError: - transliterated = transliteration.trans(text[i], default = '?', prev = prev, next = ' ') + transliterated = transliterator.transliterate(text[i], default = '?', prev = prev, next = ' ') # transliteration was successful. The replacement # could consist of multiple letters. # mark the transliterated letters in yellow.
Modified: trunk/pywikipedia/userinterfaces/transliteration.py =================================================================== --- trunk/pywikipedia/userinterfaces/transliteration.py 2009-01-21 19:19:12 UTC (rev 6274) +++ trunk/pywikipedia/userinterfaces/transliteration.py 2009-01-21 20:21:19 UTC (rev 6275) @@ -1,1902 +1,1357 @@ -# -*- coding: utf-8 -*- -def trans(char, default = '?', prev = '-', next = '-'): - # Give a transliteration for char, or default if none is known - # Accented etc. Latin characters - if char in u"ÀÁÂẦẤẪẨẬÃĀĂẰẮẴẶẲȦǠẠḀȂĄǍẢ": - return u"A" - if char in u"ȀǞ": - return u"Ä" - if char == u"Ǻ": - return u"Å" - if char == u"Ä": - return u"Ae" - if char == u"Å": - return u"Aa" - if char in u"àáâầấẫẩậãāăằắẵặẳȧǡạḁȃąǎảẚ": - return u"a" - if char in u"ȁǟ": - return u"ä" - if char == u"ǻ": - return u"å" - if char == u"ä": - return u"ae" - if char == u"å": - return u"aa" - if char in u"ḂḄḆƁƂ": - return u"B" - if char in u"ḃḅḇƀɓƃ": - return u"b" - if char in u"ĆĈĊÇČƇ": - return u"C" - if char in u"ćĉċçčƈȼ": - return u"c" - if char == u"Ḉ": - return u"Ç" - if char == u"ḉ": - return u"ç" - if char == u"Ð": - return u"Dh" - if char == u"ð": - return u"dh" - if char in u"ĎḊḌḎḐḒĐƉƊƋ": - return u"D" - if char in u"ďḋḍḏḑḓđɖɗƌ": - return u"d" - if char in u"ÈȄÉÊḚËĒḔḖĔĖẸE̩ȆȨḜĘĚẼḘẺ": - return u"E" - if char in u"ỀẾỄỆỂ": - return u"Ê" - if char in u"èȅéêḛëēḕḗĕėẹe̩ȇȩḝęěẽḙẻ": - return u"e" - if char in u"ềếễệể": - return u"ê" - if char in u"ḞƑ": - return u"F" - if char in u"ḟƒ": - return u"f" - if char in u"ǴḠĞĠĢǦǤƓ": - return u"G" - if char in u"ǵḡğġģǧǥɠ": - return u"g" - if char == u"Ĝ": - return u"Gx" - if char == u"ĝ": - return u"gx" - if char in u"ḢḤḦȞḨḪH̱ĦǶ": - return u"H" - if char in u"ḣḥḧȟḩḫ̱ẖħƕ": - return u"h" - if char in u"IÌȈÍÎĨḬÏḮĪĬȊĮǏİỊỈƗ": - return u"I" - if char in u"ıìȉíîĩḭïḯīĭȋįǐiịỉɨ": - return u"i" - if char in u"ĴJ": - return u"J" - if char in u"ɟĵ̌ǰ": - return u"j" - if char in u"ḰǨĶḲḴƘ": - return u"K" - if char in u"ḱǩķḳḵƙ": - return u"k" - 
if char in u"ĹĻĽḶḸḺḼȽŁ": - return u"L" - if char in u"ĺļľḷḹḻḽƚłɫ": - return u"l" - if char in u"ḾṀṂ": - return u"M" - if char in u"ḿṁṃɱ": - return u"m" - if char in u"ǸŃÑŅŇṄṆṈṊŊƝɲȠ": - return u"N" - if char in u"ǹńñņňṅṇṉṋŋɲƞ": - return u"n" - if char in u"ÒÓÔÕṌṎȬÖŌṐṒŎǑȮȰỌǪǬƠỜỚỠỢỞỎƟØǾ": - return u"O" - if char in u"òóôõṍṏȭöōṑṓŏǒȯȱọǫǭơờớỡợởỏɵøǿ": - return u"o" - if char in u"ȌŐȪ": - return u"Ö" - if char in u"ȍőȫ": - return u"ö" - if char in u"ỒỐỖỘỔȎ": - return u"Ô" - if char in u"ồốỗộổȏ": - return u"ô" - if char in u"ṔṖƤ": - return u"P" - if char in u"ṕṗƥ": - return u"p" - if char == u"ᵽ": - return u"q" - if char in u"ȐŔŖŘȒṘṚṜṞ": - return u"R" - if char in u"ȑŕŗřȓṙṛṝṟɽ": - return u"r" - if char in u"ŚṤŞȘŠṦṠṢṨ": - return u"S" - if char in u"śṥşșšṧṡṣṩȿ": - return u"s" - if char == u"Ŝ": - return u"Sx" - if char == u"ŝ": - return u"sx" - if char in u"ŢȚŤṪṬṮṰŦƬƮ": - return u"T" - if char in u"ţțťṫṭṯṱŧȾƭʈ": - return u"t" - if char in u"ÙÚŨṸṴÜṲŪṺŬỤŮŲǓṶỦƯỮỰỬ": - return u"U" - if char in u"ùúũṹṵüṳūṻŭụůųǔṷủưữựửʉ": - return u"u" - if char in u"ȔŰǛǗǕǙ": - return u"Ü" - if char in u"ȕűǜǘǖǚ": - return u"ü" - if char == u"Û": - return u"Ux" - if char == u"û": - return u"ux" - if char == u"Ȗ": - return u"Û" - if char == u"ȗ": - return u"û" - if char == u"Ừ": - return u"Ù" - if char == u"ừ": - return u"ù" - if char == u"Ứ": - return u"Ú" - if char == u"ứ": - return u"ú" - if char in u"ṼṾ": - return u"V" - if char in u"ṽṿ": - return u"v" - if char in u"ẀẂŴẄẆẈ": - return u"W" - if char in u"ẁẃŵẅẇẉ": - return u"w" - if char in u"ẊẌ": - return u"X" - if char in u"ẋẍ": - return u"x" - if char in u"ỲÝŶŸỸȲẎỴỶƳ": - return u"Y" - if char in u"ỳýŷÿỹȳẏỵỷƴ": - return u"y" - if char in u"ŹẐŻẒŽẔƵȤ": - return u"Z" - if char in u"źẑżẓžẕƶȥ": - return u"z" - if char == u"ɀ": - return u"zv" - - # Latin: extended Latin alphabet - if char == u"ɑ": - return u"a" - if char in u"ÆǼǢ": - return u"AE" - if char in u"æǽǣ": - return u"ae" - if char == u"Ð": - return u"Dh" - if char == u"ð": - return u"dh" - 
if char in u"ƎƏƐ": - return u"E" - if char in u"ǝəɛ": - return u"e" - if char in u"ƔƢ": - return u"G" - if char in u"ᵷɣƣᵹ": - return u"g" - if char == u"Ƅ": - return u"H" - if char == u"ƅ": - return u"h" - if char == u"Ƕ": - return u"Wh" - if char == u"ƕ": - return u"wh" - if char == u"Ɩ": - return u"I" - if char == u"ɩ": - return u"i" - if char == u"Ŋ": - return u"Ng" - if char == u"ŋ": - return u"ng" - if char == u"Œ": - return u"OE" - if char == u"œ": - return u"oe" - if char == u"Ɔ": - return u"O" - if char == u"ɔ": - return u"o" - if char == u"Ȣ": - return u"Ou" - if char == u"ȣ": - return u"ou" - if char == u"Ƽ": - return u"Q" - if char in u"ĸƽ": - return u"q" - if char == u"ȹ": - return u"qp" - if char == u"": - return u"r" - if char == u"ſ": - return u"s" - if char == u"ß": - return u"ss" - if char == u"Ʃ": - return u"Sh" - if char == u"ʃᶋ": - return u"sh" - if char == u"Ʉ": - return u"U" - if char == u"ʉ": - return u"u" - if char == u"Ʌ": - return u"V" - if char == u"ʌ": - return u"v" - if char in u"ƜǷ": - return u"W" - if char in u"ɯƿ": - return u"w" - if char == u"Ȝ": - return u"Y" - if char == u"ȝ": - return u"y" - if char == u"IJ": - return u"IJ" - if char == u"ij": - return u"ij" - if char == u"Ƨ": - return u"Z" - if char in u"ʮƨ": - return u"z" - if char == u"Ʒ": - return u"Zh" - if char == u"ʒ": - return u"zh" - if char == u"Ǯ": - return u"Dzh" - if char == u"ǯ": - return u"dzh" - if char in u"ƸƹʔˀɁɂ": - return u"'" - if char in u"Þ": - return u"Th" - if char in u"þ": - return u"th" - if char in u"Cʗǃ": - return u"!" 
+# -*- coding: utf-8 -*- +class transliterator(object): + def __init__(self): + self.trans = {} + for char in u"ÀÁÂẦẤẪẨẬÃĀĂẰẮẴẶẲȦǠẠḀȂĄǍẢ": + self.trans[char] = u"A" + for char in u"ȀǞ": + self.trans[char] = u"Ä" + self.trans[u"Ǻ"] = u"Å" + self.trans[u"Ä"] = u"Ae" + self.trans[u"Å"] = u"Aa" + for char in u"àáâầấẫẩậãāăằắẵặẳȧǡạḁȃąǎảẚ": + self.trans[char] = u"a" + for char in u"ȁǟ": + self.trans[char] = u"ä" + self.trans[u"ǻ"] = u"å" + self.trans[u"ä"] = u"ae" + self.trans[u"å"] = u"aa" + for char in u"ḂḄḆƁƂ": + self.trans[char] = u"B" + for char in u"ḃḅḇƀɓƃ": + self.trans[char] = u"b" + for char in u"ĆĈĊÇČƇ": + self.trans[char] = u"C" + for char in u"ćĉċçčƈȼ": + self.trans[char] = u"c" + self.trans[u"Ḉ"] = u"Ç" + self.trans[u"ḉ"] = u"ç" + self.trans[u"Ð"] = u"Dh" + self.trans[u"ð"] = u"dh" + for char in u"ĎḊḌḎḐḒĐƉƊƋ": + self.trans[char] = u"D" + for char in u"ďḋḍḏḑḓđɖɗƌ": + self.trans[char] = u"d" + for char in u"ÈȄÉÊḚËĒḔḖĔĖẸE̩ȆȨḜĘĚẼḘẺ": + self.trans[char] = u"E" + for char in u"ỀẾỄỆỂ": + self.trans[char] = u"Ê" + for char in u"èȅéêḛëēḕḗĕėẹe̩ȇȩḝęěẽḙẻ": + self.trans[char] = u"e" + for char in u"ềếễệể": + self.trans[char] = u"ê" + for char in u"ḞƑ": + self.trans[char] = u"F" + for char in u"ḟƒ": + self.trans[char] = u"f" + for char in u"ǴḠĞĠĢǦǤƓ": + self.trans[char] = u"G" + for char in u"ǵḡğġģǧǥɠ": + self.trans[char] = u"g" + self.trans[u"Ĝ"] = u"Gx" + self.trans[u"ĝ"] = u"gx" + for char in u"ḢḤḦȞḨḪH̱ĦǶ": + self.trans[char] = u"H" + for char in u"ḣḥḧȟḩḫ̱ẖħƕ": + self.trans[char] = u"h" + for char in u"IÌȈÍÎĨḬÏḮĪĬȊĮǏİỊỈƗ": + self.trans[char] = u"I" + for char in u"ıìȉíîĩḭïḯīĭȋįǐiịỉɨ": + self.trans[char] = u"i" + for char in u"ĴJ": + self.trans[char] = u"J" + for char in u"ɟĵ̌ǰ": + self.trans[char] = u"j" + for char in u"ḰǨĶḲḴƘ": + self.trans[char] = u"K" + for char in u"ḱǩķḳḵƙ": + self.trans[char] = u"k" + for char in u"ĹĻĽḶḸḺḼȽŁ": + self.trans[char] = u"L" + for char in u"ĺļľḷḹḻḽƚłɫ": + self.trans[char] = u"l" + for char in u"ḾṀṂ": + self.trans[char] = u"M" + for char 
in u"ḿṁṃɱ": + self.trans[char] = u"m" + for char in u"ǸŃÑŅŇṄṆṈṊŊƝɲȠ": + self.trans[char] = u"N" + for char in u"ǹńñņňṅṇṉṋŋɲƞ": + self.trans[char] = u"n" + for char in u"ÒÓÔÕṌṎȬÖŌṐṒŎǑȮȰỌǪǬƠỜỚỠỢỞỎƟØǾ": + self.trans[char] = u"O" + for char in u"òóôõṍṏȭöōṑṓŏǒȯȱọǫǭơờớỡợởỏɵøǿ": + self.trans[char] = u"o" + for char in u"ȌŐȪ": + self.trans[char] = u"Ö" + for char in u"ȍőȫ": + self.trans[char] = u"ö" + for char in u"ỒỐỖỘỔȎ": + self.trans[char] = u"Ô" + for char in u"ồốỗộổȏ": + self.trans[char] = u"ô" + for char in u"ṔṖƤ": + self.trans[char] = u"P" + for char in u"ṕṗƥ": + self.trans[char] = u"p" + self.trans[u"ᵽ"] = u"q" + for char in u"ȐŔŖŘȒṘṚṜṞ": + self.trans[char] = u"R" + for char in u"ȑŕŗřȓṙṛṝṟɽ": + self.trans[char] = u"r" + for char in u"ŚṤŞȘŠṦṠṢṨ": + self.trans[char] = u"S" + for char in u"śṥşșšṧṡṣṩȿ": + self.trans[char] = u"s" + self.trans[u"Ŝ"] = u"Sx" + self.trans[u"ŝ"] = u"sx" + for char in u"ŢȚŤṪṬṮṰŦƬƮ": + self.trans[char] = u"T" + for char in u"ţțťṫṭṯṱŧȾƭʈ": + self.trans[char] = u"t" + for char in u"ÙÚŨṸṴÜṲŪṺŬỤŮŲǓṶỦƯỮỰỬ": + self.trans[char] = u"U" + for char in u"ùúũṹṵüṳūṻŭụůųǔṷủưữựửʉ": + self.trans[char] = u"u" + for char in u"ȔŰǛǗǕǙ": + self.trans[char] = u"Ü" + for char in u"ȕűǜǘǖǚ": + self.trans[char] = u"ü" + self.trans[u"Û"] = u"Ux" + self.trans[u"û"] = u"ux" + self.trans[u"Ȗ"] = u"Û" + self.trans[u"ȗ"] = u"û" + self.trans[u"Ừ"] = u"Ù" + self.trans[u"ừ"] = u"ù" + self.trans[u"Ứ"] = u"Ú" + self.trans[u"ứ"] = u"ú" + for char in u"ṼṾ": + self.trans[char] = u"V" + for char in u"ṽṿ": + self.trans[char] = u"v" + for char in u"ẀẂŴẄẆẈ": + self.trans[char] = u"W" + for char in u"ẁẃŵẅẇẉ": + self.trans[char] = u"w" + for char in u"ẊẌ": + self.trans[char] = u"X" + for char in u"ẋẍ": + self.trans[char] = u"x" + for char in u"ỲÝŶŸỸȲẎỴỶƳ": + self.trans[char] = u"Y" + for char in u"ỳýŷÿỹȳẏỵỷƴ": + self.trans[char] = u"y" + for char in u"ŹẐŻẒŽẔƵȤ": + self.trans[char] = u"Z" + for char in u"źẑżẓžẕƶȥ": + self.trans[char] = u"z" + self.trans[u"ɀ"] = u"zv" + + # Latin: extended 
Latin alphabet + self.trans[u"ɑ"] = u"a" + for char in u"ÆǼǢ": + self.trans[char] = u"AE" + for char in u"æǽǣ": + self.trans[char] = u"ae" + self.trans[u"Ð"] = u"Dh" + self.trans[u"ð"] = u"dh" + for char in u"ƎƏƐ": + self.trans[char] = u"E" + for char in u"ǝəɛ": + self.trans[char] = u"e" + for char in u"ƔƢ": + self.trans[char] = u"G" + for char in u"ᵷɣƣᵹ": + self.trans[char] = u"g" + self.trans[u"Ƅ"] = u"H" + self.trans[u"ƅ"] = u"h" + self.trans[u"Ƕ"] = u"Wh" + self.trans[u"ƕ"] = u"wh" + self.trans[u"Ɩ"] = u"I" + self.trans[u"ɩ"] = u"i" + self.trans[u"Ŋ"] = u"Ng" + self.trans[u"ŋ"] = u"ng" + self.trans[u"Œ"] = u"OE" + self.trans[u"œ"] = u"oe" + self.trans[u"Ɔ"] = u"O" + self.trans[u"ɔ"] = u"o" + self.trans[u"Ȣ"] = u"Ou" + self.trans[u"ȣ"] = u"ou" + self.trans[u"Ƽ"] = u"Q" + for char in u"ĸƽ": + self.trans[char] = u"q" + self.trans[u"ȹ"] = u"qp" + self.trans[u""] = u"r" + self.trans[u"ſ"] = u"s" + self.trans[u"ß"] = u"ss" + self.trans[u"Ʃ"] = u"Sh" + for char in u"ʃᶋ": + self.trans[char] = u"sh" + self.trans[u"Ʉ"] = u"U" + self.trans[u"ʉ"] = u"u" + self.trans[u"Ʌ"] = u"V" + self.trans[u"ʌ"] = u"v" + for char in u"ƜǷ": + self.trans[char] = u"W" + for char in u"ɯƿ": + self.trans[char] = u"w" + self.trans[u"Ȝ"] = u"Y" + self.trans[u"ȝ"] = u"y" + self.trans[u"IJ"] = u"IJ" + self.trans[u"ij"] = u"ij" + self.trans[u"Ƨ"] = u"Z" + for char in u"ʮƨ": + self.trans[char] = u"z" + self.trans[u"Ʒ"] = u"Zh" + self.trans[u"ʒ"] = u"zh" + self.trans[u"Ǯ"] = u"Dzh" + self.trans[u"ǯ"] = u"dzh" + for char in u"ƸƹʔˀɁɂ": + self.trans[char] = u"'" + for char in u"Þ": + self.trans[char] = u"Th" + for char in u"þ": + self.trans[char] = u"th" + for char in u"Cʗǃ": + self.trans[char] = u"!"
- #Punctuation and typography - if char in u"«»“”„¨": - return u'"' - if char in u"‘’′": - return u"'" - if char == u"•": - return u"*" - if char == u"@": - return u"(at)" - if char == u"¤": - return u"$" - if char == u"¢": - return u"c" - if char == u"€": - return u"E" - if char == u"£": - return u"L" - if char == u"¥": - return u"yen" - if char == u"†": - return u"+" - if char == u"‡": - return u"++" - if char == u"°": - return u":" - if char == u"¡": - return u"!" - if char == u"¿": - return u"?" - if char == u"‰": - return u"o/oo" - if char == u"‱": - return u"o/ooo" - if char in u"¶§": - return u">" - if char in u"…": - return u"..." - if char in u"‒–—―": - return u"-" - if char in u"·": - return u" " - if char == u"¦": - return u"|" - if char == u"⁂": - return u"***" - if char == u"◊": - return u"<>" - if char == u"‽": - return u"?!" - if char == u"؟": - return u";-)" - + #Punctuation and typography + for char in u"«»“”„¨": + self.trans[char] = u'"' + for char in u"‘’′": + self.trans[char] = u"'" + self.trans[u"•"] = u"*" + self.trans[u"@"] = u"(at)" + self.trans[u"¤"] = u"$" + self.trans[u"¢"] = u"c" + self.trans[u"€"] = u"E" + self.trans[u"£"] = u"L" + self.trans[u"¥"] = u"yen" + self.trans[u"†"] = u"+" + self.trans[u"‡"] = u"++" + self.trans[u"°"] = u":" + self.trans[u"¡"] = u"!" + self.trans[u"¿"] = u"?" + self.trans[u"‰"] = u"o/oo" + self.trans[u"‱"] = u"o/ooo" + for char in u"¶§": + self.trans[char] = u">" + for char in u"…": + self.trans[char] = u"..." + for char in u"‒–—―": + self.trans[char] = u"-" + for char in u"·": + self.trans[char] = u" " + self.trans[u"¦"] = u"|" + self.trans[u"⁂"] = u"***" + self.trans[u"◊"] = u"<>" + self.trans[u"‽"] = u"?!" + self.trans[u"؟"] = u";-)"
- # Cyrillic - if char == u"А": - return u"A" - if char == u"а": - return u"a" - if char == u"Б": - return u"B" - if char == u"б": - return u"b" - if char == u"В": - return u"V" - if char == u"в": - return u"v" - if char == u"Г": - return u"G" - if char == u"г": - return u"g" - if char == u"Д": - return u"D" - if char == u"д": - return u"d" - if char == u"Е": - return u"E" - if char == u"е": - return u"e" - if char == u"Ж": - return u"Zh" - if char == u"ж": - return u"zh" - if char == u"З": - return u"Z" - if char == u"з": - return u"z" - if char == u"И": - return u"I" - if char == u"и": - return u"i" - if char == u"Й": - return u"J" - if char == u"й": - return u"j" - if char == u"К": - return u"K" - if char == u"к": - return u"k" - if char == u"Л": - return u"L" - if char == u"л": - return u"l" - if char == u"М": - return u"M" - if char == u"м": - return u"m" - if char == u"Н": - return u"N" - if char == u"н": - return u"n" - if char == u"О": - return u"O" - if char == u"о": - return u"o" - if char == u"П": - return u"P" - if char == u"п": - return u"p" - if char == u"Р": - return u"R" - if char == u"р": - return u"r" - if char == u"С": - return u"S" - if char == u"с": - return u"s" - if char == u"Т": - return u"T" - if char == u"т": - return u"t" - if char in u"У": - return u"U" - if char == u"у": - return u"u" - if char == u"Ф": - return u"F" - if char == u"ф": - return u"f" - if char in u"ХΧ": - if prev.lower() == prev: - return u"Kh" - else: - return u"KH" - if char == u"х": - return u"kh" - if char == u"Ц": - return u"C" - if char == u"ц": - return u"c" - if char == u"Ч": - return u"Ch" - if char == u"ч": - return u"ch" - if char == u"Ш": - return u"Sh" - if char == u"ш": - return u"sh" - if char == u"Щ": - return u"Shch" - if char == u"щ": - return u"shch" - if char in u"Ьь": - return u"'" - if char in u"Ъъ": - return '"' - if char == u"Ю": - return u"Yu" - if char == u"ю": - return u"yu" - if char == u"Я": - return u"Ya" - if char == u"я": - return u"ya" - 
# Additional Cyrillic letters, most occuring in only one or a few languages - if char == u"Ы": - return u"Y" - if char == u"ы": - return u"y" - if char == u"Ё": - return u"Ë" - if char == u"ё": - return u"ë" - if char in u"ЭЀ": - return u"È" - if char in u"эѐ": - return u"è" - if char == u"І": - return u"I" - if char == u"і": - return u"i" - if char == u"Ї": - return u"Ji" - if char == u"ї": - return u"ji" - if char == u"Є": - return u"Je" - if char == u"є": - return u"je" - if char in u"ҐҜ": - return u"G" - if char in u"ґҝ": - return u"g" - if char == u"Ђ": - return u"Dj" - if char == u"ђ": - return u"dj" - if char in u"ЈӤҊ": - return u"J" - if char in u"јӥҋ": - return u"j" - if char == u"Ӣ": - return u"Y" - if char == u"ӣ": - return u"y" - if char == u"Љ": - return u"Lj" - if char == u"љ": - return u"lj" - if char == u"Њ": - return u"Nj" - if char == u"њ": - return u"nj" - if char == u"Ћ": - return u"Cj" - if char == u"ћ": - return u"cj" - if char in u"ЏӁӜҶ": - return u"Dzh" - if char in u"џӂӝҷ": - return u"dzh" - if char == u"Җ": - return u"Zhj" - if char == u"җ": - return u"zhj" - if char in u"ЅӞӠӋҸ": - return u"Dz" - if char in u"ѕӟӡӌҹ": - return u"dz" - if char == u"Ѓ": - return u"Gj" - if char == u"ѓ": - return u"gj" - if char == u"Ќ": - return u"Kj" - if char == u"ќ": - return u"kj" - if char in u"ҒӶҔ": - return u"G" - if char in u"ғӷҕ": - return u"g" - if char == u"Ӣ": - return u"Ii" - if char == u"ӣ": - return u"ii" - if char in u"ҚҞҠӃ": - return u"Q" - if char == u"қҟҡӄ": - return u"q" - if char == u"Ӯ": - return u"U" - if char == u"ӯ": - return u"u" - if char == u"Ҳ": - return u"H" - if char == u"ҳ": - return u"h" - if char == u"Ҷ": - return u"Dz" - if char == u"ҷ": - return u"dz" - if char in u"ӨӪ": - return u"Ô" - if char in u"өӫ": - return u"ô" - if char == u"Ү": - return u"Y" - if char == u"ү": - return u"y" - if char == u"Һ": - return u"H" - if char == u"һ": - return u"h" - if char in u"ӘӔ": - return u"AE" - if char == u"ә": - return u"ae" - if 
char == u"ӚӬ": - return u"Ë" - if char == u"ӛӭ": - return u"ë" - if char == u"Җ": - return u"Zhj" - if char == u"җ": - return u"zhj" - if char == u"ҢҤӉӇ": - return u"Ng" - if char == u"ңҥӊӈ": - return u"ng" - if char == u"Ұ": - return u"U" - if char == u"ұ": - return u"u" - if char == u"ў": - return u"ù" - if char == u"Ў": - return u"Ù" - if char == u"ѝ": - return u"ì" - if char == u"Ѝ": - return u"Ì" - if char == u"Ӑ": - return u"A" - if char == u"ă": - return u"a" - if char == u"Ӓ": - return u"Ä" - if char == u"ä": - return u"ä" - if char in u"ӖѢҌ": - return u"E" - if char in u"ӗѣҍ": - return u"e" - if char == u"ҼҾ": - return u"Ts" - if char == u"ҽҿ": - return u"ts" - if char == u"Ҙ": - return u"Dh" - if char == u"ҙ": - return u"dh" - if char in u"Ӏӏ": - return u"" - if char == u"Ӆ": - return u"L" - if char == u"ӆ": - return u"l" - if char == u"Ӎ": - return u"M" - if char == u"ӎ": - return u"m" - if char == u"Ӧ": - return u"Ö" - if char == u"ӧ": - return u"ö" - if char == u"Ҩ": - return u"u" - if char == u"ҩ": - return u"u" - if char == u"Ҧ": - return u"Ph" - if char == u"ҧ": - return u"ph" - if char == u"Ҏ": - return u"R" - if char == u"ҏ": - return u"r" - if char == u"Ҫ": - return u"Th" - if char == u"ҫ": - return u"th" - if char == u"Ҭ": - return u"T" - if char == u"ҭ": - return u"t" - if char in u"ӲӰҮ": - return u"Ü" - if char in u"ӳӱү": - return u"ü" - if char == u"Ӯ": - return u"Û" - if char == u"ӯ": - return u"û" - if char == u"ҰӸ": - return u"U" - if char == u"ұӹ": - return u"u" - if char == u"Ҵ": - return u"Tts" - if char == u"ҵ": - return u"tts" - if char == u"Ӵ": - return u"Ch" - if char == u"ӵ": - return u"ch" + # Cyrillic + self.trans.update({u"А" : u"A", u"а" : u"a", u"Б" : u"B", u"б" : u"b", + u"В" : u"V", u"в" : u"v", u"Г" : u"G", u"г" : u"g", + u"Д" : u"D", u"д" : u"d", u"Е" : u"E", u"е" : u"e", + u"Ж" : u"Zh", u"ж" : u"zh", u"З" : u"Z", u"з" : u"z", + u"И" : u"I", u"и" : u"i", u"Й" : u"J", u"й" : u"j", + u"К" : u"K", u"к" : u"k", u"Л" : u"L", 
u"л" : u"l", + u"М" : u"M", u"м" : u"m", u"Н" : u"N", u"н" : u"n", + u"О" : u"O", u"о" : u"o", u"П" : u"P", u"п" : u"p", + u"Р" : u"R", u"р" : u"r", u"С" : u"S", u"с" : u"s", + u"Т" : u"T", u"т" : u"t", u"У" : u"U", u"у" : u"u", + u"Ф" : u"F", u"ф" : u"f", u"х" : u"kh", u"Ц" : u"C", + u"ц" : u"c", u"Ч" : u"Ch", u"ч" : u"ch", u"Ш" : u"Sh", + u"ш" : u"sh", u"Щ" : u"Shch", u"щ" : u"shch", u"Ь" : u"'", + u"ь" : "'", u"Ъ" : u'"', u"ъ" : '"', u"Ю" : u"Yu", + u"ю" : u"yu", u"Я" : u"Ya", u"я" : u"ya", u"Х" : u"Kh", + u"Χ" : u"Kh"})
- # Archaic Cyrillic letters - if char == u"Ѹ": - return u"Ou" - if char == u"ѹ": - return u"ou" - if char in u"ѠѺ": - return u"O" - if char in u"ѡѻ": - return u"o" - if char == u"Ѿ": - return u"Ot" - if char == u"ѿ": - return u"ot" - if char == u"Ѣ": - return u"E" - if char == u"ѣ": - return u"e" - if char in u"ѤѦ": - return u"Ei" - if char in u"ѥѧ": - return u"ei" - if char == u"Ѫ": - return u"Ai" - if char == u"ѫ": - return u"ai" - if char == u"Ѯ": - return u"X" - if char == u"ѯ": - return u"x" - if char == u"Ѱ": - return u"Ps" - if char == u"ѱ": - return u"ps" - if char == u"Ѳ": - return u"Th" - if char == u"ѳ": - return u"th" - if char in u"ѴѶ": - return u"Ü" - if char == u"ѵ": - return u"ü" - + # Additional Cyrillic letters, most occuring in only one or a few languages + self.trans.update({u"Ы" : u"Y", u"ы" : u"y", u"Ё" : u"Ë", u"ё" : u"ë", + u"Э" : u"È", u"Ѐ" : u"È", u"э" : u"è", u"ѐ" : u"è", + u"І" : u"I", u"і" : u"i", u"Ї" : u"Ji", u"ї" : u"ji", + u"Є" : u"Je", u"є" : u"je", u"Ґ" : u"G", u"Ҝ" : u"G", + u"ґ" : u"g", u"ҝ" : u"g", u"Ђ" : u"Dj", u"ђ" : u"dj", + u"Ӣ" : u"Y", u"ӣ" : u"y", u"Љ" : u"Lj", u"љ" : u"lj", + u"Њ" : u"Nj", u"њ" : u"nj", u"Ћ" : u"Cj", u"ћ" : u"cj", + u"Җ" : u"Zhj", u"җ" : u"zhj", u"Ѓ" : u"Gj", u"ѓ" : u"gj", + u"Ќ" : u"Kj", u"ќ" : u"kj", u"Ӣ" : u"Ii", u"ӣ" : u"ii", + u"Ӯ" : u"U", u"ӯ" : u"u", u"Ҳ" : u"H", u"ҳ" : u"h", + u"Ҷ" : u"Dz",u"ҷ" : u"dz", u"Ө" :u"Ô", u"Ӫ" : u"Ô", + u"ө" : u"ô", u"ӫ" : u"ô", u"Ү": u"Y", u"ү": u"y", u"Һ": u"H", + u"һ": u"h", u"Ә": u"AE", u"Ӕ": u"AE", u"ә": u"ae", + u"Ӛ": u"Ë", u"Ӭ": u"Ë", u"ӛ": u"ë", u"ӭ": u"ë", u"Җ": u"Zhj", + u"җ": u"zhj", u"Ұ": u"U", u"ұ": u"u", u"ў": u"ù", u"Ў": u"Ù", + u"ѝ": u"ì", u"Ѝ": u"Ì", u"Ӑ": u"A", u"ă": u"a", u"Ӓ": u"Ä", + u"ä": u"ä", u"Ҽ" : u"Ts", u"Ҿ": u"Ts", u"ҽ": u"ts", u"ҿ": u"ts", + u"Ҙ": u"Dh", u"ҙ": u"dh", u"Ӏ": u"", u"ӏ": u"", u"Ӆ": u"L", + u"ӆ": u"l", u"Ӎ": u"M", u"ӎ": u"m", u"Ӧ": u"Ö", u"ӧ": u"ö", + u"Ҩ": u"u", u"ҩ": u"u", u"Ҧ": u"Ph", u"ҧ": u"ph", u"Ҏ": u"R", + u"ҏ": u"r", 
u"Ҫ": u"Th", u"ҫ": u"th", u"Ҭ": u"T", u"ҭ": u"t", + u"Ӯ": u"Û", u"ӯ": u"û", u"Ұ": u"U", u"Ӹ": u"U", u"ұ": u"u", + u"ӹ": u"u", u"Ҵ": u"Tts", u"ҵ": u"tts", u"Ӵ": u"Ch", u"ӵ": u"ch"}) + + for char in u"ЈӤҊ": + self.trans[char] = u"J" + for char in u"јӥҋ": + self.trans[char] = u"j" + for char in u"ЏӁӜҶ": + self.trans[char] = u"Dzh" + for char in u"џӂӝҷ": + self.trans[char] = u"dzh" + for char in u"ЅӞӠӋҸ": + self.trans[char] = u"Dz" + for char in u"ѕӟӡӌҹ": + self.trans[char] = u"dz" + for char in u"ҒӶҔ": + self.trans[char] = u"G" + for char in u"ғӷҕ": + self.trans[char] = u"g" + for char in u"ҚҞҠӃ": + self.trans[char] = u"Q" + for char in u"қҟҡӄ": + self.trans[char] = u"q" + for char in u"ҢҤӉӇ": + self.trans[char] = u"Ng" + for char in u"ңҥӊӈ": + self.trans[char] = u"ng" + for char in u"ӖѢҌ": + self.trans[char] = u"E" + for char in u"ӗѣҍ": + self.trans[char] = u"e" + for char in u"ӲӰҮ": + self.trans[char] = u"Ü" + for char in u"ӳӱү": + self.trans[char] = u"ü"
- # Hebrew alphabet - if char in u"אע": - return u"'" - if char == u"ב": - return u"b" - if char == u"ג": - return u"g" - if char == u"ד": - return u"d" - if char == u"ה": - return u"h" - if char == u"ו": - return u"v" - if char == u"ז": - return u"z" - if char == u"ח": - return u"kh" - if char == u"ט": - return u"t" - if char == u"י": - return u"y" - if char in u"ךכ": - return u"k" - if char == u"ל": - return u"l" - if char in u"םמ": - return u"m" - if char in u"ןנ": - return u"n" - if char == u"ס": - return u"s" - if char in u"ףפ": - return u"ph" - if char in u"ץצ": - return u"ts" - if char == u"ק": - return u"q" - if char == u"ר": - return u"r" - if char == u"ש": - return u"sh" - if char == u"ת": - return u"th" - - # Arab alphabet - if char in u"اﺍﺎ": - return u"a" - if char in u"بﺏﺐﺒﺑ": - return u"b" - if char in u"تﺕﺖﺘﺗ": - return u"t" - if char in u"ثﺙﺚﺜﺛ": - return u"th" - if char in u"جﺝﺞﺠﺟ": - return u"g" - if char in u"حﺡﺢﺤﺣ": - return u"h" - if char in u"خﺥﺦﺨﺧ": - return u"kh" - if char in u"دﺩﺪ": - return u"d" - if char in u"ذﺫﺬ": - return u"dh" - if char in u"رﺭﺮ": - return u"r" - if char in u"زﺯﺰ": - return u"z" - if char in u"سﺱﺲﺴﺳ": - return u"s" - if char in u"شﺵﺶﺸﺷ": - return u"sh" - if char in u"صﺹﺺﺼﺻ": - return u"s" - if char in u"ضﺽﺾﻀﺿ": - return u"d" - if char in u"طﻁﻂﻄﻃ": - return u"t" - if char in u"ظﻅﻆﻈﻇ": - return u"z" - if char in u"عﻉﻊﻌﻋ": - return u"'" - if char in u"غﻍﻎﻐﻏ": - return u"gh" - if char in u"فﻑﻒﻔﻓ": - return u"f" - if char in u"قﻕﻖﻘﻗ": - return u"q" - if char in u"كﻙﻚﻜﻛک": - return u"k" - if char in u"لﻝﻞﻠﻟ": - return u"l" - if char in u"مﻡﻢﻤﻣ": - return u"m" - if char in u"نﻥﻦﻨﻧ": - return u"n" - if char in u"هﻩﻪﻬﻫ": - return u"h" - if char in u"وﻭﻮ": - return u"w" - if char in u"یيﻱﻲﻴﻳ": - return u"y" - # Arabic - additional letters, modified letters and ligatures - if char == u"ﺀ": - return u"'" - if char in u"آﺁﺂ": - return u"'a" - if char in u"ةﺓﺔ": - return u"th" - if char in u"ىﻯﻰ": - return u"á" - if char in 
u"یﯼﯽﯿﯾ": - return u"y" - if char == u"؟": - return u"?" - # Arabic - ligatures - if char in u"ﻻﻼ": - return u"la" - if char == u"ﷲ": - return u"llah" - if char in u"إأ": - return u"a'" - if char == u"ؤ": - return u"w'" - if char == u"ئ": - return u"y'" - if char == u"◌": - return prev - if char in u"◌◌": - return u"" # indicates absence of vowels - # Arabic vowels - if char == u"◌": - return u"a" - if char == u"◌": - return u"u" - if char == u"◌": - return u"i" - if char == u"◌": - return u"a" - if char == u"◌": - return u"ay" - if char == u"◌": - return u"ay" - if char == u"◌": - return u"u" - if char == u"◌": - return u"iy" - # Arab numerals - if char in u"٠۰": - return u"0" - if char in u"١۱": - return u"1" - if char in u"٢۲": - return u"2" - if char in u"٣۳": - return u"3" - if char in u"٤۴": - return u"4" - if char in u"٥۵": - return u"5" - if char in u"٦۶": - return u"6" - if char in u"٧۷": - return u"7" - if char in u"٨۸": - return u"8" - if char in u"٩۹": - return u"9" - # Perso-Arabic - if char in u"پﭙﭙپ": - return u"p" - if char in u"چچچچ": - return u"ch" - if char in u"ژژ": - return u"zh" - if char in u"گﮔﮕﮓ": - return u"g" + # Archaic Cyrillic letters + self.trans.update({u"Ѹ": u"Ou", u"ѹ": u"ou", u"Ѡ": u"O", u"Ѻ": u"O", u"ѡ": u"o", + u"ѻ": u"o", u"Ѿ": u"Ot", u"ѿ": u"ot", u"Ѣ": u"E", u"ѣ": u"e", + u"Ѥ": u"Ei", u"Ѧ": u"Ei", u"ѥ": u"ei", u"ѧ": u"ei", u"Ѫ": u"Ai", + u"ѫ": u"ai", u"Ѯ": u"X", u"ѯ": u"x", u"Ѱ": u"Ps", u"ѱ": u"ps", + u"Ѳ": u"Th", u"ѳ": u"th", u"Ѵ": u"Ü", u"Ѷ": u"Ü", u"ѵ": u"ü"})
- # Greek - if char == u"Α": - return u"A" - if char == u"α": - return u"a" - if char == u"Β": - return u"B" - if char == u"β": - return u"b" - if char == u"Γ": - return u"G" - if char == u"γ": - return u"g" - if char == u"Δ": - return u"D" - if char == u"δ": - return u"d" - if char == u"Ε": - return u"E" - if char == u"ε": - return u"e" - if char == u"Ζ": - return u"Z" - if char == u"ζ": - return u"z" - if char == u"Η": - return u"I" - if char == u"η": - return u"i" - if char == u"Θ": - if prev.lower() == prev: - return u"Th" - else: - return u"TH" - if char == u"θ": - return u"th" - if char == u"Ι": - return u"I" - if char == u"ι": - return u"i" - if char == u"Κ": - return u"K" - if char == u"κ": - return u"k" - if char == u"Λ": - return u"L" - if char == u"λ": - return u"l" - if char == u"Μ": - return u"M" - if char == u"μ": - return u"m" - if char == u"Ν": - return u"N" - if char == u"ν": - return u"n" - if char == u"Ξ": - return u"X" - if char == u"ξ": - return u"x" - if char == u"Ο": - return u"O" - if char == u"ο": - return u"o" - if char == u"Π": - return u"P" - if char == u"π": - return u"p" - if char == u"Ρ": - return u"R" - if char == u"ρ": - return u"r" - if char == u"Σ": - return u"S" - if char in u"σς": - return u"s" - if char == u"Τ": - return u"T" - if char == u"τ": - return u"t" - if char == u"Υ": - return u"Y" - if char == u"υ": - return u"y" - if char == u"Φ": - return u"F" - if char == u"φ": - return u"f" - if char == u"Ψ": - if prev.lower() == prev: - return u"Ps" - else: - return u"PS" - if char == u"ψ": - return u"ps" - if char == u"Ω": - return u"O" - if char == u"ω": - return u"o" - # Greek: Special and old characters - if char == u"ϗ": - return u"&" - if char == u"Ϛ": - if prev.lower() == prev: - return u"St" - else: - return u"ST" - if char == u"ϛ": - return u"st" - if char in u"ϘϞ": - return u"Q" - if char in u"ϙϟ": - return u"q" - if char == u"Ϻ": - return u"S" - if char == u"ϻ": - return u"s" - if char == u"Ϡ": - if prev.lower() == 
prev: - return u"Ss" - else: - return u"SS" - if char == u"ϡ": - return u"ss" - if char == u"Ϸ": - if prev.lower() == prev: - return u"Sh" - else: - return u"SH" - if char == u"ϸ": - return u"sh" - if char == u"·": - return u":" - # Greek: Accented characters - if char == u"Ά": - return u"Á" - if char == u"ά": - return u"á" - if char in u"ΈΉ": - return u"É" - if char in u"έή": - return u"é" - if char == u"Ί": - return u"Í" - if char == u"ί": - return u"í" - if char == u"Ϊ": - return u"Ï" - if char in u"ϊΐ": - return u"ï" - if char == u"Ό": - return u"Ó" - if char == u"ό": - return u"ó" - if char == u"Ύ": - return u"Ý" - if char == u"ύ": - return u"ý" - if char == u"Ϋ": - return u"Y" - if char in u"ϋΰ": - return u"ÿ" - if char == u"Ώ": - return u"Ó" - if char == u"ώ": - return u"ó" + # Hebrew alphabet + for char in u"אע": + self.trans[char] = u"'" + self.trans[u"ב"] = u"b" + self.trans[u"ג"] = u"g" + self.trans[u"ד"] = u"d" + self.trans[u"ה"] = u"h" + self.trans[u"ו"] = u"v" + self.trans[u"ז"] = u"z" + self.trans[u"ח"] = u"kh" + self.trans[u"ט"] = u"t" + self.trans[u"י"] = u"y" + for char in u"ךכ": + self.trans[char] = u"k" + self.trans[u"ל"] = u"l" + for char in u"םמ": + self.trans[char] = u"m" + for char in u"ןנ": + self.trans[char] = u"n" + self.trans[u"ס"] = u"s" + for char in u"ףפ": + self.trans[char] = u"ph" + for char in u"ץצ": + self.trans[char] = u"ts" + self.trans[u"ק"] = u"q" + self.trans[u"ר"] = u"r" + self.trans[u"ש"] = u"sh" + self.trans[u"ת"] = u"th" + + # Arab alphabet + for char in u"اﺍﺎ": + self.trans[char] = u"a" + for char in u"بﺏﺐﺒﺑ": + self.trans[char] = u"b" + for char in u"تﺕﺖﺘﺗ": + self.trans[char] = u"t" + for char in u"ثﺙﺚﺜﺛ": + self.trans[char] = u"th" + for char in u"جﺝﺞﺠﺟ": + self.trans[char] = u"g" + for char in u"حﺡﺢﺤﺣ": + self.trans[char] = u"h" + for char in u"خﺥﺦﺨﺧ": + self.trans[char] = u"kh" + for char in u"دﺩﺪ": + self.trans[char] = u"d" + for char in u"ذﺫﺬ": + self.trans[char] = u"dh" + for char in u"رﺭﺮ": + self.trans[char] = 
u"r" + for char in u"زﺯﺰ": + self.trans[char] = u"z" + for char in u"سﺱﺲﺴﺳ": + self.trans[char] = u"s" + for char in u"شﺵﺶﺸﺷ": + self.trans[char] = u"sh" + for char in u"صﺹﺺﺼﺻ": + self.trans[char] = u"s" + for char in u"ضﺽﺾﻀﺿ": + self.trans[char] = u"d" + for char in u"طﻁﻂﻄﻃ": + self.trans[char] = u"t" + for char in u"ظﻅﻆﻈﻇ": + self.trans[char] = u"z" + for char in u"عﻉﻊﻌﻋ": + self.trans[char] = u"'" + for char in u"غﻍﻎﻐﻏ": + self.trans[char] = u"gh" + for char in u"فﻑﻒﻔﻓ": + self.trans[char] = u"f" + for char in u"قﻕﻖﻘﻗ": + self.trans[char] = u"q" + for char in u"كﻙﻚﻜﻛک": + self.trans[char] = u"k" + for char in u"لﻝﻞﻠﻟ": + self.trans[char] = u"l" + for char in u"مﻡﻢﻤﻣ": + self.trans[char] = u"m" + for char in u"نﻥﻦﻨﻧ": + self.trans[char] = u"n" + for char in u"هﻩﻪﻬﻫ": + self.trans[char] = u"h" + for char in u"وﻭﻮ": + self.trans[char] = u"w" + for char in u"یيﻱﻲﻴﻳ": + self.trans[char] = u"y" + # Arabic - additional letters, modified letters and ligatures + self.trans[u"ﺀ"] = u"'" + for char in u"آﺁﺂ": + self.trans[char] = u"'a" + for char in u"ةﺓﺔ": + self.trans[char] = u"th" + for char in u"ىﻯﻰ": + self.trans[char] = u"á" + for char in u"یﯼﯽﯿﯾ": + self.trans[char] = u"y" + self.trans[u"؟"] = u"?" 
+ # Arabic - ligatures + for char in u"ﻻﻼ": + self.trans[char] = u"la" + self.trans[u"ﷲ"] = u"llah" + for char in u"إأ": + self.trans[char] = u"a'" + self.trans[u"ؤ"] = u"w'" + self.trans[u"ئ"] = u"y'" + for char in u"◌◌": + self.trans[char] = u"" # indicates absence of vowels + # Arabic vowels + self.trans[u"◌"] = u"a" + self.trans[u"◌"] = u"u" + self.trans[u"◌"] = u"i" + self.trans[u"◌"] = u"a" + self.trans[u"◌"] = u"ay" + self.trans[u"◌"] = u"ay" + self.trans[u"◌"] = u"u" + self.trans[u"◌"] = u"iy" + # Arab numerals + for char in u"٠۰": + self.trans[char] = u"0" + for char in u"١۱": + self.trans[char] = u"1" + for char in u"٢۲": + self.trans[char] = u"2" + for char in u"٣۳": + self.trans[char] = u"3" + for char in u"٤۴": + self.trans[char] = u"4" + for char in u"٥۵": + self.trans[char] = u"5" + for char in u"٦۶": + self.trans[char] = u"6" + for char in u"٧۷": + self.trans[char] = u"7" + for char in u"٨۸": + self.trans[char] = u"8" + for char in u"٩۹": + self.trans[char] = u"9" + # Perso-Arabic + for char in u"پﭙﭙپ": + self.trans[char] = u"p" + for char in u"چچچچ": + self.trans[char] = u"ch" + for char in u"ژژ": + self.trans[char] = u"zh" + for char in u"گﮔﮕﮓ": + self.trans[char] = u"g"
- # Japanese (katakana and hiragana) - if char in u"アァあ": - return u"a" - if char in u"イィい": - return u"i" - if char in u"ウう": - return u"u" - if char in u"エェえ": - return u"e" - if char in u"オォお": - return u"o" - if char in u"ャや": - return u"ya" - if char in u"ュゆ": - return u"yu" - if char in u"ョよ": - return u"yo" - if char in u"カか": - return u"ka" - if char in u"キき": - return u"ki" - if char in u"クく": - return u"ku" - if char in u"ケけ": - return u"ke" - if char in u"コこ": - return u"ko" - if char in u"サさ": - return u"sa" - if char in u"シし": - return u"shi" - if char in u"スす": - return u"su" - if char in u"セせ": - return u"se" - if char in u"ソそ": - return u"so" - if char in u"タた": - return u"ta" - if char in u"チち": - return u"chi" - if char in u"ツつ": - return u"tsu" - if char in u"テて": - return u"te" - if char in u"トと": - return u"to" - if char in u"ナな": - return u"na" - if char in u"ニに": - return u"ni" - if char in u"ヌぬ": - return u"nu" - if char in u"ネね": - return u"ne" - if char in u"ノの": - return u"no" - if char in u"ハは": - return u"ha" - if char in u"ヒひ": - return u"hi" - if char in u"フふ": - return u"fu" - if char in u"ヘへ": - return u"he" - if char in u"ホほ": - return u"ho" - if char in u"マま": - return u"ma" - if char in u"ミみ": - return u"mi" - if char in u"ムむ": - return u"mu" - if char in u"メめ": - return u"me" - if char in u"モも": - return u"mo" - if char in u"ラら": - return u"ra" - if char in u"リり": - return u"ri" - if char in u"ルる": - return u"ru" - if char in u"レれ": - return u"re" - if char in u"ロろ": - return u"ro" - if char in u"ワわ": - return u"wa" - if char in u"ヰゐ": - return u"wi" - if char in u"ヱゑ": - return u"we" - if char in u"ヲを": - return u"wo" - if char in u"ンん": - return u"n" - if char in u"ガが": - return u"ga" - if char in u"ギぎ": - return u"gi" - if char in u"グぐ": - return u"gu" - if char in u"ゲげ": - return u"ge" - if char in u"ゴご": - return u"go" - if char in u"ザざ": - return u"za" - if char in u"ジじ": - return u"ji" - if char in u"ズず": - return u"zu" - 
if char in u"ゼぜ": - return u"ze" - if char in u"ゾぞ": - return u"zo" - if char in u"ダだ": - return u"da" - if char in u"ヂぢ": - return u"dji" - if char in u"ヅづ": - return u"dzu" - if char in u"デで": - return u"de" - if char in u"ドど": - return u"do" - if char in u"バば": - return u"ba" - if char in u"ビび": - return u"bi" - if char in u"ブぶ": - return u"bu" - if char in u"ベべ": - return u"be" - if char in u"ボぼ": - return u"bo" - if char in u"パぱ": - return u"pa" - if char in u"ピぴ": - return u"pi" - if char in u"プぷ": - return u"pu" - if char in u"ペぺ": - return u"pe" - if char in u"ポぽ": - return u"po" - if char in u"ヴゔ": - return u"vu" - if char == u"ヷ": - return u"va" - if char == u"ヸ": - return u"vi" - if char == u"ヹ": - return u"ve" - if char == u"ヺ": - return u"vo" - if char == u"ッ": - return trans(next)[0] + # Greek + self.trans.update({u"Α": u"A", u"α": u"a", u"Β": u"B", u"β": u"b", u"Γ": u"G", + u"γ": u"g", u"Δ": u"D", u"δ": u"d", u"Ε": u"E", u"ε": u"e", + u"Ζ": u"Z", u"ζ": u"z", u"Η": u"I", u"η": u"i", u"θ": u"th", + u"Θ": u"Th", u"Ι": u"I", u"ι": u"i", u"Κ": u"K", u"κ": u"k", + u"Λ": u"L", u"λ": u"l", u"Μ": u"M", u"μ": u"m", u"Ν": u"N", + u"ν": u"n", u"Ξ": u"X", u"ξ": u"x", u"Ο": u"O", u"ο": u"o", + u"Π": u"P", u"π": u"p", u"Ρ": u"R", u"ρ": u"r", u"Σ": u"S", + u"σ": u"s", u"ς": u"s", u"Τ": u"T", u"τ": u"t", u"Υ": u"Y", + u"υ": u"y", u"Φ": u"F", u"φ": u"f", u"Ψ": u"Ps", u"ψ": u"ps", + u"Ω": u"O", u"ω": u"o", u"ϗ": u"&", u"Ϛ": u"St", u"ϛ": u"st", + u"Ϙ": u"Q", u"Ϟ": u"Q", u"ϙ": u"q", u"ϟ": u"q", u"Ϻ": u"S", + u"ϻ": u"s", u"Ϡ": u"Ss", u"ϡ": u"ss", u"Ϸ": u"Sh", u"ϸ": u"sh", + u"·": u":", u"Ά": u"Á", u"ά": u"á", u"Έ": u"É", u"Ή": u"É", + u"έ": u"é", u"ή": u"é", u"Ί": u"Í", u"ί": u"í", u"Ϊ": u"Ï", + u"ϊ": u"ï", u"ΐ": u"ï", u"Ό": u"Ó", u"ό": u"ó", u"Ύ": u"Ý", + u"ύ": u"ý", u"Ϋ": u"Y", u"ϋ": u"ÿ", u"ΰ": u"ÿ", u"Ώ": u"Ó", + u"ώ": u"ó"})
- # Japanese and Chinese punctuation and typography - if char == u"・·": - return u" " - if char == u"々仝ヽヾゝゞ〱〲〳〵〴〵": - return prev - if char in u"〃『』《》": - return u'"' - if char in u"「」〈〉〘〙〚〛": - return u"'" - if char in u"(〔": - return u"(" - if char in u")〕": - return u")" - if char in u"[【〖": - return u"[" - if char in u"]】〗": - return u"]" - if char == u"{": - return u"{" - if char == u"}": - return u"}" - if char == u"っ": - return u":" - if char == u"ー": - return u"h" - if char == u"゛": - return u"'" - if char == u"゜": - return u"p" - if char == u"。": - return u". " - if char == u"、": - return u", " - if char == u"・": - return u" " - if char == u"〆": - return u"shime" - if char == u"〜": - return u"-" - if char == u"…": - return u"..." - if char == u"‥": - return u".." - if char == u"ヶ": - return u"months" - if char in u"•◦": - return u"_" - if char in u"※*": - return u"*" - if char == u"Ⓧ": - return u"(X)" - if char == u"Ⓨ": - return u"(Y)" - if char == u"!": - return u"!" - if char == u"?": - return u"?" - if char == u";": - return u";" - if char == u":": - return u":" - if char == u"。": - return u"." 
- if char in u",、": - return u"," + # Japanese (katakana and hiragana) + for char in u"アァあ": + self.trans[char] = u"a" + for char in u"イィい": + self.trans[char] = u"i" + for char in u"ウう": + self.trans[char] = u"u" + for char in u"エェえ": + self.trans[char] = u"e" + for char in u"オォお": + self.trans[char] = u"o" + for char in u"ャや": + self.trans[char] = u"ya" + for char in u"ュゆ": + self.trans[char] = u"yu" + for char in u"ョよ": + self.trans[char] = u"yo" + for char in u"カか": + self.trans[char] = u"ka" + for char in u"キき": + self.trans[char] = u"ki" + for char in u"クく": + self.trans[char] = u"ku" + for char in u"ケけ": + self.trans[char] = u"ke" + for char in u"コこ": + self.trans[char] = u"ko" + for char in u"サさ": + self.trans[char] = u"sa" + for char in u"シし": + self.trans[char] = u"shi" + for char in u"スす": + self.trans[char] = u"su" + for char in u"セせ": + self.trans[char] = u"se" + for char in u"ソそ": + self.trans[char] = u"so" + for char in u"タた": + self.trans[char] = u"ta" + for char in u"チち": + self.trans[char] = u"chi" + for char in u"ツつ": + self.trans[char] = u"tsu" + for char in u"テて": + self.trans[char] = u"te" + for char in u"トと": + self.trans[char] = u"to" + for char in u"ナな": + self.trans[char] = u"na" + for char in u"ニに": + self.trans[char] = u"ni" + for char in u"ヌぬ": + self.trans[char] = u"nu" + for char in u"ネね": + self.trans[char] = u"ne" + for char in u"ノの": + self.trans[char] = u"no" + for char in u"ハは": + self.trans[char] = u"ha" + for char in u"ヒひ": + self.trans[char] = u"hi" + for char in u"フふ": + self.trans[char] = u"fu" + for char in u"ヘへ": + self.trans[char] = u"he" + for char in u"ホほ": + self.trans[char] = u"ho" + for char in u"マま": + self.trans[char] = u"ma" + for char in u"ミみ": + self.trans[char] = u"mi" + for char in u"ムむ": + self.trans[char] = u"mu" + for char in u"メめ": + self.trans[char] = u"me" + for char in u"モも": + self.trans[char] = u"mo" + for char in u"ラら": + self.trans[char] = u"ra" + for char in u"リり": + self.trans[char] = u"ri" + for 
char in u"ルる": + self.trans[char] = u"ru" + for char in u"レれ": + self.trans[char] = u"re" + for char in u"ロろ": + self.trans[char] = u"ro" + for char in u"ワわ": + self.trans[char] = u"wa" + for char in u"ヰゐ": + self.trans[char] = u"wi" + for char in u"ヱゑ": + self.trans[char] = u"we" + for char in u"ヲを": + self.trans[char] = u"wo" + for char in u"ンん": + self.trans[char] = u"n" + for char in u"ガが": + self.trans[char] = u"ga" + for char in u"ギぎ": + self.trans[char] = u"gi" + for char in u"グぐ": + self.trans[char] = u"gu" + for char in u"ゲげ": + self.trans[char] = u"ge" + for char in u"ゴご": + self.trans[char] = u"go" + for char in u"ザざ": + self.trans[char] = u"za" + for char in u"ジじ": + self.trans[char] = u"ji" + for char in u"ズず": + self.trans[char] = u"zu" + for char in u"ゼぜ": + self.trans[char] = u"ze" + for char in u"ゾぞ": + self.trans[char] = u"zo" + for char in u"ダだ": + self.trans[char] = u"da" + for char in u"ヂぢ": + self.trans[char] = u"dji" + for char in u"ヅづ": + self.trans[char] = u"dzu" + for char in u"デで": + self.trans[char] = u"de" + for char in u"ドど": + self.trans[char] = u"do" + for char in u"バば": + self.trans[char] = u"ba" + for char in u"ビび": + self.trans[char] = u"bi" + for char in u"ブぶ": + self.trans[char] = u"bu" + for char in u"ベべ": + self.trans[char] = u"be" + for char in u"ボぼ": + self.trans[char] = u"bo" + for char in u"パぱ": + self.trans[char] = u"pa" + for char in u"ピぴ": + self.trans[char] = u"pi" + for char in u"プぷ": + self.trans[char] = u"pu" + for char in u"ペぺ": + self.trans[char] = u"pe" + for char in u"ポぽ": + self.trans[char] = u"po" + for char in u"ヴゔ": + self.trans[char] = u"vu" + self.trans[u"ヷ"] = u"va" + self.trans[u"ヸ"] = u"vi" + self.trans[u"ヹ"] = u"ve" + self.trans[u"ヺ"] = u"vo"
- # Georgian - if char == u"ა": - return u"a" - if char == u"ბ": - return u"b" - if char == u"გ": - return u"g" - if char == u"დ": - return u"d" - if char in u"ეჱ": - return u"e" - if char == u"ვ": - return u"v" - if char == u"ზ": - return u"z" - if char == u"თ":# - return u"th" - if char == u"ი": - return u"i" - if char == u"კ":# - return u"k" - if char == u"ლ": - return u"l" - if char == u"მ": - return u"m" - if char == u"ნ": - return u"n" - if char == u"ო": - return u"o" - if char == u"პ":# - return u"p" - if char == u"ჟ":# - return u"zh" - if char == u"რ": - return u"r" - if char == u"ს": - return u"s" - if char == u"ტ":# - return u"t" - if char == u"უ": - return u"u" - if char == u"ფ":# - return u"ph" - if char == u"ქ":# - return u"q" - if char == u"ღ":# - return u"gh" - if char == u"ყ":# - return u"q'" - if char == u"შ": - return u"sh" - if char == u"ჩ": - return u"ch" - if char == u"ც": - return u"ts" - if char == u"ძ": - return u"dz" - if char == u"წ":# - return u"ts'" - if char == u"ჭ":# - return u"ch'" - if char == u"ხ": - return u"kh" - if char == u"ჯ":# - return u"j" - if char == u"ჰ": - return u"h" - if char == u"ჳ": - return u"w" - if char == u"ჵ": - return u"o" - if char == u"ჶ": - return u"f" + # Japanese and Chinese punctuation and typography + for char in u"・·": + self.trans[char] = u" " + for char in u"〃『』《》": + self.trans[char] = u'"' + for char in u"「」〈〉〘〙〚〛": + self.trans[char] = u"'" + for char in u"(〔": + self.trans[char] = u"(" + for char in u")〕": + self.trans[char] = u")" + for char in u"[【〖": + self.trans[char] = u"[" + for char in u"]】〗": + self.trans[char] = u"]" + for char in u"{": + self.trans[char] = u"{" + for char in u"}": + self.trans[char] = u"}" + for char in u"っ": + self.trans[char] = u":" + for char in u"ー": + self.trans[char] = u"h" + for char in u"゛": + self.trans[char] = u"'" + for char in u"゜": + self.trans[char] = u"p" + for char in u"。": + self.trans[char] = u". 
" + for char in u"、": + self.trans[char] = u", " + for char in u"・": + self.trans[char] = u" " + for char in u"〆": + self.trans[char] = u"shime" + for char in u"〜": + self.trans[char] = u"-" + for char in u"…": + self.trans[char] = u"..." + for char in u"‥": + self.trans[char] = u".." + for char in u"ヶ": + self.trans[char] = u"months" + for char in u"•◦": + self.trans[char] = u"_" + for char in u"※*": + self.trans[char] = u"*" + for char in u"Ⓧ": + self.trans[char] = u"(X)" + for char in u"Ⓨ": + self.trans[char] = u"(Y)" + for char in u"!": + self.trans[char] = u"!" + for char in u"?": + self.trans[char] = u"?" + for char in u";": + self.trans[char] = u";" + for char in u":": + self.trans[char] = u":" + for char in u"。": + self.trans[char] = u"." + for char in u",、": + self.trans[char] = u","
- # Devanagari - if char in u"पप": - return u"p" - if char in u"अ": - return u"a" - if char in u"आा": - return u"aa" - if char == u"प": - return u"pa" - if char in u"इि": - return u"i" - if char in u"ईी": - return u"ii" - if char in u"उु": - return u"u" - if char in u"ऊू": - return u"uu" - if char in u"एे": - return u"e" - if char in u"ऐै": - return u"ai" - if char in u"ओो": - return u"o" - if char in u"औौ": - return u"au" - if char in u"ऋृर": - return u"r" - if char in u"ॠॄ": - return u"rr" - if char in u"ऌॢल": - return u"l" - if char in u"ॡॣ": - return u"ll" - if char == u"क": - return u"k" - if char == u"ख": - return u"kh" - if char == u"ग": - return u"g" - if char == u"घ": - return u"gh" - if char == u"ङ": - return u"ng" - if char == u"च": - return u"c" - if char == u"छ": - return u"ch" - if char == u"ज": - return u"j" - if char == u"झ": - return u"jh" - if char == u"ञ": - return u"ñ" - if char in u"टत": - return u"t" - if char in u"ठथ": - return u"th" - if char in u"डद": - return u"d" - if char in u"ढध": - return u"dh" - if char in u"णन": - return u"n" - if char == u"फ": - return u"ph" - if char == u"ब": - return u"b" - if char == u"भ": - return u"bh" - if char == u"म": - return u"m" - if char == u"य": - return u"y" - if char == u"व": - return u"v" - if char == u"श": - return u"sh" - if char in u"षस": - return u"s" - if char == u"ह": - return u"h" - if char == u"क": - return u"x" - if char == u"त": - return u"tr" - if char == u"ज": - return u"gj" - if char == u"क़": - return u"q" - if char == u"फ": - return u"f" - if char == u"ख": - return u"hh" - if char == u"H": - return u"gh" - if char == u"ज": - return u"z" - if char in u"डढ": - return u"r" - # Devanagari ligatures (possibly incomplete and/or incorrect) - if char == u"ख्": - return u"khn" - if char == u"त": - return u"tn" - if char == u"द्": - return u"dn" - if char == u"श": - return u"cn" - if char == u"ह्": - return u"fn" - if char in u"अँ": - return u"m" - if char in u"॒॑": - return u"" - if char == 
u"०": - return u"0" - if char == u"१": - return u"1" - if char == u"२": - return u"2" - if char == u"३": - return u"3" - if char == u"४": - return u"4" - if char == u"५": - return u"5" - if char == u"६": - return u"6" - if char == u"७": - return u"7" - if char == u"८": - return u"8" - if char == u"९": - return u"9" + # Georgian + for char in u"ა": + self.trans[char] = u"a" + for char in u"ბ": + self.trans[char] = u"b" + for char in u"გ": + self.trans[char] = u"g" + for char in u"დ": + self.trans[char] = u"d" + for char in u"ეჱ": + self.trans[char] = u"e" + for char in u"ვ": + self.trans[char] = u"v" + for char in u"ზ": + self.trans[char] = u"z" + for char in u"თ":# + self.trans[char] = u"th" + for char in u"ი": + self.trans[char] = u"i" + for char in u"კ":# + self.trans[char] = u"k" + for char in u"ლ": + self.trans[char] = u"l" + for char in u"მ": + self.trans[char] = u"m" + for char in u"ნ": + self.trans[char] = u"n" + for char in u"ო": + self.trans[char] = u"o" + for char in u"პ":# + self.trans[char] = u"p" + for char in u"ჟ":# + self.trans[char] = u"zh" + for char in u"რ": + self.trans[char] = u"r" + for char in u"ს": + self.trans[char] = u"s" + for char in u"ტ":# + self.trans[char] = u"t" + for char in u"უ": + self.trans[char] = u"u" + for char in u"ფ":# + self.trans[char] = u"ph" + for char in u"ქ":# + self.trans[char] = u"q" + for char in u"ღ":# + self.trans[char] = u"gh" + for char in u"ყ":# + self.trans[char] = u"q'" + for char in u"შ": + self.trans[char] = u"sh" + for char in u"ჩ": + self.trans[char] = u"ch" + for char in u"ც": + self.trans[char] = u"ts" + for char in u"ძ": + self.trans[char] = u"dz" + for char in u"წ":# + self.trans[char] = u"ts'" + for char in u"ჭ":# + self.trans[char] = u"ch'" + for char in u"ხ": + self.trans[char] = u"kh" + for char in u"ჯ":# + self.trans[char] = u"j" + for char in u"ჰ": + self.trans[char] = u"h" + for char in u"ჳ": + self.trans[char] = u"w" + for char in u"ჵ": + self.trans[char] = u"o" + for char in u"ჶ": + 
self.trans[char] = u"f"
- # Armenian - if char == u"Ա": - return u"A" - if char == u"ա": - return u"a" - if char == u"Բ": - return u"B" - if char == u"բ": - return u"b" - if char == u"Գ": - return u"G" - if char == u"գ": - return u"g" - if char == u"Դ": - return u"D" - if char == u"դ": - return u"d" - if char == u"Ե": - return u"Je" - if char == u"ե": - return u"e" - if char == u"Զ": - return u"Z" - if char == u"զ": - return u"z" - if char == u"Է": - return u"É" - if char == u"է": - return u"é" - if char == u"Ը": - return u"Ë" - if char == u"ը": - return u"ë" - if char == u"Թ": - return u"Th" - if char == u"թ": - return u"th" - if char == u"Ժ": - return u"Zh" - if char == u"ժ": - return u"zh" - if char == u"Ի": - return u"I" - if char == u"ի": - return u"i" - if char == u"Լ": - return u"L" - if char == u"լ": - return u"l" - if char == u"Խ": - return u"Ch" - if char == u"խ": - return u"ch" - if char == u"Ծ": - return u"Ts" - if char == u"ծ": - return u"ts" - if char == u"Կ": - return u"K" - if char == u"կ": - return u"k" - if char == u"Հ": - return u"H" - if char == u"հ": - return u"h" - if char == u"Ձ": - return u"Dz" - if char == u"ձ": - return u"dz" - if char == u"Ղ": - return u"R" - if char == u"ղ": - return u"r" - if char == u"Ճ": - return u"Cz" - if char == u"ճ": - return u"cz" - if char == u"Մ": - return u"M" - if char == u"մ": - return u"m" - if char == u"Յ": - return u"J" - if char == u"յ": - return u"j" - if char == u"Ն": - return u"N" - if char == u"ն": - return u"n" - if char == u"Շ": - return u"S" - if char == u"շ": - return u"s" - if char == u"Շ": - return u"Vo" - if char == u"շ": - return u"o" - if char == u"Չ": - return u"Tsh" - if char == u"չ": - return u"tsh" - if char == u"Պ": - return u"P" - if char == u"պ": - return u"p" - if char == u"Ջ": - return u"Dz" - if char == u"ջ": - return u"dz" - if char == u"Ռ": - return u"R" - if char == u"ռ": - return u"r" - if char == u"Ս": - return u"S" - if char == u"ս": - return u"s" - if char == u"Վ": - return u"V" - if char == u"վ": 
- return u"v" - if char == u"Տ": - return u"T'" - if char == u"տ": - return u"t'" - if char == u"Ր": - return u"R" - if char == u"ր": - return u"r" - if char == u"Ց": - return u"Tsh" - if char == u"ց": - return u"tsh" - if char == u"Ւ": - return u"V" - if char == u"ւ": - return u"v" - if char == u"Փ": - return u"Ph" - if char == u"փ": - return u"ph" - if char == u"Ք": - return u"Kh" - if char == u"ք": - return u"kh" - if char == u"Օ": - return u"O" - if char == u"օ": - return u"o" - if char == u"Ֆ": - return u"F" - if char == u"ֆ": - return u"f" - if char == u"և": - return u"&" - if char == u"՟": - return u"." - if char == u"՞": - return u"?" - if char == u"՝": - return u";" - if char == u"՛": - return u"" + # Devanagari + for char in u"पप": + self.trans[char] = u"p" + for char in u"अ": + self.trans[char] = u"a" + for char in u"आा": + self.trans[char] = u"aa" + for char in u"प": + self.trans[char] = u"pa" + for char in u"इि": + self.trans[char] = u"i" + for char in u"ईी": + self.trans[char] = u"ii" + for char in u"उु": + self.trans[char] = u"u" + for char in u"ऊू": + self.trans[char] = u"uu" + for char in u"एे": + self.trans[char] = u"e" + for char in u"ऐै": + self.trans[char] = u"ai" + for char in u"ओो": + self.trans[char] = u"o" + for char in u"औौ": + self.trans[char] = u"au" + for char in u"ऋृर": + self.trans[char] = u"r" + for char in u"ॠॄ": + self.trans[char] = u"rr" + for char in u"ऌॢल": + self.trans[char] = u"l" + for char in u"ॡॣ": + self.trans[char] = u"ll" + for char in u"क": + self.trans[char] = u"k" + for char in u"ख": + self.trans[char] = u"kh" + for char in u"ग": + self.trans[char] = u"g" + for char in u"घ": + self.trans[char] = u"gh" + for char in u"ङ": + self.trans[char] = u"ng" + for char in u"च": + self.trans[char] = u"c" + for char in u"छ": + self.trans[char] = u"ch" + for char in u"ज": + self.trans[char] = u"j" + for char in u"झ": + self.trans[char] = u"jh" + for char in u"ञ": + self.trans[char] = u"ñ" + for char in u"टत": + self.trans[char] = 
u"t" + for char in u"ठथ": + self.trans[char] = u"th" + for char in u"डद": + self.trans[char] = u"d" + for char in u"ढध": + self.trans[char] = u"dh" + for char in u"णन": + self.trans[char] = u"n" + for char in u"फ": + self.trans[char] = u"ph" + for char in u"ब": + self.trans[char] = u"b" + for char in u"भ": + self.trans[char] = u"bh" + for char in u"म": + self.trans[char] = u"m" + for char in u"य": + self.trans[char] = u"y" + for char in u"व": + self.trans[char] = u"v" + for char in u"श": + self.trans[char] = u"sh" + for char in u"षस": + self.trans[char] = u"s" + for char in u"ह": + self.trans[char] = u"h" + for char in u"क": + self.trans[char] = u"x" + for char in u"त": + self.trans[char] = u"tr" + for char in u"ज": + self.trans[char] = u"gj" + for char in u"क़": + self.trans[char] = u"q" + for char in u"फ": + self.trans[char] = u"f" + for char in u"ख": + self.trans[char] = u"hh" + for char in u"H": + self.trans[char] = u"gh" + for char in u"ज": + self.trans[char] = u"z" + for char in u"डढ": + self.trans[char] = u"r" + # Devanagari ligatures (possibly incomplete and/or incorrect) + for char in u"ख्": + self.trans[char] = u"khn" + for char in u"त": + self.trans[char] = u"tn" + for char in u"द्": + self.trans[char] = u"dn" + for char in u"श": + self.trans[char] = u"cn" + for char in u"ह्": + self.trans[char] = u"fn" + for char in u"अँ": + self.trans[char] = u"m" + for char in u"॒॑": + self.trans[char] = u"" + for char in u"०": + self.trans[char] = u"0" + for char in u"१": + self.trans[char] = u"1" + for char in u"२": + self.trans[char] = u"2" + for char in u"३": + self.trans[char] = u"3" + for char in u"४": + self.trans[char] = u"4" + for char in u"५": + self.trans[char] = u"5" + for char in u"६": + self.trans[char] = u"6" + for char in u"७": + self.trans[char] = u"7" + for char in u"८": + self.trans[char] = u"8" + for char in u"९": + self.trans[char] = u"9"
- # Tamil - if char == u"க்": - return u"k" - if char in u"ஙண்ந்ன்": - return u"n" - if char == u"ச": - return u"c" - if char == u"ஞ்": - return u"ñ" - if char == u"ட்": - return u"th" - if char == u"த": - return u"t" - if char == u"ப": - return u"p" - if char == u"ம்": - return u"m" - if char == u"ய்": - return u"y" - if char in u"ர்ழ்ற": - return u"r" - if char in u"ல்ள": - return u"l" - if char == u"வ்": - return u"v" - if char == u"ஜ": - return u"j" - if char == u"ஷ": - return u"sh" - if char == u"ஸ": - return u"s" - if char == u"ஹ": - return u"h" - if char == u"க்ஷ": - return u"x" - if char == u"அ": - return u"a" - if char == u"ஆ": - return u"aa" - if char == u"இ": - return u"i" - if char == u"ஈ": - return u"ii" - if char == u"உ": - return u"u" - if char == u"ஊ": - return u"uu" - if char == u"எ": - return u"e" - if char == u"ஏ": - return u"ee" - if char == u"ஐ": - return u"ai" - if char == u"ஒ": - return u"o" - if char == u"ஓ": - return u"oo" - if char == u"ஔ": - return u"au" - if char == u"ஃ": - return "" + # Armenian + for char in u"Ա": + self.trans[char] = u"A" + for char in u"ա": + self.trans[char] = u"a" + for char in u"Բ": + self.trans[char] = u"B" + for char in u"բ": + self.trans[char] = u"b" + for char in u"Գ": + self.trans[char] = u"G" + for char in u"գ": + self.trans[char] = u"g" + for char in u"Դ": + self.trans[char] = u"D" + for char in u"դ": + self.trans[char] = u"d" + for char in u"Ե": + self.trans[char] = u"Je" + for char in u"ե": + self.trans[char] = u"e" + for char in u"Զ": + self.trans[char] = u"Z" + for char in u"զ": + self.trans[char] = u"z" + for char in u"Է": + self.trans[char] = u"É" + for char in u"է": + self.trans[char] = u"é" + for char in u"Ը": + self.trans[char] = u"Ë" + for char in u"ը": + self.trans[char] = u"ë" + for char in u"Թ": + self.trans[char] = u"Th" + for char in u"թ": + self.trans[char] = u"th" + for char in u"Ժ": + self.trans[char] = u"Zh" + for char in u"ժ": + self.trans[char] = u"zh" + for char in u"Ի": + 
self.trans[char] = u"I" + for char in u"ի": + self.trans[char] = u"i" + for char in u"Լ": + self.trans[char] = u"L" + for char in u"լ": + self.trans[char] = u"l" + for char in u"Խ": + self.trans[char] = u"Ch" + for char in u"խ": + self.trans[char] = u"ch" + for char in u"Ծ": + self.trans[char] = u"Ts" + for char in u"ծ": + self.trans[char] = u"ts" + for char in u"Կ": + self.trans[char] = u"K" + for char in u"կ": + self.trans[char] = u"k" + for char in u"Հ": + self.trans[char] = u"H" + for char in u"հ": + self.trans[char] = u"h" + for char in u"Ձ": + self.trans[char] = u"Dz" + for char in u"ձ": + self.trans[char] = u"dz" + for char in u"Ղ": + self.trans[char] = u"R" + for char in u"ղ": + self.trans[char] = u"r" + for char in u"Ճ": + self.trans[char] = u"Cz" + for char in u"ճ": + self.trans[char] = u"cz" + for char in u"Մ": + self.trans[char] = u"M" + for char in u"մ": + self.trans[char] = u"m" + for char in u"Յ": + self.trans[char] = u"J" + for char in u"յ": + self.trans[char] = u"j" + for char in u"Ն": + self.trans[char] = u"N" + for char in u"ն": + self.trans[char] = u"n" + for char in u"Շ": + self.trans[char] = u"S" + for char in u"շ": + self.trans[char] = u"s" + for char in u"Շ": + self.trans[char] = u"Vo" + for char in u"շ": + self.trans[char] = u"o" + for char in u"Չ": + self.trans[char] = u"Tsh" + for char in u"չ": + self.trans[char] = u"tsh" + for char in u"Պ": + self.trans[char] = u"P" + for char in u"պ": + self.trans[char] = u"p" + for char in u"Ջ": + self.trans[char] = u"Dz" + for char in u"ջ": + self.trans[char] = u"dz" + for char in u"Ռ": + self.trans[char] = u"R" + for char in u"ռ": + self.trans[char] = u"r" + for char in u"Ս": + self.trans[char] = u"S" + for char in u"ս": + self.trans[char] = u"s" + for char in u"Վ": + self.trans[char] = u"V" + for char in u"վ": + self.trans[char] = u"v" + for char in u"Տ": + self.trans[char] = u"T'" + for char in u"տ": + self.trans[char] = u"t'" + for char in u"Ր": + self.trans[char] = u"R" + for char in u"ր": + 
self.trans[char] = u"r" + for char in u"Ց": + self.trans[char] = u"Tsh" + for char in u"ց": + self.trans[char] = u"tsh" + for char in u"Ւ": + self.trans[char] = u"V" + for char in u"ւ": + self.trans[char] = u"v" + for char in u"Փ": + self.trans[char] = u"Ph" + for char in u"փ": + self.trans[char] = u"ph" + for char in u"Ք": + self.trans[char] = u"Kh" + for char in u"ք": + self.trans[char] = u"kh" + for char in u"Օ": + self.trans[char] = u"O" + for char in u"օ": + self.trans[char] = u"o" + for char in u"Ֆ": + self.trans[char] = u"F" + for char in u"ֆ": + self.trans[char] = u"f" + for char in u"և": + self.trans[char] = u"&" + for char in u"՟": + self.trans[char] = u"." + for char in u"՞": + self.trans[char] = u"?" + for char in u"՝": + self.trans[char] = u";" + for char in u"՛": + self.trans[char] = u""
- # Bengali - if char == u"অ": - return u"ô" - if char in u"আা": - return u"a" - if char in u"ইিঈী": - return u"i" - if char in u"উুঊূ": - return u"u" - if char in u"ঋৃ": - return u"ri" - if char in u"এেয়": - return u"e" - if char in u"ঐৈ": - return u"oi" - if char in u"ওো": - return u"o" - if char in u"ঔৌ": - return "ou" - if char == u"্": - return u"" - if char == u"ৎ": - return u"t" - if char == u"ং": - return u"n" - if char == u"ঃ": - return u"h" - if char == u"ঁ": - return u"ñ" - if char == u"ক": - return u"k" - if char == u"খ": - return u"kh" - if char == u"গ": - return u"g" - if char == u"ঘ": - return u"gh" - if char == u"ঙ": - return u"ng" - if char == u"চ": - return u"ch" - if char == u"ছ": - return u"chh" - if char in u"জ": - return u"j" - if char == u"ঝ": - return u"jh" - if char == u"ঞ": - return u"n" - if char in u"টত": - return u"t" - if char in u"ঠথ": - return u"th" - if char in u"ডদ": - return u"d" - if char in u"ঢধ": - return u"dh" - if char in u"ণন": - return u"n" - if char == u"প": - return u"p" - if char == u"ফ": - return u"ph" - if char == u"ব": - return u"b" - if char == u"ভ": - return u"bh" - if char == u"ম": - return u"m" - if char == u"য": - return u"dzh" - if char == u"র": - return u"r" - if char == u"ল": - return u"l" - if char == u"শ": - return u"s" - if char == u"হ": - return u"h" - if char == u"য়": - return u"-" - if char == u"ড়": - return u"r" - if char == u"ঢ": - return u"rh" - if char == u"০": - return u"0" - if char == u"১": - return u"1" - if char == u"২": - return u"2" - if char == u"৩": - return u"3" - if char == u"৪": - return u"4" - if char == u"৫": - return u"5" - if char == u"৬": - return u"6" - if char == u"৭": - return u"7" - if char == u"৮": - return u"8" - if char == u"৯": - return u"9" - - # Thai (because of complications of the alphabet, transliterations - # are very imprecise here) - if char == u"ก": - return u"k" - if char in u"ขฃคฅฆ": - return u"kh" - if char == u"ง": - return u"ng" - if char in u"จฉชฌ": - return 
u"ch" - if char in u"ซศษส": - return u"s" - if char in u"ญย": - return u"y" - if char in u"ฎด": - return u"d" - if char in u"ฏต": - return u"t" - if char in u"ฐฑฒถทธ": - return u"th" - if char in u"ณน": - return u"n" - if char == u"บ": - return u"b" - if char == u"ป": - return u"p" - if char in u"ผพภ": - return u"ph" - if char in u"ฝฟ": - return u"f" - if char in u"ม": - return u"m" - if char == u"ร": - return u"r" - if char == u"ฤ": - return u"rue" - if char in u"ๅ": - return u":" - if char in u"ลฬ": - return u"l" - if char == u"ฦ": - return u"lue" - if char == u"ว": - return u"w" - if char in u"หฮ": - return u"h" - if char == u"อ": - return u"" - if char == u"ร": - return u"ü" - if char == u"ว": - return u"ua" - if char in u"อว–โิ": - return u"o" - if char in u"ะัา": - return u"a" - if char in u"ว": - return u"u" - if char == u"ำ": - return u"am" - if char == u"ิ": - return u"i" - if char == u"ี": - return u"i:" - if char == u"ึ": - return u"ue" - if char == u"ื": - return u"ue:" - if char == u"ุ": - return u"u" - if char == u"ู": - return u"u:" - if char in u"เ็": - return u"e" - if char == u"แ": - return u"ae" - if char in u"ใไ": - return u"ai" - if char in u"่้๊๋็์": - return u"" - if char in u"ฯ": - return u"." 
- if char in u"ๆ": - return u"(2)" - - return default + # Tamil + for char in u"க்": + self.trans[char] = u"k" + for char in u"ஙண்ந்ன்": + self.trans[char] = u"n" + for char in u"ச": + self.trans[char] = u"c" + for char in u"ஞ்": + self.trans[char] = u"ñ" + for char in u"ட்": + self.trans[char] = u"th" + for char in u"த": + self.trans[char] = u"t" + for char in u"ப": + self.trans[char] = u"p" + for char in u"ம்": + self.trans[char] = u"m" + for char in u"ய்": + self.trans[char] = u"y" + for char in u"ர்ழ்ற": + self.trans[char] = u"r" + for char in u"ல்ள": + self.trans[char] = u"l" + for char in u"வ்": + self.trans[char] = u"v" + for char in u"ஜ": + self.trans[char] = u"j" + for char in u"ஷ": + self.trans[char] = u"sh" + for char in u"ஸ": + self.trans[char] = u"s" + for char in u"ஹ": + self.trans[char] = u"h" + for char in u"க்ஷ": + self.trans[char] = u"x" + for char in u"அ": + self.trans[char] = u"a" + for char in u"ஆ": + self.trans[char] = u"aa" + for char in u"இ": + self.trans[char] = u"i" + for char in u"ஈ": + self.trans[char] = u"ii" + for char in u"உ": + self.trans[char] = u"u" + for char in u"ஊ": + self.trans[char] = u"uu" + for char in u"எ": + self.trans[char] = u"e" + for char in u"ஏ": + self.trans[char] = u"ee" + for char in u"ஐ": + self.trans[char] = u"ai" + for char in u"ஒ": + self.trans[char] = u"o" + for char in u"ஓ": + self.trans[char] = u"oo" + for char in u"ஔ": + self.trans[char] = u"au" + for char in u"ஃ": + self.trans[char] = "" + + # Bengali + for char in u"অ": + self.trans[char] = u"ô" + for char in u"আা": + self.trans[char] = u"a" + for char in u"ইিঈী": + self.trans[char] = u"i" + for char in u"উুঊূ": + self.trans[char] = u"u" + for char in u"ঋৃ": + self.trans[char] = u"ri" + for char in u"এেয়": + self.trans[char] = u"e" + for char in u"ঐৈ": + self.trans[char] = u"oi" + for char in u"ওো": + self.trans[char] = u"o" + for char in u"ঔৌ": + self.trans[char] = "ou" + for char in u"্": + self.trans[char] = u"" + for char in u"ৎ": + self.trans[char] 
= u"t" + for char in u"ং": + self.trans[char] = u"n" + for char in u"ঃ": + self.trans[char] = u"h" + for char in u"ঁ": + self.trans[char] = u"ñ" + for char in u"ক": + self.trans[char] = u"k" + for char in u"খ": + self.trans[char] = u"kh" + for char in u"গ": + self.trans[char] = u"g" + for char in u"ঘ": + self.trans[char] = u"gh" + for char in u"ঙ": + self.trans[char] = u"ng" + for char in u"চ": + self.trans[char] = u"ch" + for char in u"ছ": + self.trans[char] = u"chh" + for char in u"জ": + self.trans[char] = u"j" + for char in u"ঝ": + self.trans[char] = u"jh" + for char in u"ঞ": + self.trans[char] = u"n" + for char in u"টত": + self.trans[char] = u"t" + for char in u"ঠথ": + self.trans[char] = u"th" + for char in u"ডদ": + self.trans[char] = u"d" + for char in u"ঢধ": + self.trans[char] = u"dh" + for char in u"ণন": + self.trans[char] = u"n" + for char in u"প": + self.trans[char] = u"p" + for char in u"ফ": + self.trans[char] = u"ph" + for char in u"ব": + self.trans[char] = u"b" + for char in u"ভ": + self.trans[char] = u"bh" + for char in u"ম": + self.trans[char] = u"m" + for char in u"য": + self.trans[char] = u"dzh" + for char in u"র": + self.trans[char] = u"r" + for char in u"ল": + self.trans[char] = u"l" + for char in u"শ": + self.trans[char] = u"s" + for char in u"হ": + self.trans[char] = u"h" + for char in u"য়": + self.trans[char] = u"-" + for char in u"ড়": + self.trans[char] = u"r" + for char in u"ঢ": + self.trans[char] = u"rh" + for char in u"০": + self.trans[char] = u"0" + for char in u"১": + self.trans[char] = u"1" + for char in u"২": + self.trans[char] = u"2" + for char in u"৩": + self.trans[char] = u"3" + for char in u"৪": + self.trans[char] = u"4" + for char in u"৫": + self.trans[char] = u"5" + for char in u"৬": + self.trans[char] = u"6" + for char in u"৭": + self.trans[char] = u"7" + for char in u"৮": + self.trans[char] = u"8" + for char in u"৯": + self.trans[char] = u"9" + + # Thai (because of complications of the alphabet, self.transliterations + # are 
very imprecise here) + for char in u"ก": + self.trans[char] = u"k" + for char in u"ขฃคฅฆ": + self.trans[char] = u"kh" + for char in u"ง": + self.trans[char] = u"ng" + for char in u"จฉชฌ": + self.trans[char] = u"ch" + for char in u"ซศษส": + self.trans[char] = u"s" + for char in u"ญย": + self.trans[char] = u"y" + for char in u"ฎด": + self.trans[char] = u"d" + for char in u"ฏต": + self.trans[char] = u"t" + for char in u"ฐฑฒถทธ": + self.trans[char] = u"th" + for char in u"ณน": + self.trans[char] = u"n" + for char in u"บ": + self.trans[char] = u"b" + for char in u"ป": + self.trans[char] = u"p" + for char in u"ผพภ": + self.trans[char] = u"ph" + for char in u"ฝฟ": + self.trans[char] = u"f" + for char in u"ม": + self.trans[char] = u"m" + for char in u"ร": + self.trans[char] = u"r" + for char in u"ฤ": + self.trans[char] = u"rue" + for char in u"ๅ": + self.trans[char] = u":" + for char in u"ลฬ": + self.trans[char] = u"l" + for char in u"ฦ": + self.trans[char] = u"lue" + for char in u"ว": + self.trans[char] = u"w" + for char in u"หฮ": + self.trans[char] = u"h" + for char in u"อ": + self.trans[char] = u"" + for char in u"ร": + self.trans[char] = u"ü" + for char in u"ว": + self.trans[char] = u"ua" + for char in u"อว–โิ": + self.trans[char] = u"o" + for char in u"ะัา": + self.trans[char] = u"a" + for char in u"ว": + self.trans[char] = u"u" + for char in u"ำ": + self.trans[char] = u"am" + for char in u"ิ": + self.trans[char] = u"i" + for char in u"ี": + self.trans[char] = u"i:" + for char in u"ึ": + self.trans[char] = u"ue" + for char in u"ื": + self.trans[char] = u"ue:" + for char in u"ุ": + self.trans[char] = u"u" + for char in u"ู": + self.trans[char] = u"u:" + for char in u"เ็": + self.trans[char] = u"e" + for char in u"แ": + self.trans[char] = u"ae" + for char in u"ใไ": + self.trans[char] = u"ai" + for char in u"่้๊๋็์": + self.trans[char] = u"" + for char in u"ฯ": + self.trans[char] = u"." 
def transliterate(self, char, default="?", prev="-", next="-"):
    """Return the transliteration of a single character.

    The lookup table ``self.trans`` is consulted first.  Characters that
    are not in the table but whose rendering depends on their neighbours
    are handled afterwards.

    Parameters:
        char    -- the character to transliterate.
        default -- value returned when no transliteration is known.
        prev    -- value returned for marks that repeat what came before
                   them (NOTE(review): the terminal interface passes its
                   own ``prev`` variable here -- confirm whether that is
                   the previous character or its transliteration).
        next    -- the character following ``char``; only used to work
                   out which letter a Japanese small tsu doubles.

    Returns the transliteration, which may be an empty string or several
    characters long, or ``default`` if nothing applies.
    """
    try:
        return self.trans[char]
    except KeyError:
        pass

    # The context rules below only make sense for exactly one character.
    # Without this guard the substring test further down would also match
    # the empty string and multi-character runs of iteration marks.
    if len(char) != 1:
        return default

    # Arabic
    # presumably the dotted circle stands in for a doubling mark such as
    # shadda -- TODO confirm; the code simply repeats ``prev``.
    if char == u"◌":
        return prev

    # Japanese
    if char == u"ッ":
        # Small tsu doubles the first letter of whatever follows it.  The
        # following character may legitimately transliterate to an empty
        # string, in which case there is nothing to double and indexing
        # [0] would raise IndexError; fall back to ``default`` instead.
        doubled = self.transliterate(next)
        if doubled:
            return doubled[0]
        return default
    # Iteration marks repeat the preceding text.
    if char in u"々仝ヽヾゝゞ〱〲〳〵〴〵":
        return prev
    return default