http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10047
Revision: 10047
Author: a_engels
Date: 2012-03-26 09:35:37 +0000 (Mon, 26 Mar 2012)
Log Message:
-----------
1. Use the configured console encoding for transliteration purposes, even if the encoding is
overridden in a subclass
2. Allow multi-step transliteration (if x transliterates to y, but y cannot be displayed
either, then the transliteration of y is used as the transliteration of x)
3. Add transliteration for Lao alphabet
Modified Paths:
--------------
trunk/pywikipedia/userinterfaces/terminal_interface_base.py
trunk/pywikipedia/userinterfaces/transliteration.py
Modified: trunk/pywikipedia/userinterfaces/terminal_interface_base.py
===================================================================
--- trunk/pywikipedia/userinterfaces/terminal_interface_base.py 2012-03-26 08:10:48 UTC
(rev 10046)
+++ trunk/pywikipedia/userinterfaces/terminal_interface_base.py 2012-03-26 09:35:37 UTC
(rev 10047)
@@ -10,7 +10,7 @@
import traceback, re, sys
import wikipedia
-transliterator = transliteration.transliterator()
+transliterator = transliteration.transliterator(config.console_encoding)
colors = [
'default',
@@ -40,6 +40,7 @@
self.stdout = sys.stdout
self.stderr = sys.stderr
self.encoding = config.console_encoding
+ self.realencoding = config.console_encoding
def printNonColorized(self, text, targetStream):
# We add *** after the text as a whole if anything needed to be colorized.
@@ -69,7 +70,7 @@
# Encode our unicode string in the encoding used by the user's console,
# and decode it back to unicode. Then we can see which characters
# can't be represented in the console encoding.
- codecedText = text.encode(self.encoding,
'replace').decode(self.encoding)
+ codecedText = text.encode(self.realencoding,
'replace').decode(self.realencoding)
transliteratedText = ''
# Note: A transliteration replacement might be longer than the original
# character, e.g. ч is transliterated to ch.
Modified: trunk/pywikipedia/userinterfaces/transliteration.py
===================================================================
--- trunk/pywikipedia/userinterfaces/transliteration.py 2012-03-26 08:10:48 UTC (rev
10046)
+++ trunk/pywikipedia/userinterfaces/transliteration.py 2012-03-26 09:35:37 UTC (rev
10047)
@@ -3,7 +3,7 @@
class transliterator(object):
- def __init__(self):
+ def __init__(self, encoding):
self.trans = {}
for char in u"ÀÁÂẦẤẪẨẬÃĀĂẰẮẴẶẲȦǠẠḀȂĄǍẢ":
self.trans[char] = u"A"
@@ -1594,7 +1594,76 @@
self.trans[u"౼"] = u"1/16"
self.trans[u"౽"] = u"1/8"
self.trans[u"౾"] = u"3/16"
-
+ # Lao - note: pronunciation in initial position is used;
+ # different pronunciation in final position is ignored
+ self.trans[u"ກ"] = "k"
+ for char in u"ຂຄ":
+ self.trans[char] = "kh"
+ self.trans[u"ງ"] = "ng"
+ self.trans[u"ຈ"] = "ch"
+ for char in u"ສຊ":
+ self.trans[char] = "s"
+ self.trans[u"ຍ"] = "ny"
+ self.trans[u"ດ"] = "d"
+ self.trans[u"ຕ"] = "t"
+ for char in u"ຖທ":
+ self.trans[char] = "th"
+ self.trans[u"ນ"] = "n"
+ self.trans[u"ບ"] = "b"
+ self.trans[u"ປ"] = "p"
+ for char in u"ຜພ":
+ self.trans[char] = "ph"
+ for char in u"ຝຟ":
+ self.trans[char] = "f"
+ for char in u"ມໝ":
+ self.trans[char] = "m"
+ self.trans[u"ຢ"] = "y"
+ for char in u"ຣຼ":
+ self.trans[char] = "r"
+ for char in u"ລຼ":
+ self.trans[char] = "l"
+ self.trans[u"ວ"] = "v"
+ for char in u"ຮ":
+ self.trans[char] = "h"
+ self.trans[u"ອ"] = "'"
+ for char in u"ະັ":
+ self.trans[char] = "a"
+ self.trans[u"ິ"] = "i"
+ self.trans[u"ຶ"] = "ue"
+ self.trans[u"ຸ"] = "u"
+ self.trans[u"ເ"] = u"é"
+ self.trans[u"ແ"] = u"è"
+ for char in u"ໂົາໍ":
+ self.trans[char] = "o"
+ self.trans[u"ຽ"] = "ia"
+ self.trans[u"ເຶ"] = "uea"
+ self.trans[u"ຍ"] = "i"
+ for char in u"ໄໃ":
+ self.trans[char] = "ai"
+ self.trans[u"ຳ"] = "am"
+ self.trans[u"າ"] = "aa"
+ self.trans[u"ີ"] = "ii"
+ self.trans[u"ື"] = "yy"
+ self.trans[u"ູ"] = "uu"
+ self.trans[u"ເ"] = "e"
+ self.trans[u"ແ"] = "ei"
+ self.trans[u"໐"] = "0"
+ self.trans[u"໑"] = "1"
+ self.trans[u"໒"] = "2"
+ self.trans[u"໓"] = "3"
+ self.trans[u"໔"] = "4"
+ self.trans[u"໕"] = "5"
+ self.trans[u"໖"] = "6"
+ self.trans[u"໗"] = "7"
+ self.trans[u"໘"] = "8"
+ self.trans[u"໙"] = "9"
+ for char in self.trans:
+ value = self.trans[char]
+ if value == "?": continue
+ while value.encode(encoding, 'replace').decode(encoding) ==
"?" and value in self.trans:
+ value = self.trans[value]
+ self.trans[char] = value
+
def transliterate(self, char, default="?", prev="-",
next="-"):
if char in self.trans:
return self.trans[char]
@@ -1606,5 +1675,11 @@
return self.transliterate(next)[0]
if char in u"々仝ヽヾゝゞ〱〲〳〵〴〵":
return prev
+ # Lao: ຫ is omitted when followed by one of the letters listed below, otherwise rendered as "h"
+ if char == u"ຫ":
+ if next in u"ງຍນຣລຼຼວ":
+ return ""
+ else:
+ return "h"
return default