[Pywikipedia-l] SVN: [4801] trunk/pywikipedia/wikipedia.py
rotem at svn.wikimedia.org
rotem at svn.wikimedia.org
Sat Jan 5 08:29:03 UTC 2008
Revision: 4801
Author: rotem
Date: 2008-01-05 08:29:03 +0000 (Sat, 05 Jan 2008)
Log Message:
-----------
(patch 1862810) wikipedia.py:html2unicode : html chars from #128 to #159
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2008-01-05 08:18:48 UTC (rev 4800)
+++ trunk/pywikipedia/wikipedia.py 2008-01-05 08:29:03 UTC (rev 4801)
@@ -3387,6 +3387,40 @@
# This regular expression will match any decimal and hexadecimal entity and
# also entities that might be named entities.
entityR = re.compile(r'&(#(?P<decimal>\d+)|#x(?P<hex>[0-9a-fA-F]+)|(?P<name>[A-Za-z]+));')
+ #These characters are illegal in HTML, but sadly you *can* still find some of
+ #them in pages, and converting them to unichr(decimal) is unsuitable
+ convertIllegalHtmlEntities = {
+ 128 : 8364, # €
+ 130 : 8218, # ‚
+ 131 : 402, # ƒ
+ 132 : 8222, # „
+ 133 : 8230, # …
+ 134 : 8224, # †
+ 135 : 8225, # ‡
+ 136 : 710, # ˆ
+ 137 : 8240, # ‰
+ 138 : 352, # Š
+ 139 : 8249, # ‹
+ 140 : 338, # Œ
+ 142 : 381, # Ž
+ 145 : 8216, # ‘
+ 146 : 8217, # ’
+ 147 : 8220, # “
+ 148 : 8221, # ”
+ 149 : 8226, # •
+ 150 : 8211, # –
+ 151 : 8212, # —
+ 152 : 732, # ˜
+ 153 : 8482, # ™
+ 154 : 353, # š
+ 155 : 8250, # ›
+ 156 : 339, # œ
+ 158 : 382, # ž
+ 159 : 376 # Ÿ
+ }
+ #ensuring that illegal &#129;, &#141; and &#157;, which have no known values,
+ #don't get converted to unichr(129), unichr(141) or unichr(157)
+ ignore = set(ignore) | set([129, 141, 157])
result = u''
i = 0
found = True
@@ -3405,6 +3439,10 @@
# We found a known HTML entity.
unicodeCodepoint = htmlentitydefs.name2codepoint[name]
result += text[:match.start()]
+ try:
+ unicodeCodepoint=convertIllegalHtmlEntities[unicodeCodepoint]
+ except KeyError:
+ pass
if unicodeCodepoint and unicodeCodepoint not in ignore and (WIDEBUILD or unicodeCodepoint < 65534):
result += unichr(unicodeCodepoint)
else:
More information about the Pywikipedia-l
mailing list