Revision: 8339
Author: xqt
Date: 2010-06-26 13:48:42 +0000 (Sat, 26 Jun 2010)
Log Message:
-----------
charset detection from http header (patch bug#3021568)
Modified Paths:
--------------
trunk/pywikipedia/reflinks.py
Modified: trunk/pywikipedia/reflinks.py
===================================================================
--- trunk/pywikipedia/reflinks.py 2010-06-26 13:20:38 UTC (rev 8338)
+++ trunk/pywikipedia/reflinks.py 2010-06-26 13:48:42 UTC (rev 8339)
@@ -580,25 +580,32 @@
meta_content = self.META_CONTENT.search(linkedpagetext)
enc = []
+ # use charset from http header
+ s = self.CHARSET.search(contentType)
if meta_content:
tag = meta_content.group()
# Prefer the contentType from the HTTP header :
if not contentType:
contentType = tag
- s = self.CHARSET.search(tag)
- if s:
- tmp = s.group('enc').strip("\"' ").lower()
- naked = re.sub('[ _\-]', '', tmp)
- # Convert to python correct encoding names
- if naked == "gb2312":
- enc.append("gbk")
- elif naked == "shiftjis":
- enc.append("shift jis 2004")
- enc.append("cp932")
- elif naked == "xeucjp":
- enc.append("euc-jp")
- else:
- enc.append(tmp)
+ if not s:
+ # use charset from html
+ s = self.CHARSET.search(tag)
+ if s:
+ tmp = s.group('enc').strip("\"' ").lower()
+ naked = re.sub('[ _\-]', '', tmp)
+ # Convert to python correct encoding names
+ if naked == "gb2312":
+ enc.append("gbk")
+ elif naked == "shiftjis":
+ enc.append("shift jis 2004")
+ enc.append("cp932")
+ elif naked == "xeucjp":
+ enc.append("euc-jp")
+ else:
+ enc.append(tmp)
+ else:
+ wikipedia.output(u'No charset found for %s' % ref.link)
+ #continue # do not process pages without charset
if not contentType:
wikipedia.output(u'No content-type found for %s' % ref.link)
continue
Show replies by date