Atomstrom/readability/encoding.py

22 lines
603 B
Python

import re
import chardet
def get_encoding(page):
    """Guess the character encoding of a raw (undecoded) HTML page.

    The markup is stripped first so that only the text content is
    examined.  If nothing useful is left, or if at least 99% of the
    remaining bytes are valid UTF-8, ``'utf-8'`` is returned without
    consulting ``chardet``.  Otherwise the guess made by
    ``chardet.detect`` is returned.

    :param page: the page as a byte string (``str`` on Python 2,
        ``bytes`` on Python 3).
    :returns: the name of the guessed encoding; ``'utf-8'`` whenever no
        guess can be made.
    """
    default = 'utf-8'

    # Replace every tag (and the whitespace that follows it) by a single
    # space.  The pattern is a *bytes* pattern because the page has not
    # been decoded yet.
    text = re.sub(br'</?[^>]*>\s*', b' ', page)
    if not text.strip() or len(text) < 10:
        return default  # can't guess

    # Round-trip through UTF-8 while silently dropping undecodable bytes
    # ('ignore' means decoding itself cannot fail).  If less than 1% of
    # the bytes got dropped, the page is considered to be UTF-8.
    roundtrip = text.decode(default, 'ignore').encode(default)
    dropped = len(text) - len(roundtrip)
    if dropped < len(text) * 0.01:
        return default

    # chardet reports ``None`` as the encoding when it cannot decide;
    # fall back to the default instead of handing ``None`` to the caller.
    enc = chardet.detect(text)['encoding'] or default
    if enc == 'MacCyrillic':
        # A 'MacCyrillic' guess is treated as cp1251.
        enc = 'cp1251'
    return enc