22 lines
603 B
Python
22 lines
603 B
Python
|
import re
|
||
|
import chardet
|
||
|
|
||
|
def get_encoding(page):
    """Guess the character encoding of an HTML page.

    Markup tags are stripped first so that only the visible text takes part
    in the guess.  If (almost) all of the remaining bytes form valid UTF-8
    the page is reported as ``'utf-8'``; otherwise the decision is delegated
    to ``chardet.detect``.

    Args:
        page: Raw page content as a byte string.  Already-decoded text is
            also accepted; it is encoded to UTF-8 before inspection.

    Returns:
        The name of the guessed encoding.  ``'utf-8'`` is returned when the
        input is empty, too short to judge, or when chardet cannot name an
        encoding.
    """
    enc = 'utf-8'
    if not page:
        return enc  # nothing to inspect

    # Work on bytes throughout: decoding only makes sense for raw bytes, and
    # chardet expects bytes as well.
    if not isinstance(page, bytes):
        page = page.encode(enc, 'replace')

    # Replace every tag (plus trailing whitespace) with a single space.
    text = re.sub(br'</?[^>]*>\s*', b' ', page)
    if not text.strip() or len(text) < 10:
        return enc  # can't guess

    # Round-trip through UTF-8, silently dropping invalid bytes.  If fewer
    # than 1% of the bytes were dropped, treat the page as UTF-8.
    roundtrip = text.decode(enc, 'ignore').encode(enc)
    sizes = len(roundtrip), len(text)
    if abs(len(text) - len(roundtrip)) < max(sizes) * 0.01:
        return enc

    detected = chardet.detect(text)['encoding']
    if detected is None:
        # chardet could not name an encoding; fall back to the default
        # instead of handing None to the caller.
        return enc
    if detected == 'MacCyrillic':
        # The original code remaps this chardet answer to cp1251.
        detected = 'cp1251'
    return detected
|