forked from timbertson/python-readability
-
Notifications
You must be signed in to change notification settings - Fork 353
Expand file tree
/
Copy pathencoding.py
More file actions
23 lines (21 loc) · 703 Bytes
/
encoding.py
File metadata and controls
23 lines (21 loc) · 703 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from bs4 import UnicodeDammit
def get_encoding(page):
    """Return the encoding that bs4's ``UnicodeDammit`` reports for *page*.

    Args:
        page: The HTML document to inspect; it is handed unchanged to
            ``UnicodeDammit``.

    Returns:
        The ``original_encoding`` attribute of the ``UnicodeDammit``
        object built from *page*.
    """
    # Encoding detection is delegated entirely to UnicodeDammit; this
    # function only reads back what it reports.
    # NOTE(review): bs4 documents ``original_encoding`` as possibly None
    # (e.g. no encoding determined) -- confirm callers handle that.
    detector = UnicodeDammit(page)
    return detector.original_encoding
# def custom_decode(encoding):
# """Overrides encoding when charset declaration
# or charset determination is a subset of a larger
# charset. Created because of issues with Chinese websites"""
# encoding = encoding.lower()
# alternates = {
# 'big5': 'big5hkscs',
# 'gb2312': 'gb18030',
# 'ascii': 'utf-8',
# 'MacCyrillic': 'cp1251',
# }
# if encoding in alternates:
# return alternates[encoding]
# else:
# return encoding