-
Notifications
You must be signed in to change notification settings - Fork 0
/
htmlenc
executable file
·90 lines (84 loc) · 3.33 KB
/
htmlenc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys, os, re, encodings, urllib.request, urllib.parse, logging
from extras import ColoredArgParser
from extras import PrettyVTable
from extras import urlscheme
from extras import trydecodingbytes
from extras import FileRead
from extras import init_colored_logger
from extras import ColoredSetting
from extras import xstr
def is_valid_file_or_url(parser, f):
    """Validate one positional argument as stdin, an existing file, or a web URL.

    Intended for use as an argparse ``type`` callable (see ``parse_args``).

    Args:
        parser: Parser object whose ``error`` method is called to report a
            rejected value.
        f: Raw command-line value: ``'-'``, a filesystem path, a ``file:``
            URL, or an ``http``/``https`` URL.

    Returns:
        ``f`` unchanged when it is accepted. If ``parser.error`` returns
        instead of exiting, ``None`` is returned implicitly.
    """
    scheme = urlscheme(f)

    # Web URLs are not checked here; they are fetched (and may fail) later.
    if scheme in ('http', 'https'):
        return f

    path = f
    if scheme == 'file':
        # The path component of a file: URL is percent-encoded; decode it so
        # that e.g. "file:///tmp/my%20page.html" maps to the real file name.
        path = urllib.request.url2pathname(urllib.parse.urlsplit(f).path)

    # '-' is accepted without a filesystem check.
    if path == '-' or os.path.isfile(path):
        return f

    # NOTE(review): argparse's error() prints usage and exits; presumably
    # ColoredArgParser keeps that behavior - confirm.
    parser.error("%s: No such file" % path)
def parse_args():
    """Build the command-line parser and parse the process arguments.

    Options:
        -t / --timeout: timeout in seconds, validated as an integer at parse
            time so a bad value is reported once as a usage error.
        --color: one of ``auto``, ``always``, ``never`` (default ``auto``),
            stored as ``colored``.
        paths: zero or more files or URLs, each validated by
            ``is_valid_file_or_url``; defaults to ``['-']``.

    Returns:
        The namespace object produced by ``parser.parse_args()``.
    """
    # NOTE(review): assumes ColoredArgParser follows the
    # argparse.ArgumentParser API - confirm against extras.
    parser = ColoredArgParser(description='Get the character set encoding for HTML.')
    parser.add_argument('-t', '--timeout',
                        dest='timeout',
                        type=int,
                        help='Timeout in seconds for blocking operation.')
    parser.add_argument('--color',
                        choices=['auto', 'always', 'never'],
                        default='auto',
                        dest='colored',
                        help='When to colorize the output; the default is auto.')
    parser.add_argument('paths',
                        nargs='*',
                        type=lambda f: is_valid_file_or_url(parser, f),
                        # A list default keeps ``paths`` a list whether or not
                        # any positional argument was given.
                        default=['-'],
                        help='html files or urls.')
    return parser.parse_args()
# <meta> charset declarations, compiled once at import time.
# Raw strings keep ``\s`` from being read as an (invalid) string escape.
# The captured value stops at quotes, whitespace, ';', '/' and '>' so that an
# unquoted attribute (``<meta charset=utf-8>``) or a trailing parameter
# (``charset=utf-8; foo``) does not drag following markup into the name.
_META_CHARSET_PATTERNS = tuple(
    re.compile(pattern, re.I)
    for pattern in (
        # <meta http-equiv="content-type" content="...; charset=NAME">
        r'''<meta\s+http-equiv=['"]?content-type['"]?\s+content=['"]?[^'"]*charset=\s*([^'"\s;/>]+)''',
        # <meta charset="NAME">
        r'''<meta\s+charset=['"]?\s*([^'"\s;/>]+)''',
        # <meta http-equiv="charset" content="NAME">
        r'''<meta\s+http-equiv=['"]?charset['"]?\s+content=['"]?\s*([^'"\s;/>]+)''',
        # <meta content="...; charset=NAME" http-equiv="content-type">
        r'''<meta\s+content=['"]?[^'"]*charset=\s*([^'"\s;/>]+)[^'"]*['"]?\s+http-equiv=['"]?content-type['"]?''',
        # <meta content="NAME" http-equiv="charset">
        r'''<meta\s+content=['"]?\s*([^'"\s;/>]+)\s*['"]?\s+http-equiv=['"]?charset['"]?''',
    )
)


def find_charset(html_string):
    """Return the charset declared by a ``<meta>`` tag in *html_string*.

    The patterns are tried in order and the first one that matches wins.
    Matching is case-insensitive.

    Args:
        html_string: HTML source text to search.

    Returns:
        The declared name passed through ``encodings.normalize_encoding`` and
        lower-cased (for example ``'utf_8'`` for ``UTF-8``), or ``None`` when
        no declaration is found. The name is returned whether or not Python
        knows a codec for it.
    """
    for pattern in _META_CHARSET_PATTERNS:
        match = pattern.search(html_string)
        if match:
            return encodings.normalize_encoding(match.group(1)).lower()
    return None
def main(args=None):
    """Print a table of the declared HTML charset and detected encoding per input.

    Args:
        args: Parsed command-line namespace providing ``colored``, ``timeout``
            and ``paths`` (as produced by ``parse_args``). When ``None``,
            nothing is done.

    Returns:
        ``0`` when ``args`` is ``None``; otherwise ``None`` after the table
        has been printed.
    """
    if args is None:
        return 0

    ColoredSetting(args.colored)
    init_colored_logger()
    logger = logging.getLogger(__name__)

    pt = PrettyVTable(enable_painting=ColoredSetting().is_colorize(sys.stdout))
    pt.add_column(alignment='>')
    pt.add_column(alignment='<')
    pt.add_field('File')
    pt.add_field('HTML charset')
    pt.add_field('File encoding')

    for p in args.paths:
        scheme = urlscheme(p)
        if scheme in ('http', 'https'):
            try:
                # A falsy timeout (unset) means "no timeout" for urlopen.
                timeout = int(args.timeout) if args.timeout else None
                # The context manager closes the HTTP response after reading.
                with urllib.request.urlopen(p, timeout=timeout) as respond:
                    page = respond.read()
            except Exception as e:
                # Best effort: report this URL and carry on with the rest.
                logger.error('%s: %s', p, e)
                continue
        else:
            path = p
            if scheme == 'file':
                # Previously a file: URL only had its path extracted and was
                # never read, leaving ``page`` unbound or stale. Convert the
                # percent-encoded URL path to a local path and read it like
                # any other file.
                path = urllib.request.url2pathname(urllib.parse.urlsplit(p).path)
            # NOTE(review): presumably FileRead yields the content of each
            # input and the first item is the whole file - confirm in extras.
            page = next(iter(FileRead(path, mode='rb', prompt_when_stdin='Press Ctrl-D when finished.')))

        # trydecodingbytes returns the decoded text and the encoding it used.
        page, fileenc = trydecodingbytes(page)
        htmlenc = find_charset(page)
        pt.add_record(repr(p), xstr(htmlenc), xstr(fileenc))

    print(pt)
if __name__ == '__main__':
    # Script entry point: parse the command line, run the tool, and use
    # main()'s return value as the process exit status.
    exit_status = main(parse_args())
    sys.exit(exit_status)