-
Notifications
You must be signed in to change notification settings - Fork 8
/
clean.py
31 lines (28 loc) · 937 Bytes
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#!/usr/bin/env python
#coding: utf-8
import os.path
from sys import argv
from urlparse import urlparse
def _as_text(line):
    """Return *line* as decoded text.

    Byte strings (what a Python 2 file read yields) are decoded as UTF-8;
    anything else is returned unchanged.

    Raises:
        UnicodeDecodeError: if a byte string is not valid UTF-8.
    """
    if isinstance(line, bytes):
        return line.decode('utf8')
    return line


def clean_urls(lines):
    """Return the lines that look like parameterised URLs, one per domain.

    A line is kept when all of the following hold:

    * it is valid UTF-8,
    * it contains both '?' and '=' (i.e. it carries a query parameter),
    * ``urlparse`` accepts it,
    * its domain has not been seen on an earlier kept line.

    The domain is the URL's ``netloc`` with every 'www.' occurrence removed,
    so 'www.example.com' and 'example.com' count as the same domain.  Lines
    without a network location all share the empty domain, so at most one
    of them is kept.

    Args:
        lines: iterable of URL strings, without trailing newlines.

    Returns:
        A list of the kept lines, unmodified and in input order.
    """
    seen_domains = set()  # set, not list: O(1) duplicate checks
    kept = []
    for line in lines:
        try:
            text = _as_text(line)
        except UnicodeDecodeError:
            continue  # undecodable line: skip it, as the script always did

        # Both markers are required.  The previous test
        # ("'=' and '?' in line") only ever checked for '?', because the
        # non-empty literal '=' is always truthy.
        if '=' not in text or '?' not in text:
            continue

        try:
            domain = urlparse(text).netloc.replace('www.', '')
        except ValueError:
            continue  # urlparse rejects some malformed URLs

        if domain in seen_domains:
            continue
        seen_domains.add(domain)
        kept.append(line)
    return kept


def clean_path(path):
    """Return the output path for *path*: '_clean' inserted before the extension.

    'urls.txt' becomes 'urls_clean.txt'.  Working on the extension rather
    than replacing the substring '.txt' guarantees the result always differs
    from *path* (so the input is never overwritten) and leaves directory
    names that happen to contain '.txt' alone.
    """
    root, extension = os.path.splitext(path)
    return root + '_clean' + extension


def main(args):
    """Read the URL list named by ``args[1]`` and write the cleaned copy.

    Args:
        args: command-line vector; ``args[1]`` is the input file path.

    Raises:
        SystemExit: with a usage message when the argument count is wrong.
        IOError/OSError: if the input cannot be read or the output cannot
            be written.
    """
    if len(args) != 2:
        raise SystemExit('Usage: clean.py <url_list.txt>')

    source_path = args[1]
    with open(source_path) as source:
        dirty = source.read().splitlines()

    # The input is fully read and closed before the output is opened.
    with open(clean_path(source_path), 'w') as target:
        for url in clean_urls(dirty):
            target.write('%s\n' % url)


if __name__ == '__main__':
    main(argv)