-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_pairs.py
54 lines (41 loc) · 1.08 KB
/
find_pairs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import re, csv,os
# Input CSV opened by findsite() through openfile(); rows are ';'-separated and
# column index 1 of each row is the value that gets matched.
alexa = 'top-5k.csv'
# Input CSV iterated by the script body at the bottom of the file; column 0 is
# used as the app id and column 2 is copied unchanged into the output row.
apks = 'top5k-apps.csv'
# NOTE(review): neither OUTFILE nor MISSFILE is referenced in the visible part
# of this file (writefile() has a parameter that shadows the name OUTFILE, and
# the script body passes its own literal path) -- confirm whether they are
# still needed.
OUTFILE = 'app_to_web.csv'
MISSFILE = 'app_no_matching.csv'
def writefile(OUTFILE):
    """Open a CSV file for appending and hand back the file plus its writer.

    When no regular file exists at the given path before the call, a header
    row ``app,site`` is written first; an existing file is appended to
    without a header.

    Args:
        OUTFILE: Path of the CSV file to append to.

    Returns:
        A ``(file object, csv.writer)`` tuple.  The file is left open; the
        caller is responsible for closing it.
    """
    # Must be checked before open(), because opening in 'a' mode creates the file.
    already_present = os.path.isfile(OUTFILE)
    handle = open(OUTFILE, 'a')
    out = csv.writer(handle)
    if already_present:
        return handle, out
    out.writerow(['app', 'site'])
    return handle, out
def openfile(infile):
    """Open a semicolon-delimited CSV file and return a reader over its rows.

    The file is opened in the mode the ``csv`` module expects on the running
    interpreter: binary (``'rb'``) on Python 2, text with ``newline=''`` on
    Python 3.

    Args:
        infile: Path of the CSV file to read.

    Returns:
        A ``csv.reader`` that yields one list of column strings per row.
        This function does not close the underlying file object.

    Raises:
        IOError / OSError: If ``infile`` cannot be opened.
    """
    import sys  # local import: only needed for the interpreter check below

    if sys.version_info[0] >= 3:
        # Python 3's csv module needs str rows and does its own newline handling.
        f = open(infile, 'r', newline='')
    else:
        # Python 2's csv module needs the file opened in binary mode.
        f = open(infile, 'rb')
    return csv.reader(f, delimiter=';')
def rewrite(app):
    """Derive a website domain from a dotted app identifier.

    The first two dot-separated labels of ``app`` are swapped, so
    ``com.example.client`` becomes ``example.com``.  The result is also
    printed to stdout, prefixed with ``site: ``.

    Args:
        app: Dotted app identifier string.

    Returns:
        ``<second label>.<first label>``, or ``''`` when ``app`` has no
        second label (for example no dot at all, or an empty string).
    """
    site = ''
    # findall returns each dot-free run interleaved with the empty matches
    # found at every dot and at the end of the string, e.g.
    # 'a.b.c' -> ['a', '', 'b', '', 'c', ''].  For an identifier that starts
    # with a label, index 0 is the first label and index 2 the second.
    match = re.findall(r"([^\.]*)", app)
    # The pattern always yields at least one (possibly empty) match, so a plain
    # non-empty check cannot protect the match[2] lookup; require three entries.
    if len(match) > 2:
        site = match[2] + "." + match[0]
    print("site: " + site)
    return site
def findsite(appname):
    """Collect the sites from the ``alexa`` file that match ``appname``.

    Every row of the file named by the module-level ``alexa`` is read through
    ``openfile``; column index 1 is stripped and lower-cased and kept when
    ``re.search(appname, site)`` finds a match.  ``appname`` is therefore
    interpreted as a regular-expression pattern.

    Args:
        appname: Pattern searched for inside each normalised site string.

    Returns:
        A set of the distinct matching site strings.
    """
    normalised = (row[1].strip().lower() for row in openfile(alexa))
    return {domain for domain in normalised if re.search(appname, domain)}
# Script body: for every row of the app list, append one output row made of
# (app id, third input column, website domain derived from the app id).
apps = openfile(apks)
# Open the output once for the whole run and always close it.  Calling
# writefile() inside the loop opened a fresh append handle for every row and
# never closed any of them.  Note that the output file (with its header) is
# now created even when the app list turns out to be empty.
f, writer = writefile('reverse-site.csv')
try:
    for app in apps:
        appid = app[0]
        print(appid)
        writer.writerow([appid, app[2], rewrite(appid)])
finally:
    f.close()