-
Notifications
You must be signed in to change notification settings - Fork 8
/
data_count_aol.py
36 lines (29 loc) · 854 Bytes
/
data_count_aol.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/env python3
import csv
from gzip import open
import string
import re
# Regex character class matching one ASCII punctuation or whitespace character.
# The characters go through re.escape so that ']', '\', '^' and '-' stay
# literal members of the class.  Unescaped, the '\]' pair that occurs inside
# string.punctuation is read as an escaped ']' and the backslash itself is
# silently left out of the separator set.
separators = '[{}]'.format(re.escape(string.punctuation + string.whitespace))
# Compiled once; the pattern is applied to every row of every file.
_separator_re = re.compile(separators)


def split_items(query):
    """Return the set of non-empty tokens of *query*.

    The query is split on every punctuation or whitespace character.
    re.split produces empty strings for leading, trailing or adjacent
    separators; those are dropped so that '' is never counted as an item.
    """
    return {token for token in _separator_re.split(query) if token}


def count_aol(data_dir='data/aol', n_files=10, progress_every=1000):
    """Collect the distinct ids, queries and query tokens of the log files.

    Reads ``<data_dir>/aol-01.txt.gz`` up to ``aol-<n_files>.txt.gz``
    (two-digit, zero-padded index).  Each file is read as gzip-compressed,
    tab-separated text whose first row is a header.  Only the first two
    columns of a row are used: the id (parsed with int()) and the query.

    Args:
        data_dir: Directory that holds the ``aol-NN.txt.gz`` files.
        n_files: Number of consecutive files to read, starting at 01.
        progress_every: Print a '\\r'-prefixed progress line each time the
            number of distinct ids reaches a multiple of this value.
            A falsy value disables progress output.

    Returns:
        A tuple ``(records, queries, items)`` of sets: the distinct integer
        ids, the distinct query strings, and the distinct query tokens.

    Raises:
        OSError: If one of the files cannot be opened or decompressed.
        ValueError: If a row has fewer than two columns or its first
            column is not an integer.
    """
    records = set()
    queries = set()
    items = set()
    for index in range(1, n_files + 1):
        filename = '{}/aol-{}.txt.gz'.format(data_dir, str(index).zfill(2))
        # `open` is gzip.open here (imported at the top of the file), not the
        # builtin.  newline='' lets the csv module do its own line handling.
        with open(filename, 'rt', newline='') as file:
            # NOTE(review): the reader uses csv's default quoting, so a field
            # that starts with '"' is parsed as a quoted field.  If the logs
            # are raw unquoted TSV, quoting=csv.QUOTE_NONE may be wanted.
            reader = csv.reader(file, delimiter='\t')
            next(reader, None)  # skip the header row; tolerate an empty file
            for line in reader:
                raw_id, query = line[:2]
                anon_id = int(raw_id)
                queries.add(query)
                items |= split_items(query)
                if anon_id in records:
                    continue
                records.add(anon_id)
                # Report only when a *new* id reaches a milestone, instead of
                # on every row read while the count sits on that milestone.
                if progress_every and len(records) % progress_every == 0:
                    output = '\rrecords: {} queries: {} items: {}'.format(
                        len(records), len(queries), len(items))
                    print(output, end='')
    return records, queries, items


if __name__ == '__main__':
    records, queries, items = count_aol()
    print()  # terminate the '\r'-overwritten progress line
    print("records:", len(records))
    print("queries:", len(queries))
    print("items:", len(items))