-
Notifications
You must be signed in to change notification settings - Fork 2
/
compute_overlap.py
61 lines (42 loc) · 1.75 KB
/
compute_overlap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import json
import string
from nltk import ngrams
def pct_novel_ngrams_in_y(x, y, nmax):
    """Return, for each n in 1..nmax, the percentage of n-grams of ``y``
    that do not occur in ``x`` (a novelty / abstractiveness measure).

    Parameters
    ----------
    x : str
        Reference text (e.g. the article body).
    y : str
        Candidate text (e.g. a title or heading).
    nmax : int
        Greatest n-gram order to consider.

    Returns
    -------
    dict
        Maps n -> percentage rounded to 1 decimal (float), or the string
        'NA' when ``y`` is too short to contain any n-gram of order n.
    """
    # Build the punctuation-stripping table once instead of twice per call.
    strip_punct = str.maketrans('', '', string.punctuation)
    x_tokens = x.translate(strip_punct).lower().split()
    y_tokens = y.translate(strip_punct).lower().split()
    percs = dict()
    for n in range(1, nmax + 1):
        # zip over n shifted slices yields exactly the order-n n-grams as
        # tuples — same result as nltk.ngrams, but pure stdlib.
        ngrams_x = set(zip(*(x_tokens[i:] for i in range(n))))
        ngrams_y = set(zip(*(y_tokens[i:] for i in range(n))))
        if not ngrams_y:
            # y has fewer than n tokens: no n-gram of this order exists.
            percs[n] = 'NA'
        else:
            novel = ngrams_y.difference(ngrams_x)
            percs[n] = round(100 * len(novel) / len(ngrams_y), 1)
    return percs
# = = = = =
# Script body: for every parsed document, compute the percentage of novel
# n-grams in the title ('t'), heading ('h'), and their concatenation
# ('t+h') relative to the article body, then dump all results to
# ./data/overlap/overlaps.json.
nmax = 4  # greatest n-gram order to consider
min_size = 20  # minimum article length (in tokens) to be analysed
path_to_docs = './data/docs/parsed/'
docnames = os.listdir(path_to_docs)
results = dict()
# Progress is printed roughly every 10% of the corpus. max(1, ...) guards
# against ZeroDivisionError when there are fewer than ~5 documents
# (round(len/10) would be 0 and `counter % 0` would crash).
progress_step = max(1, round(len(docnames) / 10))
for counter, docname in enumerate(docnames):
    with open(os.path.join(path_to_docs, docname), 'r', encoding='utf8') as file:
        doc = json.load(file)
    article = doc['article']
    title = doc['title']
    heading = doc['heading']
    # empty (or too short) articles won't have an entry in 'results'
    if len(article.split()) > min_size:
        to_save = dict()
        # whenever the field is too short to have at least one nmax-gram,
        # 'NA' is returned for that order
        to_save['t'] = pct_novel_ngrams_in_y(article, title, nmax)
        to_save['h'] = pct_novel_ngrams_in_y(article, heading, nmax)
        to_save['t+h'] = pct_novel_ngrams_in_y(article, title + ' ' + heading, nmax)
        results[docname] = to_save
    if counter % progress_step == 0:
        print(counter)
print(len(docnames) - len(results), 'too short documents')
with open('./data/overlap/overlaps.json', 'w', encoding='utf8') as file:
    json.dump(results, file, sort_keys=True, indent=4, ensure_ascii=False)