# extract_name_variants.py
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import Levenshtein as ls
import json
from os import path
import sys
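
# Assumed third-party dependencies (not pinned anywhere in this file):
# pandas, scikit-learn and the Levenshtein module, which is commonly
# installed from PyPI as python-Levenshtein.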
"""
Read a csv file with person names (first_name(s) last_name, e.g.
"William Ofori Atta") and election dates and return a dictionary that
maps each election date in the file to the list of names for that date.
"""
def read_names(infile):
    df = pd.read_csv(infile, header=0, sep='\t')
    name_dic = {}
    for idx, row in df.iterrows():
        # The keys are stored as str(row.date), so the membership test has
        # to use the string form as well; testing the raw value would reset
        # the list on every row when dates are read as integers.
        if str(row.date) not in name_dic:
            name_dic[str(row.date)] = []
        name_dic[str(row.date)].append(row.leader_name)
    return name_dic
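
# Illustrative layout of the name file read by read_names(): a tab-separated
# file whose column names (date, leader_name) are taken from the accessors
# above. The rows are made-up examples, not entries from the real
# leader_names.csv.
#
#   leader_name                 date
#   William Ofori Atta          197906
#   Krishna Prasad Bhattarai    199105
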
"""
Extract name variants for the given names from the speeches: keep every
vocabulary string whose similarity to a name is above the threshold
(and below 1.0, i.e. not identical to the name itself).
"""
def compute_name_variants(dic, full_name, df, threshold):
    # Extract speech text from the dataframe
    text = []
    for idx, row in df.iterrows():
        text.append(row.speech)
    # Extract the vocabulary (unigrams, bigrams, trigrams)
    # => keeps upper-cased letters but ignores punctuation.
    vectorizer = CountVectorizer(lowercase=False, ngram_range=(1, 3))
    X = vectorizer.fit_transform(text)
    # get_feature_names_out() replaces get_feature_names(), which has been
    # removed in newer scikit-learn releases.
    vocab = vectorizer.get_feature_names_out()
    # Now use python-Levenshtein to compute the similarity ratio between
    # a) the full name and b) the last name and all strings (unigrams,
    # bigrams and trigrams) in the vocabulary.
    new_variants = []
    for name in dic[full_name]:
        for v in vocab:
            sim = ls.ratio(name, v)
            if sim > threshold and sim < 1.0:
                if (v, sim) not in new_variants:
                    new_variants.append((v, sim))
    return new_variants
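
# Rough illustration of the similarity filter in compute_name_variants()
# (the ratio values are approximate and given for orientation only):
#
#   ls.ratio("Bhattarai", "Bhatarai")  -> ~0.94  kept with threshold 0.85
#   ls.ratio("Bhattarai", "Bhattarai") ->  1.0   skipped (identical string)
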

if __name__ == "__main__":
    # Input file with leader names and dates (year-month, e.g. 198612).
    # The dates specify the files from which we extract the name variants
    # (e.g. for 198612, we extract variants from the speeches in the
    # corpus file <corpus/198612.csv>).
    name_file = 'leader_names.csv'
    # Directory with speeches from which we extract the name variants.
    # Input files have file names yyyymm.csv (e.g. 198612.csv).
    # Each csv file has a header line <,speech>.
    corpus_path = 'corpus/'
    # Output file in json format, with the extracted name variants and
    # their similarity scores.
    results_file = 'name_variants.json'
    threshold = 0.85  # => for higher recall, decrease threshold
                      # => for higher precision, increase threshold
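
    # Illustrative layout of one corpus file (e.g. corpus/198612.csv); the
    # header <,speech> comes from the comment above, the speech text is
    # made up:
    #
    #   ,speech
    #   0,"Mr Speaker, I rise to support the motion ..."
    #   1,"The honourable member will recall that ..."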

    #####
    # First, we read in the leader names with election dates for
    # which we want to extract name variants. We use the election
    # dates to filter the corpus input used to extract name variants
    # (we only look for name variants in corpus files that are
    # relevant for the elections we are interested in).
    name_dic = read_names(name_file)
    old_variants = {}
    new_variants = {}
    for date in name_dic:
        for full_name in name_dic[date]:
            last_name = full_name.split(" ")[-1]
            # start looking for spelling variants of full_name and last_name
            # (e.g.: "Krishna Prasad Bhattarai", "Bhattarai")
            if full_name not in old_variants:
                old_variants[full_name] = [full_name, last_name]
                new_variants[full_name] = {}
            #####
            # Next, we read in the corpus file for this name item
            # (from csv files with speeches in the directory <corpus_path>
            # that were given in the time period specified by <date>).
            csv_path = corpus_path + date + '.csv'
            # Check if the file exists
            if path.exists(csv_path):
                try:
                    df = pd.read_csv(csv_path)
                except IOError as e:
                    print("I/O error({0}): {1}".format(e.errno, e.strerror))
                    continue
                except Exception:
                    # Skip this file rather than continuing with an undefined
                    # (or stale) dataframe.
                    print("Unexpected error:", sys.exc_info()[0], csv_path)
                    continue
                print("\tExtract name variants for: ", full_name, "\tfrom: ", date)
                new = compute_name_variants(old_variants, full_name, df, threshold)
                for next_var, score in new:
                    if next_var not in new_variants[full_name]:
                        new_variants[full_name][next_var] = score
            else:
                print("ERROR: file path doesn't exist: ", csv_path)

    # Write extracted spelling variants to json file
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump(new_variants, f, ensure_ascii=False, indent=3)
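
# Illustrative shape of the resulting name_variants.json (the variant strings
# and scores are made-up examples, not real output):
#
#   {
#      "Krishna Prasad Bhattarai": {
#         "Bhatarai": 0.94,
#         "Bhattaraai": 0.95
#      }
#   }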