-
Notifications
You must be signed in to change notification settings - Fork 0
/
ResumeParserModel.py
63 lines (48 loc) · 1.66 KB
/
ResumeParserModel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json
from sklearn.model_selection import train_test_split
def get_spacy_doc(file, data):
    """Convert annotated (text, annotations) pairs into a spaCy DocBin for NER training.

    Parameters
    ----------
    file : writable text file
        Log file; spans that cannot be aligned to token boundaries are
        written here as "[start, end] <text>" lines.
    data : iterable of (str, dict)
        Each item is (text, annot) where annot["entities"] is a list of
        (start, end, label) character-offset triples.

    Returns
    -------
    DocBin
        One Doc per input text, with successfully aligned, non-overlapping
        entity spans set on it.
    """
    nlp = spacy.blank("en")
    db = DocBin()
    for text, annot in tqdm(data):
        doc = nlp.make_doc(text)
        ents = []
        # Character indices already claimed by an earlier entity; a set gives
        # O(1) membership tests (the original used a list, O(n) per check,
        # and extended it twice with the same range).
        claimed_indices = set()
        for start, end, label in annot["entities"]:
            span_range = range(start, end)
            # Skip any entity that overlaps one we have already processed.
            if any(idx in claimed_indices for idx in span_range):
                continue
            # Claim the range before the alignment check, matching the
            # original behavior: misaligned spans still block later overlaps.
            claimed_indices.update(span_range)
            # With alignment_mode='strict', char_span returns None (it does
            # not raise) when the offsets don't align to token boundaries,
            # so no try/except is needed here.
            span = doc.char_span(start, end, label=label, alignment_mode='strict')
            if span is None:
                err_data = str([start, end]) + " " + str(text) + '\n'
                file.write(err_data)
            else:
                ents.append(span)
        try:
            doc.ents = ents
            db.add(doc)
        except ValueError:
            # spaCy raises ValueError for invalid/conflicting spans; drop
            # this document rather than aborting the whole conversion.
            pass
    return db
# Load the annotated CV dataset and split it 70/30 into train and test sets.
# The original leaked the dataset file handle (it was opened, then the
# variable was rebound without closing); context managers fix that.
with open("Resources/Datasets/dataset.json", mode='r', encoding='utf-8') as dataset_file:
    cv_data = json.load(dataset_file)
train, test = train_test_split(cv_data, test_size=0.3)
# print(len(train), '\n', train[0], '\n', len(test), '\n', test[0])
# Both conversions log their misaligned spans to the same error file.
with open('Resources/Models/train_file.txt', mode='w', encoding='utf-8') as file:
    db = get_spacy_doc(file, train)
    db.to_disk('Resources/Models/train_data.spacy')
    db = get_spacy_doc(file, test)
    db.to_disk("Resources/Models/test_data.spacy")