-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract.py
45 lines (31 loc) · 1.63 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import spacy
def _normalize_entity_text(raw_text):
    """Return a canonical, ASCII-only form of an entity's surface text.

    The text is lowercased (so case variants collapse into one entry),
    whitespace runs are collapsed to single spaces, non-ASCII characters are
    dropped, and every remaining character other than letters, digits,
    spaces, ``-`` and ``/`` is removed. Any leading run of ``-``, ``/`` and
    spaces is stripped afterwards.

    The result may be an empty string when nothing usable remains.
    """
    # Normalize whitespace before the ASCII filter so that newlines, tabs and
    # non-breaking spaces become word separators instead of being deleted.
    collapsed = ' '.join(raw_text.lower().split())
    ascii_text = collapsed.encode('ascii', 'ignore').decode('ascii')
    kept = ''.join(ch for ch in ascii_text if ch.isalnum() or ch in ' -/')
    # Removing punctuation can leave doubled spaces ("a , b"), so collapse
    # again, then drop every leading separator rather than just the first.
    return ' '.join(kept.split()).lstrip('-/ ')


def _collect_entities(doc):
    """Group the unique normalized entity texts of *doc* by label.

    Returns a ``(chemicals, diseases)`` pair of lists, each in order of first
    occurrence. Entities with any other label, and entities whose text
    normalizes to an empty string, are skipped.
    """
    by_label = {'CHEMICAL': [], 'DISEASE': []}
    # Set of (label, text) pairs for constant-time duplicate checks; the
    # lists above preserve first-seen order.
    seen = set()
    for ent in doc.ents:
        bucket = by_label.get(ent.label_)
        if bucket is None:
            continue
        cleaned = _normalize_entity_text(ent.text)
        key = (ent.label_, cleaned)
        if cleaned and key not in seen:
            seen.add(key)
            bucket.append(cleaned)
    return by_label['CHEMICAL'], by_label['DISEASE']


# Load the NER model
nlp = spacy.load('en_ner_bc5cdr_md')
# Raise the input limit above the 1,000,000-character maximum reported by
# "ValueError: [E088] Text of length ... exceeds maximum of 1000000." so that
# texts of up to 1,200,000 characters are accepted.
nlp.max_length = 1200000
text = """Aspirin is a nonsteroidal antiinflammatory drug and this drug is used as an analgesic, antipyretic, and in the treatment of rheumatoid arthritis and acute rheumatic fever.
Aspirin at high dosages may increase vitamin C excretion in the urine and decrease vitamin C absorption in the small intestine.
Vitamin C may protect the stomach mucosa from aspirin-induced injury, presumably by inhibiting inducible nitric oxide synthase expression(Konturek et al., 2006).
Further, supplemental vitamin E in high amounts may enhance aspirin’s antiplatelet effects"
"""
doc = nlp(text)
chemicals, diseases = _collect_entities(doc)
print('Chemicals', chemicals)
print('Diseases', diseases)