-
Notifications
You must be signed in to change notification settings - Fork 2
/
rdf_builder.py
84 lines (69 loc) · 3.33 KB
/
rdf_builder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import json
import os
from rdflib import Graph, URIRef, Literal
from rdflib.namespace import RDF, XSD
# URImap defines which types are being used for when they have not been provided
uriMap = {
"clinicalStudyType": URIRef('http://purl.obolibrary.org/obo/OPMI_0000418'),
"hasIdentifier": URIRef('http://semanticscience.org/resource/SIO_000671'),
"hasTitle": URIRef('http://purl.org/dc/elements/1.1/title'),
"partOf": URIRef('http://purl.obolibrary.org/obo/BFO_0000050'),
"investigation": URIRef('http://purl.obolibrary.org/obo/CTO_0000108'),
"hasLabel": URIRef('http://www.w3.org/2000/01/rdf-schema#Label'),
"hasPhase": URIRef('http://purl.obolibrary.org/obo/CTO_0000125'),
"hasStatus": URIRef('http://purl.obolibrary.org/obo/OPMI_0000622'),
"investigatesCondition": URIRef('http://investigatesCondition'),
"medicalCondition": URIRef('http://purl.obolibrary.org/obo/OPMI_0000282'),
}
def turn_to_rdf(g, json_filename):
with open(json_filename) as f:
d = json.load(f)
# this list of IDs will be added to a CSV output
id_list = []
for i in d:
id = i['ctid']
rdf_study = URIRef("https://clinicaltrials.gov/study/" + id)
g.add((rdf_study, RDF.type, uriMap['clinicalStudyType']))
g.add((rdf_study, uriMap['hasIdentifier'], Literal(id, datatype=XSD.string)))
# write the list of IDs to a CSV file that can be used downstream
id_list.append(id)
title = i['title']
g.add((rdf_study, uriMap['hasTitle'], Literal(title, datatype=XSD.string)))
cids = i['cids']
intervention = i['interventions']
rdfint = URIRef('http://' + str(hash(intervention)))
g.add((rdfint, uriMap['partOf'], rdf_study))
g.add((rdfint, RDF.type, uriMap['investigation']))
g.add((rdfint, uriMap['hasLabel'], Literal(intervention, datatype=XSD.string)))
for cid in cids:
g.add((rdfint, uriMap['hasIdentifier'], URIRef("https://pubchem.ncbi.nlm.nih.gov/compound/" + str(cid))))
try:
phase = i['phase']
g.add((rdf_study, uriMap['hasPhase'], Literal(phase, datatype=XSD.string)))
except KeyError:
print("Phase not found in " + id)
status = i['status']
g.add((rdf_study, uriMap['hasStatus'], Literal(status, datatype=XSD.string)))
try:
conditions = i['conditions']
if type(conditions) != list:
conditions = [conditions]
for con in conditions:
medical_condition = URIRef('http://medical_condition' + str(hash(con)))
g.add((medical_condition, RDF.type, uriMap['medicalCondition']))
g.add((medical_condition, uriMap['hasLabel'], Literal(con, datatype=XSD.string)))
g.add((rdf_study, uriMap['investigatesCondition'], medical_condition))
except Exception as e:
print(e)
print("No conditions in " + id)
# output a CSV that has all the NCT ids that can be used downstream
with open("pubchem_id_list.csv", "w") as f:
for i in id_list:
f.write(f"{i}\n")
def build_rdf_from_json(inputfolder, ttlfilename):
g = Graph()
filelist = os.listdir(inputfolder)
print(filelist)
for file in filelist:
turn_to_rdf(g, os.path.join(inputfolder, file))
g.serialize(destination=ttlfilename)