-
Notifications
You must be signed in to change notification settings - Fork 0
/
oncotree_fhir.py
469 lines (400 loc) · 15.4 KB
/
oncotree_fhir.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
"""convert Oncotree to HL7 FHIR CodeSystem resources
"""
import sys
import json
import argparse
import os
import textwrap
from csv import DictWriter
from typing import Dict, List, Tuple
import requests
from fhir.resources.codesystem import (
CodeSystem,
CodeSystemConcept,
CodeSystemConceptProperty,
)
from tqdm import tqdm
def parse_args(print_args: bool = True):
    """Build the command line parser, parse the arguments and validate them.

    Validation needs the list of versions known to the endpoint, so this
    function performs a request against the Oncotree API (via
    ``get_versions``). On invalid input ``parser.error`` is called, which
    prints a usage message and exits the process.

    Args:
        print_args (bool, optional): If true, the arguments will
            be printed to stdout after parsing. Defaults to True.

    Returns:
        argparse.Namespace: the parsed arguments as a Namespace object
    """
    parser = argparse.ArgumentParser(
        prog="python oncotree-fhir.py",
        description="convert Oncotree to a HL7 FHIR CodeSystem resource",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--version",
        "-v",
        help="version of Oncotree to download",
        default="oncotree_latest_stable",
    )
    parser.add_argument(
        "--url",
        "-u",
        help="Endpoint for Oncotree API",
        default="http://oncotree.mskcc.org/api",
        # normalise the endpoint so that paths can be appended with a single "/"
        type=lambda x: x.rstrip("/"),
    )
    parser.add_argument(
        "--output",
        "-o",
        help="output file in JSON format. $version is replaced with the version string in filename",
        default=os.path.join(".", "$version.json"),
        type=str,
    )
    parser.add_argument(
        "--canonical",
        help="canonical url of the CodeSystem to generate. For the undated versions, the version string will be appended to this URL",
        default="http://oncotree.mskcc.org/fhir/CodeSystem",
    )
    parser.add_argument(
        "--valueset",
        help="canonical url of the implicit ValueSet with all codes to generate",
        default="http://oncotree.mskcc.org/fhir/ValueSet",
    )
    parser.add_argument(
        "--write-tsv",
        help="write the CodeSystem as a TSV file suitable for import into CSIRO's Snapper tool, helpful when creating ConceptMaps or ValueSets referencing Oncotree.",
        action="store_true",
    )
    parser.add_argument(
        "--tsv-output",
        default=os.path.join(".", "$version.tsv"),
        help="output file in TSV format (if --write-tsv given). $version is replaced with the version string in filename",
    )
    parser.add_argument(
        "action",
        default="convert",
        nargs="?",
        choices=["versions", "convert", "convert-all"],
        help="action to carry out",
    )
    args = parser.parse_args()

    if print_args:
        for arg in vars(args):
            print(f" - {arg}: {getattr(args, arg)}")

    # The requested version has to be one the endpoint actually offers.
    known_identifiers = {x["api_identifier"] for x in get_versions(args)}
    if args.version not in known_identifiers:
        parser.error(
            f"version '{args.version}' is not known to the endpoint {args.url}. Use the 'versions' operation to list the available versions"
        )

    if args.action == "convert-all":
        # Every version is written to its own file; without the placeholder
        # each conversion would overwrite the result of the previous one.
        if "$version" not in args.output:
            parser.error(
                "When converting all the available versions of Oncotree, the '--output' parameter must contain a placeholder '$version' that is replaced with the respective version string."
            )
        if args.write_tsv and "$version" not in args.tsv_output:
            parser.error(
                "When converting all the available versions of Oncotree with '--write-tsv', the '--tsv-output' parameter must contain a placeholder '$version' that is replaced with the respective version string."
            )
    return args
class TreeNode:
    """A labelled node of the tree that is pretty-printed for the version listing."""

    def __init__(self, value: str = None, children: 'List[TreeNode]' = None):
        """Create a node, optionally with an initial list of children.

        Args:
            value (str, optional): the text shown for this node. Defaults to None.
            children (List[TreeNode], optional): the child nodes. When omitted,
                every node gets its own fresh empty list. Defaults to None.
        """
        self.value = value
        # A new list per instance avoids sharing one list between nodes.
        self.children = [] if children is None else children
def pprint_tree(node: "TreeNode", file=None, _prefix="", _last=True, width=70):
    """Pretty-print a tree of nodes, from https://vallentin.dev/2016/11/29/pretty-print-tree

    Each node is printed on its own line behind a branch marker; values
    longer than ``width`` are wrapped and the continuation lines are
    indented so that they line up below the first line of the value.

    Args:
        node (TreeNode): the root node; needs ``value`` and ``children`` attributes
        file (optional): File to pass to print(). Defaults to None (stdout).
        _prefix (str, optional): internal for recursive calls. Defaults to "".
        _last (bool, optional): internal for recursive calls. Defaults to True.
        width (int, optional): maximum width of a node's text before it is
            wrapped. Applies to the whole subtree. Defaults to 70.
    """

    def wrap_to_width(val: str) -> str:
        """Wrap val to the requested width, indenting continuation lines."""
        lines = textwrap.wrap("" if val is None else str(val), width=width)
        if not lines:
            # textwrap.wrap returns an empty list for empty/blank text
            return ""
        if len(lines) == 1:
            return lines[0]
        # continuation lines start below the text, i.e. after prefix + 3-column marker
        continuation = textwrap.indent(
            "\n".join(lines[1:]), " " * (len(_prefix) + 3)
        )
        return lines[0] + "\n" + continuation

    print(
        _prefix,
        "`- " if _last else "|- ",
        wrap_to_width(node.value),
        sep="",
        file=file,
    )

    # Branch markers are three columns wide, so the prefix handed to the
    # children has to grow by three columns as well to keep the tree aligned.
    child_prefix = _prefix + ("   " if _last else "|  ")
    last_index = len(node.children) - 1
    for i, child in enumerate(node.children):
        # pass width on, otherwise the subtree silently falls back to the default
        pprint_tree(child, file, child_prefix, i == last_index, width)
def get_versions(args: argparse.Namespace) -> List[Dict]:
    """get the list of versions from the respective OncoTree endpoint

    Args:
        args (argparse.Namespace): the command line args; ``args.url`` is the
            base URL of the API (without trailing slash)

    Returns:
        List[Dict]: the list of versions as a (JSON) dict, sorted by
        ``release_date`` with the newest release first.

    Raises:
        requests.RequestException: if the endpoint cannot be reached in time
            or answers with an HTTP error status.
    """
    endpoint = f"{args.url}/versions"
    # requests never times out on its own; fail instead of hanging forever
    response = requests.get(endpoint, timeout=60)
    # surface HTTP errors directly instead of failing later while parsing/sorting
    response.raise_for_status()
    available = response.json()
    available.sort(key=lambda x: x["release_date"], reverse=True)
    return available
def convert_oncotree(args: argparse.Namespace, version: str = None) -> CodeSystem:
    """convert the oncotree system with given version to FHIR

    Downloads the tumor types of the requested version, dumps the raw
    response to ``./oncotree.tmp.json`` and builds a CodeSystem whose
    concepts are created by ``convert_concept``.

    Args:
        args (argparse.Namespace): the command line args
        version (str, optional): the version string Defaults to None. If not specified, args.version is used

    Returns:
        CodeSystem: the code system in FHIR R4 format

    Raises:
        requests.RequestException: if the endpoint cannot be reached in time
            or answers with an HTTP error status.
    """
    if version is None:
        version = args.version
    endpoint = f"{args.url}/tumorTypes?version={version}"
    # requests never times out on its own; fail instead of hanging forever
    rx = requests.get(endpoint, timeout=120)
    rx.raise_for_status()
    # parse the (potentially large) response once and reuse the result
    tumor_types = rx.json()
    # keep a copy of the raw API response next to the script
    with open(os.path.join(".", "oncotree.tmp.json"), "w") as f:
        json.dump(tumor_types, f, indent=2)

    date_of_version = date_for_version_string(version)
    valueset_url = args.valueset.rstrip("/")
    codesystem_url = args.canonical.rstrip("/")
    name = "oncotree"
    title = "OncoTree"
    if version in [
        "oncotree_latest_stable",
        "oncotree_candidate_release",
        "oncotree_development",
        "oncotree_legacy_1.1",
    ]:
        # undated versions get their own "snapshot" canonical URLs and names
        fhir_version = version.replace("_", "-")
        codesystem_url += "/" + "snapshot"
        valueset_url += "/" + "snapshot"
        name = "oncotree-snapshot"
        title = "OncoTree Snapshot"
    else:
        fhir_version = version.replace("oncotree_", "").replace("_", "")
    print(
        f"getting {version} (released {date_of_version}) from {endpoint}")
    print()

    json_dict = {
        "resourceType": "CodeSystem",
        "id": version.replace("_", "-"),
        "url": codesystem_url,
        "valueSet": valueset_url,
        "status": "draft",
        "content": "complete",
        "name": name,
        "title": title,
        "version": fhir_version,
        "date": date_of_version,
        "hierarchyMeaning": "is-a",
        "property": [
            {
                # the concepts carry a "parent" property, so it has to be
                # declared here like every other property in use
                "code": "parent",
                "uri": "http://hl7.org/fhir/concept-properties#parent",
                "description": "Parent concept in the Oncotree hierarchy",
                "type": "code",
            },
            {
                "code": "color",
                "description": "Color in the Oncotree Visualisation",
                "type": "string",
            },
            {
                "code": "level",
                "description": "Level in the Oncotree hierarchy",
                "type": "integer",
            },
            {
                "code": "umls",
                "description": "Linked UMLS concept",
                "type": "string",
            },
            {
                "code": "nci",
                "description": "Linked NCI concept",
                "type": "string",
            },
        ],
        "concept": [],
    }
    print(json.dumps(json_dict))
    cs = CodeSystem(json_dict)
    print()
    print("Converting concepts...")
    # flush before tqdm starts drawing its progress bar
    sys.stdout.flush()
    for concept in tqdm(tumor_types):
        cs.concept.append(convert_concept(concept))
    return cs
def convert_concept(oncotree_concept: Dict) -> CodeSystemConcept:
    """convert the oncotree concept to a FHIR R4 CodeSystem concept

    The concept always gets a ``level`` property. ``color`` and ``parent``
    are added when present and not null; ``umls`` and ``nci`` are added when
    the concept lists at least one external reference of that kind.

    Args:
        oncotree_concept (Dict): the element from the Oncotree API to convert.
            Must contain ``code``, ``name`` and ``level``.

    Returns:
        CodeSystemConcept: the element in FHIR R4, with properties
    """
    concept = CodeSystemConcept(
        {
            "code": oncotree_concept["code"],
            "display": oncotree_concept["name"],
            "property": [],
        }
    )
    concept.property.append(
        CodeSystemConceptProperty(
            {"code": "level", "valueInteger": oncotree_concept["level"]}
        )
    )

    # optional scalar properties: (key in the API response, FHIR value field)
    for key, value_field in (("color", "valueString"), ("parent", "valueCode")):
        value = oncotree_concept.get(key)
        if value is not None:
            concept.property.append(
                CodeSystemConceptProperty({"code": key, value_field: value})
            )

    # a missing or null "externalReferences" entry is treated as "no references"
    external_references = oncotree_concept.get("externalReferences") or {}
    for source, property_code in (("UMLS", "umls"), ("NCI", "nci")):
        references = external_references.get(source)
        if not references:
            # an empty list would yield an empty string, which is not a valid value
            continue
        concept.property.append(
            CodeSystemConceptProperty(
                {
                    "code": property_code,
                    # there is at least one concept, SRCCR, that has multiple UMLS and NCI references
                    "valueString": ", ".join(references),
                }
            )
        )
    return concept
def write_codesystem(args: argparse.Namespace, cs: CodeSystem, version: str = None):
    """Serialise the code system as indented JSON to the file configured in args.

    Args:
        args (argparse.Namespace): the command line args; ``args.output`` is the
            output file name template
        cs (CodeSystem): the FHIR code system to write
        version (str): the version string used for the file name. Default to None.
            If not specified, args.version is used.
    """
    effective_version = args.version if version is None else version
    # only the resolved path is needed here, not the plain file name
    target_path = sanitize_filename(args.output, effective_version)[1]
    with open(target_path, "w") as json_file:
        json.dump(cs.as_json(), json_file, indent=2)
    print(f"Wrote output to {target_path}")
def sanitize_filename(fn: str, version: str) -> Tuple[str, str]:
    """Resolve an output file name template into a file name and the path to write to.

    ``~`` is expanded and every ``$version`` placeholder is replaced. Files
    located in the current working directory keep their name as the path;
    for files in any other directory the directory part is made absolute.

    Args:
        fn (str): the filename to sanitize
        version (str): the version string to add, replacing $version

    Returns:
        Tuple[str, str]: (filename, filepath)
    """
    filename = os.path.expanduser(fn).replace("$version", version)
    containing_dir = os.path.dirname(os.path.abspath(filename))

    # Guard clause: a file in the working directory is used as given.
    if containing_dir == os.path.abspath("."):
        return filename, filename

    absolute_dir = os.path.abspath(os.path.dirname(filename))
    return filename, os.path.join(absolute_dir, os.path.basename(filename))
def date_for_version_string(
    version_string: str, available_versions: List[Dict] = None
) -> str:
    """find the date a version was released from the versions API response

    Args:
        version_string (str): the version string to look up
        available_versions (List[Dict], optional): the version entries to
            search, each with the keys ``api_identifier`` and ``release_date``.
            Defaults to None, in which case the module-level ``versions``
            list (set when the file is run as a script) is used.

    Returns:
        str: the ``release_date`` value of the matching version entry

    Raises:
        IndexError: if no entry matches ``version_string``
    """
    if available_versions is None:
        # backward-compatible fallback for callers that rely on the global list
        available_versions = versions
    for candidate in available_versions:
        if candidate["api_identifier"] == version_string:
            return candidate["release_date"]
    # IndexError is what the previous list-indexing lookup raised as well
    raise IndexError(
        f"version '{version_string}' was not found in the list of available versions"
    )
def print_versions(versions: List[Dict], endpoint: str = None):
    """print all available version from the given oncotree endpoint as a pretty tree

    The versions are grouped into visible and invisible ones; every version
    is shown with its release date and description.

    Args:
        versions (List[Dict]): the version entries to pretty-print, each with
            the keys ``api_identifier``, ``release_date``, ``description``
            and ``visible``
        endpoint (str, optional): the URL named in the heading of the tree.
            Defaults to None, in which case it is derived from the
            module-level ``args`` (set when the file is run as a script).
    """
    if endpoint is None:
        # backward-compatible fallback for callers that rely on the global args
        endpoint = f"{args.url}/versions"

    def print_version_strings(prefix: str, versions: List[Dict]) -> "TreeNode":
        """helper function to build the subtree for versions with a given, common, prefix

        Args:
            prefix (str): the prefix, for example "2020 versions" or "current releases", that the releases are grouped under
            versions (List[Dict]): the list of versions to pretty-print under the prefix

        Returns:
            TreeNode: the tree node containing the prefix with respective child nodes
        """
        root = TreeNode(prefix)
        for version in versions:
            root.children.append(
                TreeNode(
                    version["api_identifier"],
                    [
                        TreeNode(f"released {version['release_date']}"),
                        TreeNode(version["description"]),
                    ],
                )
            )
        return root

    visible_versions = [x for x in versions if x["visible"]]
    invisible_versions = [x for x in versions if not x["visible"]]

    root_node = TreeNode(f"available versions from {endpoint}")
    root_node.children.append(
        print_version_strings("current/visible versions", visible_versions)
    )
    root_node.children.append(
        print_version_strings("invisible versions", invisible_versions)
    )
    pprint_tree(root_node)
def write_tsv_codesystem(args: argparse.Namespace, cs: "CodeSystem", version: str) -> None:
    """write the codesystem to a TSV file, as defined by the args (or return immediately if no TSV files should be written)

    The file has the columns code, label and parent (in this order), one
    row per concept sorted by code, and no header row.

    Args:
        args (argparse.Namespace): the command line args; ``args.write_tsv``
            switches the export on, ``args.tsv_output`` is the file name template
        cs (CodeSystem): the FHIR CS to write
        version (str): the version string of this file

    Returns:
        None
    """
    if not args.write_tsv:
        return None

    fieldnames = ["code", "label", "parent"]

    def parent_for_code(c):
        """helper function to return the parent of a FHIR code system concept

        Args:
            c: the concept to extract the property for

        Returns:
            str: the code of the parent, or None
        """
        parents = [p for p in (c.property or []) if p.code == "parent"]
        if parents:
            return parents[0].valueCode
        return None

    tsv_codes = [
        {"code": c.code, "label": c.display, "parent": parent_for_code(c)}
        for c in cs.concept
    ]
    tsv_codes.sort(key=lambda c: c["code"])

    tsv_filename, tsv_path = sanitize_filename(args.tsv_output, version)
    # newline="" lets the csv module control line endings itself (otherwise
    # rows end in "\r\r\n" on Windows); labels may contain non-ASCII characters
    with open(tsv_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = DictWriter(csvfile, fieldnames=fieldnames, delimiter="\t")
        writer.writerows(tsv_codes)
    print(f"wrote TSV to {tsv_filename}")
if __name__ == "__main__":
    # "args" and "versions" are deliberately module-level names: other
    # functions in this file read them as globals.
    args = parse_args()
    print("\n")
    versions = get_versions(args)

    if args.action == "convert":
        # single conversion of the version selected on the command line
        code_system = convert_oncotree(args)
        write_codesystem(args, code_system)
        write_tsv_codesystem(args, code_system, args.version)
    elif args.action == "convert-all":
        # one conversion per version known to the endpoint
        for api_identifier in (entry["api_identifier"] for entry in versions):
            print(f"Getting version {api_identifier}")
            code_system = convert_oncotree(args, api_identifier)
            write_codesystem(args, code_system, api_identifier)
            write_tsv_codesystem(args, code_system, api_identifier)
            print("\n----\n")
            sys.stdout.flush()
    elif args.action == "versions":
        print_versions(versions)