import sys
import codecs
import os
#import glob
import json
import gc
import pickle
#import re
from optparse import OptionParser
from datetime import datetime
from lxml import etree
from io import StringIO, BytesIO
import random
from copy import deepcopy
import requests
import subprocess
import ApiCommon
from ApiCommon import log_it
from fields_utils import FldDef
from namespace_registry import NamespaceRegistry as ns_reg
from rdf_builder import RdfBuilder
from organizations import KnownOrganizations, Organization
from terminologies import Terminologies, Terminology
from ontology_builder import OntologyBuilder
from databases import Database, Databases
from sexes import Sexes, Sex
# do not remove the imports below, parsers are called dynamically
from ncbi_taxid_parser import NcbiTaxid_Parser
from chebi_parser import Chebi_Parser
from cl_parser import Cl_Parser
from uberon_parser import Uberon_Parser
from ncit_parser import Ncit_Parser
from ordo_parser import Ordo_Parser
from vbo_parser import Vbo_Parser
from rs_parser import Rs_Parser
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_solr_search_url(verbose=False):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
value = os.getenv("CELLAPI_SOLR_SEARCH_URL","http://localhost:8983/solr/pamcore1/select")
if verbose:
log_it("INFO:", "reading / getting default for env variable", "CELLAPI_SOLR_SEARCH_URL", value)
return value
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_all_solr_params(fldDef, query="id:HeLa", fields="ac", sort="score desc", start=0, rows=1000):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
solr_q = fldDef.normalize_solr_q(query)
solr_fl = "" if fields is None or len(fields.strip())==0 else fldDef.normalize_solr_fl(fields)
solr_sort = "score desc" if sort is None or len(sort.strip())==0 else fldDef.normalize_solr_sort(sort)
params = {
"fl":solr_fl,
"indent": True,
"q.op": "AND",
"q": solr_q ,
"start": str(start),
"rows": str(rows),
"defType": "edismax",
"sort": solr_sort,
"wt" : "json",
"qf": "ac^16 id^16 sy^8 text", # acas dr rx ww cc str di ox hi oi sx ag ca dt ch text",
"pf": "ac^160 id^160 sy^80 text" # acas dr rx ww cc str di ox hi oi sx ag ca dt ch text"
}
return params
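# Usage sketch for the two helpers above (hypothetical query values; assumes a
# running Solr core reachable via CELLAPI_SOLR_SEARCH_URL and a FldDef instance):
#   params = get_all_solr_params(fldDef, query="id:HeLa", fields="ac,id", rows=10)
#   response = requests.get(get_solr_search_url(), params=params, headers={"Accept": "application/json"})
#   docs = response.json()["response"]["docs"]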
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_json_object(el):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
obj = build_jsonable_object(el)
return {el.tag: obj}
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def seen_as_list(tag):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# case 1: should be renamed to str-profile in the future for more clarity
if tag == "str-list" : return False
# case 2: general case
if tag.endswith("-list") : return True
# case 3: special cases
if tag in ["same-origin-as", "site", "derived-from"]: return True
# case 4: default
return False
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_items_prop_name(tag):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
if tag == "site": return "xref-list"
return "items"
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def build_jsonable_object(el):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# create object, add attributes and text node as obj properties
obj = dict()
if el.text is not None:
txt = el.text.strip()
if len(txt) > 0: obj["value"]=txt
for attr in el.keys():
obj[attr]=el.get(attr)
    # an author-list contains a choice of persons and consortia; the element tag is turned into the "type" property
if el.tag in ["person", "consortium"]:
obj["type"] = el.tag
    # XML elements containing a list of sub-elements are expected to have names ending with '-list'
    # but as always there are some exceptions... which are handled in seen_as_list()
if seen_as_list(el.tag):
# Recursively add child objects in items list
items_prop = get_items_prop_name(el.tag)
obj[items_prop]= list()
for child_el in el:
# - - - - - - - with object name encapsulation - - - - - - - (change 1)
#child_obj = dict()
#child_obj[child_el.tag]=build_jsonable_object(child_el)
# - - - - - - - - - - - - or without - - - - - - - - - - - -
child_obj = build_jsonable_object(child_el)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
obj[items_prop].append(child_obj)
# Simplify list having only an "items" property (almost all the cases except for el "site")
if len(obj)==1 and "items" in obj: return obj["items"]
else:
# Recursively add child objects in object properties
for child_el in el:
obj[child_el.tag]=build_jsonable_object(child_el)
# Simplify object having only a "value" property - - - - - - - - (change 2)
if len(obj)==1 and "value" in obj: return obj["value"]
return obj
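# Illustrative sketch of the conversion above (hypothetical XML fragment):
#   <name-list><name type="identifier">HeLa</name></name-list>
# becomes, after the "items"-only simplification:
#   [{"value": "HeLa", "type": "identifier"}]
# while an element carrying only a text node collapses to its bare string value.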
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def wait_for_input(msg):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
print(msg)
    sys.stdin.readline()  # block until the user hits <enter>
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def read_txt_cell_line(f_in):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
lines = list()
while True:
line = f_in.readline()
if line == "": return None # <-- end of file
lines.append(line)
if line[0:2] == "//": return lines # <-- end of record
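# A record is returned as its raw lines, up to and including the "//" terminator,
# e.g. (hypothetical): ["AC   CVCL_xxxx\n", "ID   Some-cell-line\n", "//\n"]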
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def read_txt_ref(f_in):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
lines = list()
while True:
line = f_in.readline()
if line == "": return None # <-- end of file
if line[0:2] == "**": continue # <-- ignore comments in the header
lines.append(line)
if line[0:2] == "//": return lines # <-- end of record
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def save_txt_refs(input_file):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
t0 = datetime.now()
output_file = ApiCommon.RF_TXT_FILE
log_it("INFO:", "Start saving txt references in", output_file)
rf_dict = dict()
f_in = open(input_file, "r")
f_out = open(output_file, "wb")
no = 0
while True:
# read cell line record as a list of lines
lines = read_txt_ref(f_in)
        if lines is None: break
no += 1
if no % 10000 == 0: log_it("INFO:", "Saving reference", no)
# build record as bytes and write them
pos = f_out.tell()
rec = "".join(lines)
bytes = rec.encode("utf-8")
f_out.write(bytes)
size = len(bytes)
# find reference identifier(s)
ids = list()
for line in lines:
if line[0:5] == "RX ":
tmp_list = line[5:].strip().split("; ")
for id in tmp_list:
clean_id = id.strip()
if clean_id.endswith(";") : clean_id = clean_id[:-1]
clean_id = clean_id.strip()
if len(clean_id) > 0: ids.append(clean_id)
break
if len(ids) == 0:
log_it("ERROR: no ids for record", rec)
sys.exit()
# save info for index
for id in ids: rf_dict[id] = { "txt_pos": pos, "txt_size": size }
log_it("INFO:", "Saving reference", no)
f_in.close()
f_out.close()
log_it("INFO:", "Saved txt reference, count:" , len(rf_dict), duration_since=t0)
return rf_dict
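# The returned index maps each identifier found on RX lines (e.g. "PubMed=1234567",
# hypothetical) to the byte offset and size of its record, so that
# get_txt_reference() below can seek() straight into the flat file.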
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def save_txt_cell_lines(input_file):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
t0 = datetime.now()
output_file = ApiCommon.CL_TXT_FILE
log_it("INFO:", "Start saving txt cell_lines in", output_file)
cl_dict = dict()
f_in = open(input_file, "r")
f_out = open(output_file, "wb")
# skip header
while True:
if "__________" in f_in.readline(): break
no = 0
while True:
# read cell line record as a list of lines
lines = read_txt_cell_line(f_in)
        if lines is None: break
no += 1
if no % 10000 == 0: log_it("INFO:", "Saving cell line", no)
# build record as bytes and write them
pos = f_out.tell()
rec = "".join(lines)
bytes = rec.encode("utf-8")
f_out.write(bytes)
size = len(bytes)
# find accession value
ac = None
for line in lines:
if line[0:5] == "AC ":
ac = line[5:].strip()
break
        if ac is None:
log_it("ERROR: no AC for record", rec)
sys.exit()
# save info for index
cl_dict[ac] = { "txt_pos": pos, "txt_size": size }
log_it("INFO:", "Saving cell line", no)
f_in.close()
f_out.close()
log_it("INFO:", "Saved txt cell lines, count:" , len(cl_dict), duration_since=t0)
return cl_dict
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def save_xml_cell_lines(bigxml_root):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
t0 = datetime.now()
filename = ApiCommon.CL_XML_FILE
log_it("INFO:", "Start saving xml cell_lines in", filename)
# find list of cell line elements
el_list = bigxml_root.xpath("/Cellosaurus/cell-line-list/cell-line/accession-list/accession[@type='primary']")
log_it("INFO:", "Found cell line primary accessions, count:" , len(el_list))
# build cell line dictionary key = cvcl, value=(file_offset,obj_size)
f_out = open(filename, "wb")
cl_dict = dict()
children_dict = dict()
no=0
for el in el_list:
no+=1
if no % 10000 == 0: log_it("INFO:", "Saving cell line", no)
        # write the record, get its position and size in the file
cl = el.getparent().getparent()
ac = el.text.strip()
id = cl.xpath("./name-list/name[@type='identifier']")[0].text.strip()
cl_bytes = etree.tostring(cl, encoding='utf-8')
pos = f_out.tell()
size = f_out.write(cl_bytes)
# add dictionary item with pos, size in file and ref list for this cell line
ref_list = cl.xpath("./reference-list/reference")
iref_list = list()
for ref in ref_list:
iref_list.append(ref.get("resource-internal-ref"))
rec = { "xml_pos": pos, "xml_size": size, "ref_list": iref_list }
cl_dict[ac] = rec
# update parent cell line dictionary entry by adding current cl as a child
parent_list = cl.xpath("./derived-from/xref")
for parent in parent_list:
parent_ac = parent.get("accession")
if parent_ac not in children_dict: children_dict[parent_ac] = list()
children_dict[parent_ac].append({"ac": ac, "id": id})
#if no>10: break
log_it("INFO:", "Saving cell line", no)
f_out.close()
# update cl dictionary with parent child info
for ac in cl_dict:
rec = cl_dict[ac]
if ac in children_dict:
rec["child_list"] = children_dict[ac]
else:
rec["child_list"] = list()
log_it("INFO:", "Saved xml cell lines, count:" , len(cl_dict), duration_since=t0)
return cl_dict
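# Sketch of a resulting cl_dict entry (hypothetical accession and numbers):
#   cl_dict["CVCL_xxxx"] == {"xml_pos": 0, "xml_size": 2048,
#                            "ref_list": ["PubMed=1234567"],
#                            "child_list": [{"ac": "CVCL_yyyy", "id": "Child-line"}]}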
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def save_xml_references(bigxml_root):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
filename = ApiCommon.RF_XML_FILE
t0 = datetime.now()
log_it("INFO:", "Start saving xml references in", filename)
ref_list = bigxml_root.xpath("/Cellosaurus/publication-list/publication")
log_it("INFO:", "Found publication elements, count:" , len(ref_list))
f_out = open(filename, "wb")
ref_dict = dict()
no=0
for ref in ref_list:
no+=1
id = ref.get("internal-id")
ref_bytes = etree.tostring(ref, encoding='utf-8')
pos = f_out.tell()
size = f_out.write(ref_bytes)
if no % 10000 == 0: log_it("INFO:", "Saving reference", no)
rec = { "xml_pos": pos, "xml_size": size }
ref_dict[id]=rec
#if no>10: break
log_it("INFO:", "Saving reference", no)
f_out.close()
    log_it("INFO:", "Saved xml references, count:" , len(ref_dict), duration_since=t0)
return ref_dict
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_xml_release_info(bigxml_root):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
t0 = datetime.now()
log_it("INFO:", "Start extracting release stats from xml")
node = bigxml_root.xpath("/Cellosaurus/header/release")[0] # there is one and only one such element
version = node.get("version")
updated= node.get("updated")
nb_cell_lines = node.get("nb-cell-lines")
nb_publications = node.get("nb-publications")
log_it("INFO:", version, updated, nb_cell_lines, nb_publications)
log_it("INFO:", "Extracting release stats done")
release_info = {"version": version, "updated": updated, "nb-cell-lines":nb_cell_lines, "nb-publications": nb_publications}
return release_info
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def save_pickle(obj, filename):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
t0 = datetime.now()
log_it("INFO:", "Start saving object in", filename)
f_out = open(filename, 'wb')
pickle.dump(obj, f_out, protocol=3) # pickle.HIGHEST_PROTOCOL is 3 for python 3.6, 4 for python 3.8
f_out.close()
log_it("INFO:", "Saved object in", filename, duration_since=t0)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def check_xml_references(cl_dict, ref_dict):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
count = 0
for cl_key in cl_dict:
cl = cl_dict[cl_key]
for ref_id in cl["ref_list"]:
count += 1
if ref_id not in ref_dict:
log_it("ERROR: in", cl_key, "Unknown reference", ref_id )
log_it("INFO:", "Reference check count", count)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def load_pickle(filename):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
log_it("INFO:", "Start loading object from", filename)
t0 = datetime.now()
f_in = open(filename, 'rb')
obj = pickle.load(f_in)
f_in.close()
log_it("INFO:", "Loaded object from", filename, duration_since=t0)
return obj
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def load_and_parse_xml(filename):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
log_it("INFO:", "reading", filename, "...")
parser = etree.XMLParser(remove_blank_text=True)
root_node = etree.parse(filename, parser).getroot()
log_it("INFO:", "xml parsed")
return root_node
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_txt_cell_line(ac, cl_dict, cl_f_in):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
cl_index = cl_dict[ac]
cl_f_in.seek(cl_index["txt_pos"])
bytes = cl_f_in.read(cl_index["txt_size"])
rec = bytes.decode("utf-8")
    rec = rec.strip()[0:-2] # ugly removal of the final "//" terminator
rec += get_txt_cell_children(ac, cl_dict)
rec += "//\n" # re-appending of //
return rec
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_txt_cell_children(ac, cl_dict):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
rec = ""
cl_index = cl_dict[ac]
for child in cl_index["child_list"]:
line = "CH " + child["ac"] + " ! " + child["id"] + "\n"
rec += line
return rec
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def filter_lines_by_prefixes(data, prefixes):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
if prefixes is None: return data
result = ""
line_list = data.split("\n")
for line in line_list:
for pf in prefixes:
if line.startswith(pf):
result += line + "\n"
break
return result
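# Example (hypothetical record): with prefixes=["AC"], the record
#   "AC   CVCL_xxxx\nID   Some-cell-line\n" is reduced to "AC   CVCL_xxxx\n"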
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_tsv_multi_cell(ac_list, fields, fldDef, cl_dict, cl_txt_f_in):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# if no fields are defined we want them all
if fields is None or fields=="":
fld_list = fldDef.keys()
else:
fld_list = fields if isinstance(fields,list) else fields.split(",")
fld_list = [fld.lower().strip() for fld in fld_list]
# build tsv header with field names
result = "\t".join(fld_list) + '\n'
# collect cell line records
for ac in ac_list:
cl_text = get_txt_cell_line(ac, cl_dict, cl_txt_f_in)
fld_dic = get_cell_line_fld_dic_from_text(cl_text, fld_list, fldDef)
first_fld = True
for fld in fld_list:
value = fld_dic.get(fld) or ""
if first_fld:
first_fld = False
result += value
else:
result += "\t" + value
result += "\n"
return result
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_txt_multi_cell(ac_list, prefixes, cl_dict, rf_dict, cl_txt_f_in, rf_txt_f_in):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
result = ""
rf_ids = set()
# collect cell line records
for ac in ac_list:
#log_it("ac", ac)
cl = get_txt_cell_line(ac, cl_dict, cl_txt_f_in)
cl = filter_lines_by_prefixes(cl, prefixes)
#log_it(">>> cl: ",cl)
#log_it("------")
if len(cl)>0 and prefixes is not None: cl += "//\n"
result += cl
cl_index = cl_dict[ac]
for ref_id in cl_index["ref_list"]: rf_ids.add(ref_id)
for rf_id in rf_ids:
rf = get_txt_reference(rf_id, rf_dict, rf_txt_f_in)
if rf is not None:
rf = filter_lines_by_prefixes(rf, prefixes)
if len(rf)>0 and prefixes is not None: rf += "//\n"
result += rf
return result
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_xml_multi_cell(ac_list, xpaths, cl_dict, rf_dict, cl_xml_f_in, rf_xml_f_in):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
trg_root = etree.Element("Cellosaurus")
trg_cl_root = etree.SubElement(trg_root, "cell-line-list")
rf_ids = set()
# get list of cell line nodes and build set of referred publications
for ac in ac_list:
cl = get_xml_cell_line(ac, cl_dict, cl_xml_f_in)
tmp_node = etree.Element("tmp")
append_filtered_xml(tmp_node, cl, xpaths)
cl_node = tmp_node.find("cell-line")
if cl_node is not None: trg_cl_root.append(cl_node)
cl_index = cl_dict[ac]
for ref_id in cl_index["ref_list"]: rf_ids.add(ref_id)
# get list of referred publications nodes
trg_rf_root = etree.Element("publication-list")
for rf_id in rf_ids:
rf = get_xml_reference(rf_id, rf_dict, rf_xml_f_in)
tmp_node = etree.Element("tmp")
append_filtered_xml(tmp_node, rf, xpaths)
rf_node = tmp_node.find("publication")
if rf_node is not None: trg_rf_root.append(rf_node)
if len(trg_rf_root)>0: trg_root.append(trg_rf_root)
return trg_root
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def append_filtered_xml(trg_xml_root, some_xml_root, path_list):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
if path_list is None:
trg_xml_root.append(some_xml_root)
return
for path in path_list:
nodes = some_xml_root.xpath(path)
if len(nodes)>0:
# the '' before first / is irrelevant
# e.g: /cell-line/xref-list/xref/property-list/property[@name="Discontinued"]/../..
path_items = path.split("/")[1:]
# ->: "cell-line" "xref-list" "xref" "property-list" "property[@name="Discontinued"]" ".." ".."
# find parent elements of nodes we retrieved from xpath expression:
# a) remove and count '..' path elements from the right side (if any)
# e.g: ".." ".." in "cell-line" "xref-list" "xref" "property-list" "property[@name="Discontinued"]" ".." ".."
back_num = 0
while path_items[-1]=='..':
back_num += 1
path_items.pop()
# b) remove elements of the xpath expression used as a selection criterion that are children of the nodes retrieved
# e.g: "property-list" "property[@name="Discontinued"]" in "cell-line" "xref-list" "xref" "property-list" "property[@name="Discontinued"]"
if back_num != 0: path_items = path_items[0: - back_num]
# c) finally remove the tag of the nodes we retrieved from the xpath expression
parent_tags = path_items[0:-1]
current_parent = trg_xml_root
# now create an xml element for each parent of the nodes we retrieved
for tag in parent_tags:
node = current_parent.find(tag)
if node is None: node = etree.SubElement(current_parent, tag)
current_parent = node
last_item = path_items[-1]
# add retrieved attribute or element nodes as children of closest parent element
if last_item.startswith("attribute"): # i.e. attribute::category
attr_name = last_item.split(":")[2]
attr_value = nodes[0] # we get an attribute value in nodes[0]
current_parent.attrib[attr_name]=attr_value
else:
for n in nodes: # we get real nodes in nodes
current_parent.append(deepcopy(n))
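# Usage sketch (hypothetical xpath): to keep only the accessions of a record,
#   append_filtered_xml(trg_node, cl_node, ["/cell-line/accession-list/accession"])
# rebuilds the <cell-line><accession-list> ancestry under trg_node and copies
# each matching <accession> element into it.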
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_txt_multi_cell_children(ac_list, cl_dict):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
rec = ""
for ac in ac_list:
rec += "AC " + ac + "\n"
rec += get_txt_cell_children(ac, cl_dict)
rec += "//\n"
return rec
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_txt_reference(rf_id, rf_dict, rf_f_in):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
if rf_id not in rf_dict:
log_it("ERROR: reference not found", rf_id)
return None
rf_index = rf_dict[rf_id]
rf_f_in.seek(rf_index["txt_pos"])
bytes = rf_f_in.read(rf_index["txt_size"])
ref = bytes.decode("utf-8")
return ref
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_tsv_cell_children(ac, cl_dict):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
cl_index = cl_dict[ac]
rec = ""
for child in cl_index["child_list"]:
rec += ac + "\t" + child["ac"] + "\t" + child["id"] + "\n" # tsv data
return rec
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_tsv_multi_cell_children(ac_list, cl_dict):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
result = "parent_AC\tchild_AC\tchild_ID\n" # tsv header
for ac in ac_list:
result += get_tsv_cell_children(ac, cl_dict)
return result
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_xml_cell_line(ac, cl_dict, cl_f_in):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
cl_index = cl_dict[ac]
cl_f_in.seek(cl_index["xml_pos"])
bytes = cl_f_in.read(cl_index["xml_size"])
tree = etree.parse(BytesIO(bytes))
# get cell line xml element
cl = tree.getroot()
children_node = get_xml_cell_children(ac, cl_dict)
    if children_node is not None: cl.append(children_node)
return cl
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_xml_cell_children(ac, cl_dict):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
cl_index = cl_dict[ac]
child_list = cl_index["child_list"]
#if len(child_list)==0: return None
children_node = etree.Element("child-list")
for child in child_list:
child_el = etree.SubElement(children_node, "child")
ac_el = etree.SubElement(child_el, "accession", type = "primary" )
ac_el.text = child["ac"]
id_el = etree.SubElement(child_el, "name", type = "identifier" )
id_el.text = child["id"]
return children_node
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_xml_multi_cell_children(ac_list, cl_dict):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
main_node = etree.Element("Cellosaurus")
cll_node = etree.SubElement(main_node, "cell-line-list")
for ac in ac_list:
cl_node = etree.SubElement(cll_node, "cell-line")
al_node = etree.SubElement(cl_node, "accession-list")
ac_node = etree.SubElement(al_node, "accession", type="primary")
ac_node.text = ac
ch_node = get_xml_cell_children(ac, cl_dict)
        if ch_node is not None: cl_node.append(ch_node)
return main_node
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_json_multi_cell_children(ac_list, cl_dict):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
node = get_xml_multi_cell_children(ac_list, cl_dict)
obj = get_json_object(node)
return obj
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_xml_reference(rf_id, rf_dict, rf_f_in):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
rf_index = rf_dict[rf_id]
rf_f_in.seek(rf_index["xml_pos"])
bytes = rf_f_in.read(rf_index["xml_size"])
tree = etree.parse(BytesIO(bytes))
return tree.getroot()
# builds a string from an xml element (and sub-elements) based on its text, attributes and tail
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_solr_field_value(node):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
value = recursive_solr_field_value(node,"")
value = value.replace('\n',' ').replace('\t',' ')
value = " ".join(value.split())
return value
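# Example (hypothetical element): <xref database="CLO" accession="CLO_0000001"/>
# yields the flattened, whitespace-normalized string:
#   "database=CLO; accession=CLO_0000001;"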
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def recursive_solr_field_value(node, value):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
result = value
for k in node.attrib:
result += k + "=" + node.get(k) + "; "
if node.text is not None:
result += node.text + " "
for sub_node in node:
result += recursive_solr_field_value(sub_node, value) + " "
if node.tail is not None:
result += node.tail + " "
return result
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_values_from_xpath(node, xp):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
values = list()
if xp.startswith("attribute::"):
attr_name = xp.split(":")[2]
value = node.get(attr_name)
if value is not None: values.append(value)
else:
for n in node.xpath(xp):
value = get_solr_field_value(n)
            if len(value) > 0: values.append(value)  # skip empty values
return values
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_cell_line_solr_xml_doc_from_node(node, fldDef):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
doc_node = etree.Element("doc")
cl_node = node.xpath("./cell-line-list/cell-line")[0]
pu_node_list = node.xpath("./publication-list/publication")
for k in fldDef.keys():
fname = k.replace('-','_').lower()
for xp in fldDef.get_xpaths(k):
if xp.startswith("/publication"):
rel_xp = xp.replace("/publication/","")
for pu_node in pu_node_list:
values = get_values_from_xpath(pu_node, rel_xp)
for value in values:
fld = etree.SubElement(doc_node, "field")
fld.set("name", fname)
fld.text = value
elif xp.startswith("/cell-line"):
rel_xp = xp.replace("/cell-line/","")
values = get_values_from_xpath(cl_node, rel_xp)
for value in values:
fld = etree.SubElement(doc_node, "field")
fld.set("name", fname)
fld.text = value
return doc_node
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def merge_consecutive_prefix(lines, prefix):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
merged_line = ""
new_lines = list()
for line in lines:
if line.startswith(prefix):
merged_line += line[5:].strip() + " "
else:
if len(merged_line) > 0:
merged_line = prefix + " " + merged_line.strip()
new_lines.append(merged_line)
merged_line = ""
new_lines.append(line)
    # flush a pending merged line (in practice records end with "//", so this rarely triggers)
    if len(merged_line) > 0:
        new_lines.append(prefix + " " + merged_line.strip())
    return new_lines
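# Example (hypothetical lines): the consecutive lines
#   "RT   A very long reference" and "RT   title split over two lines"
# are merged into the single line
#   "RT A very long reference title split over two lines"
# (note: the merged line keeps a single space after the prefix).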
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_cell_line_fld_dic_from_text(cl_text, fld_list, fldDef):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
result = dict()
lines = cl_text.split("\n")
lines = merge_consecutive_prefix(lines, "RA")
lines = merge_consecutive_prefix(lines, "RT")
for fld in fld_list:
for line in lines:
for pr in fldDef.get_prefixes(fld):
if line.startswith(pr):
if fld in ["dtc","dtu","dtv"]:
value = get_field_from_dt_line(line, fld)
else:
value = line[len(pr):]
if value.startswith(":"): value = value[1:]
value = value.strip()
if fld in result:
result[fld] += "|" + value
else:
result[fld] = value
return result
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_cell_line_solr_xml_doc_from_text(text, fldDef):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
doc_node = etree.Element("doc")
lines = text.split("\n")
lines = merge_consecutive_prefix(lines, "RA")
lines = merge_consecutive_prefix(lines, "RT")
for k in fldDef.keys():
fname = k.replace('-','_').lower()
for line in lines:
for pr in fldDef.get_prefixes(k):
if line.startswith(pr):
                    # special handling for the date and version fields derived from lines with prefix DT
if fname in ["dtc","dtu","dtv"]:
fld = etree.SubElement(doc_node, "field")
fld.set("name", fname)
fld.text = get_field_from_dt_line(line, fname)
# special case for sy, idsy: we want to split multiple values in line
elif pr == "SY": # fname in ["sy", "idsy"]:
sy_list = line[len(pr):].strip().split("; ")
for sy in sy_list:
fld = etree.SubElement(doc_node, "field")
fld.set("name", fname)
fld.text = sy
# normal handler
else:
fld = etree.SubElement(doc_node, "field")
fld.set("name", fname)
value = line[len(pr):]
if value.startswith(":"): value = value[1:]
value = value.strip()
fld.text = value
# we need the additional field ac_str which is the unique key for solr
if fname == "ac":
fld = etree.SubElement(doc_node, "field")
fld.set("name", "ac_str")
fld.text = value
return doc_node
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_field_from_dt_line(line, fname):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# example of DT line:
# DT Created: 23-09-21; Last updated: 16-12-21; Version: 2
items = line.split("; ")
if fname == "dtc" and len(items)==3:
(dd,mm,yy) = items[0].split(": ")[1].split("-")
return "20" + yy + "-" + mm + "-" + dd
elif fname == "dtu" and len(items)==3:
(dd,mm,yy) = items[1].split(": ")[1].split("-")
return "20" + yy + "-" + mm + "-" + dd
elif fname == "dtv" and len(items)==3:
version = items[2].split(": ")[1]
return version
return None
# - - - - - - - - - - - - - -
def get_clid_dic(fldDef):
# - - - - - - - - - - - - - -
# called by main at init time
# for subsequent efficient /fsearch
t0 = datetime.now()
log_it("INFO:", "Building clid_dict...")
url = get_solr_search_url()
params = get_all_solr_params(fldDef, query="*:*", fields="ac,id,ox", sort="id asc", start=0, rows=1000000)
headers = { "Accept": "application/json" }
response = requests.get(url, params=params, headers=headers)
obj = response.json()
if response.status_code != 200:
error_msg = ""
solr_error = obj.get("error")
if solr_error is not None: error_msg = solr_error.get("msg")
log_it("ERROR:", "code:", response.status_code, error_msg )
log_it("ERROR:", "while building clid_dict, exiting !", duration_since=t0)
sys.exit(1)
clid_dict = dict()
items = obj["response"]["docs"]
for item in items:
fields = [item["ac"], item["id"]] # order for elisabeth ?
oxs = list()
for ox in item["ox"]:
_, label = ox.split(" ! ")
oxs.append(label)
fields.append(" / ".join(oxs))
line = "\t".join(fields)
clid_dict[item["id"]] = line
log_it("INFO:", "clid_dict size:", len(clid_dict), duration_since=t0)
return clid_dict
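# Each clid_dict value is a ready-made TSV line "<ac>\t<id>\t<species1 / species2>",
# keyed by cell line identifier, e.g. (hypothetical):
#   clid_dict["Some-cell-line"] == "CVCL_xxxx\tSome-cell-line\tHomo sapiens"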
# - - - - - - - - - - - - - - - - - - - -
def save_virtuoso_isql_setup_file(output_file):
# - - - - - - - - - - - - - - - - - - - -
lines = []
lines.append("""grant select on "DB.DBA.SPARQL_SINV_2" to "SPARQL";""")
lines.append("""grant execute on "DB.DBA.SPARQL_SINV_IMP" to "SPARQL";""")
lines.append("""GRANT SPARQL_SPONGE TO "SPARQL";""")
lines.append("""GRANT EXECUTE ON DB.DBA.L_O_LOOK TO "SPARQL";""")
for ns in ns_reg.namespaces:
lines.append(ns.getSQLforVirtuoso())
f_out = open(output_file, "w")
for line in lines: f_out.write(line + "\n")
f_out.close()
# ===========================================================================================
if __name__ == "__main__":
# ===========================================================================================
parser = OptionParser()
(options, args) = parser.parse_args()
if len(args) < 1: sys.exit("Invalid arg1, expected BUILD, SOLR, RDF, LOAD_RDF, ONTO, SPARQL_PAGES or TEST")
if args[0] not in [ "BUILD", "SOLR", "RDF", "LOAD_RDF", "ONTO", "SPARQL_PAGES", "TEST" ]:
sys.exit("Invalid arg1, expected BUILD, SOLR, RDF, LOAD_RDF, ONTO, SPARQL_PAGES or TEST")
input_dir = "data_in/"
    if input_dir[-1] != "/" : input_dir += "/"
# -------------------------------------------------------
if args[0]=="RDF":
# -------------------------------------------------------
        # be aware that here we use the api data and indexes created when args[0] == "BUILD"
known_orgs = KnownOrganizations()
known_orgs.loadInstitutions(input_dir + "institution_list")
known_orgs.loadOnlineResources(input_dir + "cellosaurus_xrefs.txt")
terminologies = Terminologies()
rb = RdfBuilder(known_orgs)
cl_dict = load_pickle(ApiCommon.CL_IDX_FILE)
rf_dict = load_pickle(ApiCommon.RF_IDX_FILE)
cl_txt_f_in = open(ApiCommon.CL_TXT_FILE,"rb")
rf_txt_f_in = open(ApiCommon.RF_TXT_FILE,"rb")
cl_xml_f_in = open(ApiCommon.CL_XML_FILE,"rb")
rf_xml_f_in = open(ApiCommon.RF_XML_FILE,"rb")
fldDef = FldDef(ApiCommon.FLDDEF_FILE)
out_dir = "rdf_data/"
# create or clean output dir
if not os.path.exists(out_dir): os.mkdir(out_dir)
os.system("rm " + out_dir + "*")
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# create RDF for cell-line data
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
file_out = open(out_dir + "data_cell_lines.ttl", "wb")
file_out.write(bytes(rb.get_ttl_prefixes() + "\n", "utf-8"))
item_cnt = 0
log_it("INFO:", f"serializing cl: {item_cnt} / {len(cl_dict)}")
for ac in cl_dict:
item_cnt += 1
#if item_cnt > 20000: break
if item_cnt % 10000 == 0: log_it("INFO:", f"serializing cl: {item_cnt} / {len(cl_dict)}")
cl_xml = get_xml_cell_line(ac, cl_dict, cl_xml_f_in)
cl_obj = get_json_object(cl_xml)
file_out.write( bytes(rb.get_ttl_for_cl(ac, cl_obj) , "utf-8" ) )
file_out.close()
log_it("INFO:", f"serialized cl: {item_cnt} / {len(cl_dict)}")
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# create RDF for publications
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ref_out = open(out_dir + "data_refs.ttl", "wb")
ref_out.write(bytes(rb.get_ttl_prefixes() + "\n", "utf-8"))
item_cnt = 0
log_it("INFO:", f"serializing refs: {item_cnt} / {len(rf_dict)}")
for rf_id in rf_dict:
item_cnt += 1
if item_cnt % 10000 == 0: log_it("INFO:", f"serializing refs: {item_cnt} / {len(rf_dict)}")
rf_xml = get_xml_reference(rf_id, rf_dict, rf_xml_f_in)
rf_obj = get_json_object(rf_xml)
ref_out.write( bytes(rb.get_ttl_for_ref(rf_obj), "utf-8") )
ref_out.close()
log_it("INFO:", f"serialized refs: {item_cnt} / {len(rf_dict)}")
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# extract cited terms from xrefs, infer parent terms from cited terms
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
cited_terms = dict()
xr_dict = rb.get_xref_dict()
item_cnt = 0
log_it("INFO:", f"extracting terms from xrefs: {item_cnt} / {len(xr_dict)}")
for k in xr_dict:
item_cnt += 1
if item_cnt % 10000 == 0: log_it("INFO:", f"looking up xrefs: {item_cnt} / {len(xr_dict)}")
db, ac = k.split("=")
termi = terminologies.get(db)
if termi is not None:
if db not in cited_terms: cited_terms[db] = set()
cited_terms[db].add(ac)
log_it("INFO:", f"looked up xrefs: {item_cnt} / {len(xr_dict)}")
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# get relevant set of terms to be RDFized and serialize them
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
for k in terminologies.termi_dict:
termi: Terminology = terminologies.get(k)
log_it("INFO:", f"Serializing terminology {k} ...")
file_out = open(out_dir + "data_" + termi.abbrev + ".ttl", "wb")
file_out.write(bytes(rb.get_ttl_prefixes() + "\n", "utf-8"))
parser = getattr(__import__("__main__"), termi.parser_name)(k)
            # we get the version from the parser; it is used in the RDF that defines terminologies as NamedIndividuals
termi.version = parser.get_termi_version()
cited_set = cited_terms.get(k) or set()
log_it("INFO:", "Cited_set", k, len(cited_set))
relevant_id_set = set()