diff --git a/src/humancellatlas/data/metadata/api.py b/src/humancellatlas/data/metadata/api.py index 45e2417..88147ad 100644 --- a/src/humancellatlas/data/metadata/api.py +++ b/src/humancellatlas/data/metadata/api.py @@ -12,6 +12,7 @@ MutableMapping, Optional, Set, + Tuple, Type, TypeVar, Union, @@ -25,14 +26,13 @@ ) from humancellatlas.data.metadata.age_range import AgeRange - -# A few helpful type aliases -# from humancellatlas.data.metadata.lookup import ( - lookup, LookupDefault, + lookup, ) +# A few helpful type aliases +# UUID4 = UUID AnyJSON2 = Union[str, int, float, bool, None, Mapping[str, Any], List[Any]] AnyJSON1 = Union[str, int, float, bool, None, Mapping[str, AnyJSON2], List[AnyJSON2]] @@ -677,17 +677,18 @@ class Link: source_type: str destination_id: UUID4 destination_type: str + link_type: str = 'process_link' @classmethod - def from_json(cls, json: JSON) -> Iterable['Link']: + def from_json(cls, json: JSON, schema_version: Tuple[int]) -> Iterable['Link']: if 'source_id' in json: - # v5 + # DCP/1 v5 (obsolete) yield cls(source_id=UUID4(json['source_id']), source_type=json['source_type'], destination_id=UUID4(json['destination_id']), destination_type=json['destination_type']) - else: - # vx + elif schema_version[0] == 1: + # DCP/1 vx (current) process_id = UUID4(json['process']) for source_id in json['inputs']: yield cls(source_id=UUID4(source_id), @@ -704,6 +705,42 @@ def from_json(cls, json: JSON) -> Iterable['Link']: source_type='process', destination_id=UUID4(protocol['protocol_id']), destination_type=lookup(protocol, 'type', 'protocol_type')) + elif schema_version[0] == 2: + # DCP/2 (current) + link_type = json['link_type'] + if link_type == 'process_link': + process_id = UUID4(json['process_id']) + process_type = json['process_type'] + for input_ in json['inputs']: + yield cls(link_type=link_type, + source_id=UUID4(input_['input_id']), + source_type=input_['input_type'], + destination_id=process_id, + destination_type=process_type) + for output in json['outputs']: + yield cls(link_type=link_type, + source_id=process_id, + source_type=process_type, + destination_id=UUID4(output['output_id']), + destination_type=output['output_type']) + for protocol in json['protocols']: + yield cls(link_type=link_type, + source_id=process_id, + source_type=process_type, + destination_id=UUID4(protocol['protocol_id']), + destination_type=protocol['protocol_type']) + elif link_type == 'supplementary_file_link': + entity = json['entity'] + for supp_file in json['files']: + yield cls(link_type=link_type, + source_id=UUID4(entity['entity_id']), + source_type=entity['entity_type'], + destination_id=UUID4(supp_file['file_id']), + destination_type=supp_file['file_type']) + else: + assert False, f'Unknown link_type {link_type}' + else: + assert False, f'Unknown schema_version {schema_version}' @dataclass(init=False) @@ -772,16 +809,21 @@ def from_json_vx(core_cls: Type[E], **kwargs) -> MutableMapping[UUID4, E]: self.entities = {**self.projects, **self.biomaterials, **self.processes, **self.protocols, **self.files} - links = metadata_files['links.json']['links'] - self.links = list(chain.from_iterable(map(Link.from_json, links))) + links_json = metadata_files['links.json'] + schema_version = tuple(map(int, links_json['schema_version'].split('.'))) + self.links = list(chain.from_iterable( + Link.from_json(link, schema_version) + for link in links_json['links'] + )) for link in self.links: - source_entity = self.entities[link.source_id] - destination_entity = self.entities[link.destination_id] - assert isinstance(source_entity, LinkedEntity) - assert isinstance(destination_entity, LinkedEntity) - source_entity.connect_to(destination_entity, forward=True) - destination_entity.connect_to(source_entity, forward=False) + if link.link_type == 'process_link': + source_entity = self.entities[link.source_id] + destination_entity = self.entities[link.destination_id] + assert isinstance(source_entity, LinkedEntity) + assert isinstance(destination_entity, LinkedEntity) + source_entity.connect_to(destination_entity, forward=True) + destination_entity.connect_to(source_entity, forward=False) def root_entities(self) -> Mapping[UUID4, LinkedEntity]: roots = {} diff --git a/test/cans/prod/cc0b5aa4-9f66-48d2-aa4f-ed019d1c9439/2019-05-15T222432.561000Z/manifest.json b/test/cans/prod/cc0b5aa4-9f66-48d2-aa4f-ed019d1c9439/2019-05-15T222432.561000Z/manifest.json new file mode 100644 index 0000000..41ef55a --- /dev/null +++ b/test/cans/prod/cc0b5aa4-9f66-48d2-aa4f-ed019d1c9439/2019-05-15T222432.561000Z/manifest.json @@ -0,0 +1,194 @@ +[ + { + "content-type": "application/json; dcp-type=\"metadata/biomaterial\"", + "crc32c": "0300a92e", + "indexed": true, + "name": "cell_suspension_0.json", + "s3_etag": "4f3859e7778d1818bcf4120b76a1ffa6", + "sha1": "505038762f830ae810e7a63eea87ca72fc90196b", + "sha256": "4909898b1cbaea063ba589b146f7457c56928a6864546b4a465cde3d1b1d67f3", + "size": 850, + "uuid": "01ba6be9-ed4b-4c6b-ae05-2e06aadc2019", + "version": "2019-05-14T120006.941000Z" + }, + { + "content-type": "application/json; dcp-type=\"metadata/biomaterial\"", + "crc32c": "7158fd43", + "indexed": true, + "name": "specimen_from_organism_0.json", + "s3_etag": "676e573bec1eb8fe9d7b9888ece979e7", + "sha1": "04c71e27a86ea91ee260df9feab594517cf9c5cd", + "sha256": "37eb5a18c9be2be0c452aecbf2d7cf50ec9af68e0379f6647c7aedf5372b9833", + "size": 812, + "uuid": "74eb3cb5-918a-49fc-9e15-3ac49fd54caf", + "version": "2019-05-14T114647.512000Z" + }, + { + "content-type": "application/json; dcp-type=\"metadata/biomaterial\"", + "crc32c": "833cd7f3", + "indexed": true, + "name": "donor_organism_0.json", + "s3_etag": "15380716b8a9c7a78f33a6051a1a477d", + "sha1": "657c0e467fb61ee80819f31f86369c2227220307", + "sha256": "33e5ecaf9d039b5a41718eb2e38a0d66d15a701eaca7fdbebc9f2b5f46561ee1", + "size": 1749, + "uuid": "63818269-c4d9-429b-85a3-db39c0dd7fa0", + "version": "2019-05-14T112950.173000Z" + }, + { + "content-type": "application/json; dcp-type=\"metadata/file\"", + "crc32c": "02c57821", + "indexed": true, + "name": "sequence_file_0.json", + "s3_etag": "608e74f1bf465c2d5e4951fab48b349b", + "sha1": "b885e81fd0ce11e170176535536a45299e1eed3a", + "sha256": "e497c5c87514a04bf398d95c60884bfdf10bc084c75d413ff2d308cb22326b94", + "size": 459, + "uuid": "61fd5348-92c5-446b-a57a-746330cebf76", + "version": "2019-05-14T112117.762000Z" + }, + { + "content-type": "application/json; dcp-type=\"metadata/file\"", + "crc32c": "61502b5b", + "indexed": true, + "name": "supplementary_file_0.json", + "s3_etag": "b9789082b025294c44dce46ddd22b7aa", + "sha1": "daa76ae1d2af21871d8180006b066f56cdf7594f", + "sha256": "069391131baa61584a57a5fd9ec9633b6224722c783faf269db28d76d6133c79", + "size": 481, + "uuid": "e738a267-87fc-4070-abc7-b3be6442c6d0", + "version": "2019-05-14T110115.816000Z" + }, + { + "content-type": "application/json; dcp-type=\"metadata/file\"", + "crc32c": "030e888d", + "indexed": true, + "name": "supplementary_file_1.json", + "s3_etag": "0dc591fd5db04fbb770de26ae5419037", + "sha1": "48b75f01afaee3ca9f04510725c4c3b0c536f8b0", + "sha256": "2a3940369dacfea767cc02b29b637d94f46fdffde6bc1b08c238f8b2850471de", + "size": 477, + "uuid": "01a1d04b-05d0-4904-b627-68b0dc02bc17", + "version": "2019-05-14T110109.564000Z" + }, + { + "content-type": "application/json; dcp-type=\"metadata/project\"", + "crc32c": "ef3ef012", + "indexed": true, + "name": "project_0.json", + "s3_etag": "308189c7eab62a35a894125efbe76558", + "sha1": "9da3dba1ec159439071d6390a25ab9110be95f86", + "sha256": "2e2f330270644222106b2a8f648733cbd7609c283e29f1ee07363be32c223a9b", + "size": 8464, + "uuid": "8c3c290d-dfff-4553-8868-54ce45f4ba7f", + "version": "2019-05-14T112051.382000Z" + }, + { + "content-type": "application/json; dcp-type=\"metadata/process\"", + "crc32c": "57699965", + "indexed": true, + "name": "process_0.json", + "s3_etag": "fdc514845934d034918521bb96f1ee24", + "sha1": "ba3c05024bdd18dd054ef07cbd2c6594fe64e168", + "sha256": "2ff1fab2dac6ac35b866a437f7bd720066212998432ab8eb05f6039b748c5225", + "size": 379, + "uuid": "91f475ec-be51-4f1e-a904-74b10b7259f1", + "version": "2019-05-14T121053.274000Z" + }, + { + "content-type": "application/json; dcp-type=\"metadata/process\"", + "crc32c": "ad459007", + "indexed": true, + "name": "process_1.json", + "s3_etag": "b067d9b22cf3d999069bdc42a56fabbe", + "sha1": "23ed66f1f47ecb29cf5354d6fb675784804df817", + "sha256": "a3948a06685954b8f5d21113621e53cea384aea12aa07d9bb19796ccf4e7300d", + "size": 378, + "uuid": "e521d67d-c134-4dbd-9555-29e23f0463c5", + "version": "2019-05-14T122720.135000Z" + }, + { + "content-type": "application/json; dcp-type=\"metadata/process\"", + "crc32c": "4352f3bf", + "indexed": true, + "name": "process_2.json", + "s3_etag": "f94d4ee8d194714e8db01f7f1d9ad66e", + "sha1": "db413a2532ec0d9d235aa56fd69ba27fc96bc70d", + "sha256": "cf1d4be9c9a62590b38014d092b67a981ec516013af924d30870a18cd6a060ed", + "size": 377, + "uuid": "453a352c-94fb-4d3b-b609-df1e7abf8c09", + "version": "2019-05-14T122652.545000Z" + }, + { + "content-type": "application/json; dcp-type=\"metadata/links\"", + "crc32c": "2c25a3c2", + "indexed": true, + "name": "links.json", + "s3_etag": "1f7c2cbccc797da2d885bb52de52232d", + "sha1": "39ba144902ba5d1095af5b8d5dace8dd19a4b08d", + "sha256": "33052247612f39f6ea48568a12c761842af30030e1242943bb2c5eb238722488", + "size": 2083, + "uuid": "51168054-6dad-45aa-916a-ef71135651b2", + "version": "2019-05-16T015324.197421Z" + }, + { + "content-type": "application/gzip; dcp-type=data", + "crc32c": "86192092", + "indexed": false, + "name": "21784_6#51_1.fastq.gz", + "s3_etag": "8ef4064fd5c94502b0d42c0dbecc74ca", + "sha1": "0c0a36b8e8e8bf53db8e5eed5688546f0d23f863", + "sha256": "1975336c7071ac70caef3ff833089b0dc26962b4d6fc5159aece28eaa4324052", + "size": 7591723, + "uuid": "e4d9ebe5-2e62-47cf-bc35-1fb8ef7c1ef7", + "version": "2019-05-16T015324.490566Z" + }, + { + "content-type": "application/gzip; dcp-type=data", + "crc32c": "56a9e200", + "indexed": false, + "name": "21784_6#51_2.fastq.gz", + "s3_etag": "86b0f2604a0aa1dbe3bd72d316f077f3", + "sha1": "d138b7a22c6fd4834f03564921a6436ebed5076e", + "sha256": "6a95e7f0792fe70dfcf174ee585c6900f2ed5fb401c5c8d0d15bef5e95271726", + "size": 7553715, + "uuid": "60f17b79-74d9-49b5-a6da-48580a67f11f", + "version": "2019-05-16T015324.739985Z" + }, + { + "content-type": "application/pdf; dcp-type=data", + "crc32c": "847325b6", + "indexed": false, + "name": "TissueDissociationProtocol.pdf", + "s3_etag": "7e892bf8f6aa489ccb08a995c7f017e1", + "sha1": "f2237ad0a776fd7057eb3d3498114c85e2f521d7", + "sha256": "6929799f227ae5f0b3e0167a6cf2bd683db097848af6ccde6329185212598779", + "size": 32748, + "uuid": "6578c322-7060-4c82-8469-9e54100e6b44", + "version": "2019-05-16T015325.007527Z" + }, + { + "content-type": "application/pdf; dcp-type=data", + "crc32c": "b9364bfa", + "indexed": false, + "name": "SmartSeq2_RTPCR_protocol.pdf", + "s3_etag": "846fd9e6b98041df46a1ddb94e85b6b9", + "sha1": "89d9eb3f1b94f78a33d46c0288c2e81d4002049b", + "sha256": "2f6866c4ede92123f90dd15fb180fac56e33309b8fd3f4f52f263ed2f8af2f16", + "size": 29230, + "uuid": "cd8e02d1-d0f9-4094-9a31-329931df60dc", + "version": "2019-05-16T015325.251968Z" + }, + { + "content-type": "application/pdf; dcp-type=data", + "crc32c": "3658ec51", + "indexed": false, + "name": "SmartSeq2_sequencing_protocol.pdf", + "s3_etag": "2742e1e78f6d4663bf41d3080396695c", + "sha1": "9ec6ee2b6e2093681c1fed694b3a8c78a2aa3438", + "sha256": "9c93a354a8636c041a31ba6f3fb00ef20352e1b853d8080d63a654221cb35673", + "size": 61134, + "uuid": "bf92ef4a-c422-44fb-bfc1-c2f86528b86b", + "version": "2019-05-16T015325.498431Z" + } +] diff --git a/test/cans/prod/cc0b5aa4-9f66-48d2-aa4f-ed019d1c9439/2019-05-15T222432.561000Z/metadata.json b/test/cans/prod/cc0b5aa4-9f66-48d2-aa4f-ed019d1c9439/2019-05-15T222432.561000Z/metadata.json new file mode 100644 index 0000000..b7c2fea --- /dev/null +++ b/test/cans/prod/cc0b5aa4-9f66-48d2-aa4f-ed019d1c9439/2019-05-15T222432.561000Z/metadata.json @@ -0,0 +1,233 @@ +{ + "cell_suspension_0.json": { + "describedBy": "https://schema.humancellatlas.org/type/biomaterial/13.1.0/cell_suspension", + "schema_type": "biomaterial", + "biomaterial_core": { + "biomaterial_id": "21784_6#51", + "ncbi_taxon_id": [ + 10090 + ] + }, + "provenance": { + "document_id": "01ba6be9-ed4b-4c6b-ae05-2e06aadc2019", + "submission_date": "2019-05-14T11:01:32.467Z", + "update_date": "2019-05-14T12:00:06.941Z" + } + }, + "specimen_from_organism_0.json": { + "describedBy": "https://schema.humancellatlas.org/type/biomaterial/10.2.0/specimen_from_organism", + "schema_type": "biomaterial", + "biomaterial_core": { + "biomaterial_id": "1126_LN", + "ncbi_taxon_id": [ + 10090 + ] + }, + "organ": { + "text": "lymph node" + }, + "provenance": { + "document_id": "74eb3cb5-918a-49fc-9e15-3ac49fd54caf", + "submission_date": "2019-05-14T11:01:26.115Z", + "update_date": "2019-05-14T11:46:47.512Z" + } + }, + "donor_organism_0.json": { + "describedBy": "https://schema.humancellatlas.org/type/biomaterial/15.3.0/donor_organism", + "schema_type": "biomaterial", + "biomaterial_core": { + "biomaterial_id": "1126", + "ncbi_taxon_id": [ + 10090 + ] + }, + "genus_species": [ + { + "text": "Mus musculus" + } + ], + "is_living": "no", + "sex": "female", + "development_stage": { + "text": "adult" + }, + "provenance": { + "document_id": "63818269-c4d9-429b-85a3-db39c0dd7fa0", + "submission_date": "2019-05-14T11:01:25.684Z", + "update_date": "2019-05-14T11:29:50.173Z" + } + }, + "sequence_file_0.json": { + "describedBy": "https://schema.humancellatlas.org/type/file/9.0.0/sequence_file", + "schema_type": "file", + "file_core": { + "file_name": "21784_6#51_1.fastq.gz", + "format": "fastq.gz" + }, + "read_index": "read1", + "provenance": { + "document_id": "61fd5348-92c5-446b-a57a-746330cebf76", + "submission_date": "2019-05-14T10:52:59.245Z", + "update_date": "2019-05-14T11:21:17.762Z" + } + }, + "supplementary_file_0.json": { + "describedBy": "https://schema.humancellatlas.org/type/file/2.0.0/supplementary_file", + "schema_type": "file", + "file_core": { + "file_name": "TissueDissociationProtocol.pdf", + "format": "pdf" + }, + "provenance": { + "document_id": "e738a267-87fc-4070-abc7-b3be6442c6d0", + "submission_date": "2019-05-14T10:52:33.892Z", + "update_date": "2019-05-14T11:01:15.816Z" + } + }, + "supplementary_file_1.json": { + "describedBy": "https://schema.humancellatlas.org/type/file/2.0.0/supplementary_file", + "schema_type": "file", + "file_core": { + "file_name": "SmartSeq2_RTPCR_protocol.pdf", + "format": "pdf" + }, + "provenance": { + "document_id": "01a1d04b-05d0-4904-b627-68b0dc02bc17", + "submission_date": "2019-05-14T10:52:33.898Z", + "update_date": "2019-05-14T11:01:09.564Z" + } + }, + "project_0.json": { + "describedBy": "https://schema.humancellatlas.org/type/project/14.0.0/project", + "schema_type": "project", + "project_core": { + "project_short_name": "Mouse Melanoma", + "project_title": "Melanoma infiltration of stromal and immune cells", + "project_description": "The cancer microenvironment is a complex ecosystem characterized by dynamic interactions between diverse cell types, including malignant, immune and stromal cells. Here, we performed single-cell RNA sequencing on CD45+ and CD45- cells isolated from tumour and lymph nodes during a mouse model of melanoma. The transcriptional profiles of these individual cells taken at different time points coupled with assembled T cell receptor sequences, allowed us to identify distinct immune subpopulations and delineate their developmental trajectory. Our study provides insights into the complex interplay among cells within the tumour microenvironment and presents a valuable resource for future translational applications." + }, + "funders": [ + ], + "provenance": { + "document_id": "8c3c290d-dfff-4553-8868-54ce45f4ba7f", + "submission_date": "2019-05-14T10:52:33.885Z", + "update_date": "2019-05-14T11:20:51.382Z" + } + }, + "process_0.json": { + "process_core": { + "process_id": "proc_21784_6#51" + }, + "schema_type": "process", + "describedBy": "https://schema.humancellatlas.org/type/process/9.0.0/process", + "provenance": { + "document_id": "91f475ec-be51-4f1e-a904-74b10b7259f1", + "submission_date": "2019-05-14T11:06:54.971Z", + "update_date": "2019-05-14T12:10:53.274Z" + } + }, + "process_1.json": { + "process_core": { + "process_id": "process_id_366" + }, + "schema_type": "process", + "describedBy": "https://schema.humancellatlas.org/type/process/9.0.0/process", + "provenance": { + "document_id": "e521d67d-c134-4dbd-9555-29e23f0463c5", + "submission_date": "2019-05-14T11:12:40.680Z", + "update_date": "2019-05-14T12:27:20.135Z" + } + }, + "process_2.json": { + "process_core": { + "process_id": "process_id_24" + }, + "schema_type": "process", + "describedBy": "https://schema.humancellatlas.org/type/process/9.0.0/process", + "provenance": { + "document_id": "453a352c-94fb-4d3b-b609-df1e7abf8c09", + "submission_date": "2019-05-14T11:12:15.946Z", + "update_date": "2019-05-14T12:26:52.545Z" + } + }, + "links.json": { + "describedBy": "https://schema.humancellatlas.org/system/2.0.0/links", + "schema_type": "link_bundle", + "schema_version": "2.0.0", + "links": [ + { + "link_type": "process_link", + "process_id": "91f475ec-be51-4f1e-a904-74b10b7259f1", + "process_type": "process", + "inputs": [ + { + "input_type": "cell_suspension", + "input_id": "01ba6be9-ed4b-4c6b-ae05-2e06aadc2019" + } + ], + "outputs": [ + { + "output_type": "sequence_file", + "output_id": "61fd5348-92c5-446b-a57a-746330cebf76" + } + ], + "protocols": [ + ] + }, + { + "link_type": "process_link", + "process_id": "e521d67d-c134-4dbd-9555-29e23f0463c5", + "process_type": "process", + "inputs": [ + { + "input_type": "specimen_from_organism", + "input_id": "74eb3cb5-918a-49fc-9e15-3ac49fd54caf" + } + ], + "outputs": [ + { + "output_type": "cell_suspension", + "output_id": "01ba6be9-ed4b-4c6b-ae05-2e06aadc2019" + } + ], + "protocols": [ + ] + }, + { + "link_type": "process_link", + "process_id": "453a352c-94fb-4d3b-b609-df1e7abf8c09", + "process_type": "process", + "inputs": [ + { + "input_type": "donor_organism", + "input_id": "63818269-c4d9-429b-85a3-db39c0dd7fa0" + } + ], + "outputs": [ + { + "output_type": "specimen_from_organism", + "output_id": "74eb3cb5-918a-49fc-9e15-3ac49fd54caf" + } + ], + "protocols": [ + ] + }, + { + "link_type": "supplementary_file_link", + "entity": { + "entity_type": "project", + "entity_id": "8c3c290d-dfff-4553-8868-54ce45f4ba7f" + }, + "files": [ + { + "file_id": "e738a267-87fc-4070-abc7-b3be6442c6d0", + "file_type": "supplementary_file" + }, + { + "file_id": "01a1d04b-05d0-4904-b627-68b0dc02bc17", + "file_type": "supplementary_file" + } + ] + } + ] + } +} diff --git a/test/test.py b/test/test.py index cc1d9dd..6a5818e 100644 --- a/test/test.py +++ b/test/test.py @@ -24,6 +24,7 @@ Biomaterial, Bundle, DonorOrganism, + entity_types as api_entity_types, Project, SequenceFile, SpecimenFromOrganism, @@ -589,6 +590,25 @@ def test_cell_line(self): self.assertEqual(cell_lines[0].type, cell_lines[0].cell_line_type) self.assertEqual(cell_lines[0].model_organ, 'brain') + def test_links_json_v2_0_0(self): + """ + Test a bundle with a v2.0.0 links.json and supplementary_file links + """ + uuid = 'cc0b5aa4-9f66-48d2-aa4f-ed019d1c9439' + version = '2019-05-15T222432.561000Z' + manifest, metadata_files = self._load_bundle(uuid, version, replica='aws', deployment='prod') + bundle = Bundle(uuid, version, manifest, metadata_files) + for expected_count, link_type in [(6, 'process_link'), (2, 'supplementary_file_link')]: + actual_count = sum([1 for link in bundle.links if link.link_type == link_type]) + self.assertEqual(expected_count, actual_count) + for link in bundle.links: + self.assertIn(link.source_type, api_entity_types) + self.assertIn(link.source_id, bundle.entities) + self.assertIsInstance(bundle.entities[link.source_id], api_entity_types[link.source_type]) + self.assertIn(link.destination_type, api_entity_types) + self.assertIn(link.destination_id, bundle.entities) + self.assertIsInstance(bundle.entities[link.destination_id], api_entity_types[link.destination_type]) + def test_project_fields(self): uuid = '68bdc676-c442-4581-923e-319c1c2d9018' version = '2018-10-07T130111.835234Z'