Release beta 29

DataBiosphere · May 21, 2021 · a291ea7 · a291ea7
2 parents d460591 + e4811a5
commit a291ea7
Show file tree

Hide file tree

Showing 9 changed files with 506 additions and 115 deletions.
diff --git a/Makefile b/Makefile
@@ -12,11 +12,11 @@ install_flake8:
 	pip install -U flake8==3.7.8
 
 install:
-	pip install -e .[dss,test,coverage,examples]
+	pip install -e .[dss,staging_area,test,coverage,examples]
 
 travis_install:
 	pip install -U setuptools>=40.1.0
-	pip install -e .[dss,test,coverage]
+	pip install -e .[dss,staging_area,test,coverage]
 
 test: install
 	coverage run -m unittest discover -vs test

diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
-[![Build Status](https://travis-ci.org/HumanCellAtlas/metadata-api.svg?branch=develop)](https://travis-ci.org/HumanCellAtlas/metadata-api)
-[![Coverage Status](https://coveralls.io/repos/github/HumanCellAtlas/metadata-api/badge.svg?branch=develop)](https://coveralls.io/github/HumanCellAtlas/metadata-api?branch=develop)
+[![Build Status](https://travis-ci.com/DataBiosphere/hca-metadata-api.svg?branch=develop)](https://travis-ci.com/DataBiosphere/hca-metadata-api)
+[![Coverage Status](https://coveralls.io/repos/github/DataBiosphere/hca-metadata-api/badge.svg?branch=develop)](https://coveralls.io/github/DataBiosphere/hca-metadata-api?branch=develop)
 
 ## The HumanCellAtlas metadata API
 
@@ -16,9 +16,25 @@ Version 1.0 will be on PyPI but until then we need to install from GitHub:
 ```
 virtualenv -p python3 foo
 source foo/bin/activate
-pip install "git+git://github.com/HumanCellAtlas/metadata-api@master#egg=hca-metadata-api[dss]"
+pip install "git+git://github.com/DataBiosphere/hca-metadata-api@master#egg=hca-metadata-api[dss]"
 ```
 
 You can omit `[dss]` at the end of the `pip` invocation if you don't need
 the download helper this library provides and don't want to pull in the HCA CLI
 distribution the helper depends on.
+
+## GitHub credentials
+
+GitHub credentials in the form of a personal access token are required to run
+test cases that pull files from the canned staging area in the
+[schema-test-data](https://github.com/HumanCellAtlas/schema-test-data)
+repository.
+
+Use the
+[Creating a personal access token](https://docs.github.com/en/github/authenticating-to-github/creating-a-personal-access-token)
+guide to create your token. No additional scopes or permissions should be
+granted to this token as it will only be used to read from the canned staging
+area repository.
+
+Copy the token and use it as the value of an environment variable named
+`GITHUB_TOKEN`.
diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name="hca-metadata-api",
-    version="1.0b28",
+    version="1.0b29",
     license='MIT',
     install_requires=[
         "dataclasses >= 0.6;python_version<'3.7'"
@@ -14,6 +14,12 @@
             'urllib3 >= 1.23',
             'requests >= 2.19.1'
         ],
+        "staging_area": [
+            'attrs == 20.3.0',
+            'furl == 2.1.2',
+            'jsonschema == 3.2.0',
+            'PyGithub == 1.54.1'
+        ],
         "examples": [
             'jupyter >= 1.0.0'
         ],
@@ -30,6 +36,6 @@
     package_dir={'': 'src'},
     packages=find_namespace_packages('src'),
     project_urls={
-        "Source Code": "https://github.com/HumanCellAtlas/metadata-api",
+        "Source Code": "https://github.com/DataBiosphere/hca-metadata-api",
     }
 )
diff --git a/src/humancellatlas/data/metadata/api.py b/src/humancellatlas/data/metadata/api.py
@@ -83,6 +83,8 @@ class Entity:
     document_id: UUID4
     submitter_id: Optional[str]
     metadata_manifest_entry: Optional[ManifestEntry]
+    submission_date: str
+    update_date: Optional[str]
 
     @property
     def is_stitched(self):
@@ -120,6 +122,8 @@ def __init__(self,
         if False and self.metadata_manifest_entry is not None:
             assert self.document_id == self.metadata_manifest_entry.uuid
         self.submitter_id = provenance.get('submitter_id')
+        self.submission_date = lookup(provenance, 'submission_date', 'submissionDate')
+        self.update_date = lookup(provenance, 'update_date', 'updateDate', default=None)
 
     @property
     def address(self):
@@ -740,6 +744,7 @@ class File(LinkedEntity):
     to_processes: MutableMapping[UUID4, Process]
     manifest_entry: ManifestEntry
     content_description: Set[str]
+    file_source: str
 
     def __init__(self,
                  json: JSON,
@@ -755,6 +760,7 @@ def __init__(self,
         self.format = lookup(core, 'format', 'file_format')
         self.manifest_entry = manifest[core['file_name']]
         self.content_description = {ontology_label(cd) for cd in core.get('content_description', [])}
+        self.file_source = core.get('file_source')
         self.from_processes = {}
         self.to_processes = {}
 
@@ -796,7 +802,15 @@ class SupplementaryFile(File):
 
 @dataclass(init=False)
 class AnalysisFile(File):
-    pass
+    """
+    A file entity that additionally exposes the `matrix_cell_count` property
+    of its metadata document.
+    """
+    # Value of `matrix_cell_count` in the document, None if the key is absent
+    matrix_cell_count: Optional[int]
+
+    def __init__(self,
+                 json: JSON,
+                 metadata_manifest_entry,
+                 manifest: Mapping[str, ManifestEntry]):
+        super().__init__(json, metadata_manifest_entry, manifest)
+        # Read from the nested 'content' object if the document has one,
+        # otherwise from the document itself
+        content = json.get('content', json)
+        # .get() yields None when the document has no 'matrix_cell_count'
+        self.matrix_cell_count = content.get('matrix_cell_count')
 
 
 @dataclass(init=False)
@@ -843,7 +857,7 @@ def from_json(cls, json: JSON, schema_version: Tuple[int]) -> Iterable['Link']:
                           source_type='process',
                           destination_id=UUID4(protocol['protocol_id']),
                           destination_type=lookup(protocol, 'type', 'protocol_type'))
-        elif schema_version[0] == 2:
+        elif schema_version[0] in (2, 3):
             # DCP/2 (current)
             link_type = json['link_type']
             if link_type == 'process_link':

diff --git a/src/humancellatlas/data/metadata/helpers/exception.py b/src/humancellatlas/data/metadata/helpers/exception.py
@@ -0,0 +1,36 @@
+# Copied from https://github.com/DataBiosphere/azul/blob/develop/src/azul/__init__.py
+
+
+class RequirementError(RuntimeError):
+    """
+    Signals that a requirement was not satisfied. In contrast to a failed
+    assertion, an unsatisfied requirement is not a bug in the program.
+    """
+
+
+def require(condition: bool, *args, exception: type = RequirementError):
+    """
+    Insist that the given condition holds. If it is False, an instance of the
+    given exception class (RequirementError by default) is raised.
+
+    :param condition: the boolean condition that must hold
+
+    :param args: optional positional arguments for the exception constructor,
+                 typically a single string describing the requirement
+
+    :param exception: the exception class to instantiate and raise when the
+                      condition does not hold
+    """
+    # A requirement is violated exactly when its negation would be rejected
+    violated = not condition
+    reject(violated, *args, exception=exception)
+
+
+def reject(condition: bool, *args, exception: type = RequirementError):
+    """
+    Refuse the given condition. If it is True, an instance of the given
+    exception class (RequirementError by default) is raised.
+
+    :param condition: the boolean condition that must not occur
+
+    :param args: optional positional arguments for the exception constructor,
+                 typically a single string describing the rejected condition
+
+    :param exception: the exception class to instantiate and raise when the
+                      condition occurs
+    """
+    if not condition:
+        return
+    raise exception(*args)
diff --git a/src/humancellatlas/data/metadata/helpers/schema_validation.py b/src/humancellatlas/data/metadata/helpers/schema_validation.py
@@ -0,0 +1,42 @@
+from functools import (
+    lru_cache,
+)
+import json
+import logging
+
+from jsonschema import (
+    FormatChecker,
+    ValidationError,
+    validate,
+)
+import requests
+
+from humancellatlas.data.metadata.api import (
+    JSON,
+)
+from humancellatlas.data.metadata.helpers.exception import (
+    RequirementError,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class SchemaValidator:
+    """
+    Validates JSON documents against the JSON schema each document references
+    in its `describedBy` property.
+    """
+
+    def validate_json(self, file_json: JSON, file_name: str):
+        """
+        Validate the given document against the schema it is described by.
+
+        :param file_json: the document to validate. Its `describedBy` entry
+                          is used as the URL of the schema to download.
+
+        :param file_name: a name identifying the document, included in the
+                          arguments of any RequirementError raised
+
+        :raises RequirementError: if the downloaded schema cannot be parsed
+                                  as JSON, or the document fails validation
+
+        :raises KeyError: if the document has no `describedBy` entry
+        """
+        schema_url = file_json['describedBy']
+        try:
+            schema = self._download_schema(schema_url)
+        except json.decoder.JSONDecodeError as e:
+            raise RequirementError('Failed to parse schema JSON',
+                                   file_name, schema_url) from e
+        try:
+            validate(file_json, schema, format_checker=FormatChecker())
+        except ValidationError as e:
+            raise RequirementError(*e.args, file_name) from e
+
+    # The cache is applied to a static method so that entries are keyed on
+    # the URL alone. Caching a bound method would key on `self` as well,
+    # keeping every validator instance alive and preventing instances from
+    # sharing downloaded schemas.
+    @staticmethod
+    @lru_cache(maxsize=None)
+    def _download_schema(schema_url: str) -> JSON:
+        """
+        Download and parse the schema at the given URL. Results are cached
+        per URL without bound, and redirects are not followed.
+
+        :param schema_url: the URL to fetch the schema from
+
+        :return: the parsed JSON body of the response
+        """
+        # NOTE(review): no timeout is passed, so an unresponsive server may
+        # block this call indefinitely -- consider adding one
+        response = requests.get(schema_url, allow_redirects=False)
+        response.raise_for_status()
+        return response.json()