Commit

Add DANDI upload to YAML spec (#1089)
Co-authored-by: Heberto Mayorquin <h.mayorquin@gmail.com>
CodyCBakerPhD and h-mayorquin authored Dec 9, 2024
1 parent 4ba1e82 commit 4b3172c
Showing 9 changed files with 182 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/deploy-tests.yml
@@ -69,6 +69,9 @@ jobs:
    if: ${{ needs.assess-file-changes.outputs.SOURCE_CHANGED == 'true' }}
    uses: ./.github/workflows/live-service-testing.yml
    secrets:
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      S3_GIN_BUCKET: ${{ secrets.S3_GIN_BUCKET }}
      DANDI_API_KEY: ${{ secrets.DANDI_API_KEY }}
    with: # Ternary operator: condition && value_if_true || value_if_false
      python-versions: ${{ github.event.pull_request.draft == true && '["3.9"]' || needs.load_python_and_os_versions.outputs.ALL_PYTHON_VERSIONS }}
16 changes: 16 additions & 0 deletions .github/workflows/live-service-testing.yml
@@ -13,6 +13,12 @@ on:
        type: string

    secrets:
      AWS_ACCESS_KEY_ID:
        required: true
      AWS_SECRET_ACCESS_KEY:
        required: true
      S3_GIN_BUCKET:
        required: true
      DANDI_API_KEY:
        required: true

@@ -45,7 +51,17 @@ jobs:
      - name: Install full requirements
        run: pip install .[test,full]

      - name: Prepare data for tests
        uses: ./.github/actions/load-data
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          s3-gin-bucket: ${{ secrets.S3_GIN_BUCKET }}
          os: ${{ matrix.os }}

      - name: Run subset of tests that use DANDI live services
        run: pytest -rsx -n auto tests/test_minimal/test_tools/dandi_transfer_tools.py
      - name: Run subset of tests that use DANDI live services with YAML
        run: pytest -rsx -n auto tests/test_on_data/test_yaml/yaml_dandi_transfer_tools.py
      - name: Run subset of tests that use Globus live services
        run: pytest -rsx -n auto tests/test_minimal/test_tools/globus_transfer_tools.py
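
To reproduce the new DANDI-with-YAML step outside CI, the same test selection can be run locally. A minimal sketch, assuming the GIN test data is available, `DANDI_API_KEY` is exported, and pytest-xdist is installed for `-n auto`, as in the workflow above:

```python
# Run the same test file the new workflow step targets; arguments copied from the step above.
import pytest

exit_code = pytest.main(["-rsx", "-n", "auto", "tests/test_on_data/test_yaml/yaml_dandi_transfer_tools.py"])
raise SystemExit(exit_code)
```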
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -14,6 +14,7 @@
* Added .csv support to DeepLabCutInterface [PR #1140](https://github.com/catalystneuro/neuroconv/pull/1140)
* Added the `rclone_transfer_batch_job` helper function for executing Rclone data transfers in AWS Batch jobs. [PR #1085](https://github.com/catalystneuro/neuroconv/pull/1085)
* Added the `deploy_neuroconv_batch_job` helper function for deploying NeuroConv AWS Batch jobs. [PR #1086](https://github.com/catalystneuro/neuroconv/pull/1086)
* YAML specification files now accept an outer keyword `upload_to_dandiset="< six-digit ID >"` to automatically upload the produced NWB files to the DANDI archive [PR #1089](https://github.com/catalystneuro/neuroconv/pull/1089)


## Improvements
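
As a usage sketch of the new `upload_to_dandiset` keyword described in the changelog entry above (the spec file name, data paths, and API key below are placeholders, not part of this commit): set `DANDI_API_KEY` in the environment, add the keyword at the top level of the specification, and run the conversion as before.

```python
import os

from neuroconv import run_conversion_from_yaml

# Placeholder credential; the conversion now raises a ValueError if the spec requests an upload without it.
os.environ.setdefault("DANDI_API_KEY", "<your-dandi-api-key>")

run_conversion_from_yaml(
    specification_file_path="my_conversion_specification.yml",  # placeholder spec with upload_to_dandiset: "200560"
    data_folder_path="data",
    output_folder_path="nwb_output",
    overwrite=True,
)
```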
@@ -8,6 +8,7 @@
"required": ["experiments"],
"additionalProperties": false,
"properties": {
"upload_to_dandiset": {"type": "string"},
"metadata": {"$ref": "./metadata_schema.json#"},
"conversion_options": {"type": "object"},
"data_interfaces": {
@@ -1,3 +1,5 @@
import json
import os
from importlib import import_module
from pathlib import Path
from typing import Optional
@@ -7,6 +9,7 @@
from pydantic import DirectoryPath, FilePath
from referencing import Registry, Resource

from ..data_transfers import automatic_dandi_upload
from ...nwbconverter import NWBConverter
from ...utils import dict_deep_update, load_dict_from_file

@@ -50,7 +53,7 @@ def run_conversion_from_yaml(
    data_folder_path: Optional[DirectoryPath] = None,
    output_folder_path: Optional[DirectoryPath] = None,
    overwrite: bool = False,
):
) -> None:
    """
    Run conversion to NWB given a yaml specification file.
@@ -100,6 +103,14 @@ def run_conversion_from_yaml(
        registry=registry,
    )

    upload_to_dandiset = "upload_to_dandiset" in specification
    if upload_to_dandiset and "DANDI_API_KEY" not in os.environ:
        message = (
            "The 'upload_to_dandiset' prompt was found in the YAML specification, "
            "but the environment variable 'DANDI_API_KEY' was not set."
        )
        raise ValueError(message)

    global_metadata = specification.get("metadata", dict())
    global_conversion_options = specification.get("conversion_options", dict())
    data_interfaces_spec = specification.get("data_interfaces")
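
For orientation, a self-contained sketch of the fail-fast behavior added in the hunk above; the helper name is invented for this example, while in the commit the check lives inline in `run_conversion_from_yaml`:

```python
import os


def require_dandi_key(specification: dict) -> None:
    # Mirrors the new check: an upload request without credentials fails before any conversion starts.
    if "upload_to_dandiset" in specification and "DANDI_API_KEY" not in os.environ:
        raise ValueError("'upload_to_dandiset' was requested but 'DANDI_API_KEY' is not set.")


os.environ.pop("DANDI_API_KEY", None)  # simulate a missing credential
try:
    require_dandi_key({"upload_to_dandiset": "200560"})
except ValueError as error:
    print(error)
```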
@@ -115,28 +126,55 @@
        experiment_metadata = experiment.get("metadata", dict())
        for session in experiment["sessions"]:
            file_counter += 1

            source_data = session["source_data"]
            for interface_name, interface_source_data in session["source_data"].items():
                for key, value in interface_source_data.items():
                    if key == "file_paths":
                        source_data[interface_name].update({key: [str(Path(data_folder_path) / x) for x in value]})
                    elif key in ("file_path", "folder_path"):
                        source_data[interface_name].update({key: str(Path(data_folder_path) / value)})

            converter = CustomNWBConverter(source_data=source_data)

            metadata = converter.get_metadata()
            for metadata_source in [global_metadata, experiment_metadata, session.get("metadata", dict())]:
                metadata = dict_deep_update(metadata, metadata_source)
            nwbfile_name = session.get("nwbfile_name", f"temp_nwbfile_name_{file_counter}").strip(".nwb")

            session_id = session.get("metadata", dict()).get("NWBFile", dict()).get("session_id", None)
            if upload_to_dandiset and session_id is None:
                message = (
                    "The 'upload_to_dandiset' prompt was found in the YAML specification, "
                    "but the 'session_id' was not found for session with info block: "
                    f"\n\n {json.dumps(obj=session, indent=2)}\n\n"
                    "File intended for DANDI upload must include a session ID."
                )
                raise ValueError(message)

            session_conversion_options = session.get("conversion_options", dict())
            conversion_options = dict()
            for key in converter.data_interface_objects:
                conversion_options[key] = dict(session_conversion_options.get(key, dict()), **global_conversion_options)

            nwbfile_name = session.get("nwbfile_name", f"temp_nwbfile_name_{file_counter}").strip(".nwb")
            converter.run_conversion(
                nwbfile_path=output_folder_path / f"{nwbfile_name}.nwb",
                metadata=metadata,
                overwrite=overwrite,
                conversion_options=conversion_options,
            )

    if upload_to_dandiset:
        dandiset_id = specification["upload_to_dandiset"]
        staging = int(dandiset_id) >= 200_000
        automatic_dandi_upload(
            dandiset_id=dandiset_id,
            nwb_folder_path=output_folder_path,
            staging=staging,
        )

        return None  # We can early return since organization below will occur within the upload step

    # To properly mimic a true dandi organization, the full directory must be populated with NWBFiles.
    all_nwbfile_paths = [nwbfile_path for nwbfile_path in output_folder_path.iterdir() if nwbfile_path.suffix == ".nwb"]
    nwbfile_paths_to_set = [
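
Per the upload block above, the target server is chosen from the dandiset ID alone: `int(dandiset_id) >= 200_000` routes the upload to the staging archive, anything lower to production. A toy illustration (the first ID is made up; "200560" is the staging dandiset used by the tests in this commit):

```python
for dandiset_id in ("001234", "200560"):
    staging = int(dandiset_id) >= 200_000  # same comparison as in the hunk above
    print(f"{dandiset_id} -> {'staging' if staging else 'production'} archive")
```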
1 change: 1 addition & 0 deletions tests/imports.py
@@ -68,6 +68,7 @@ def test_tools(self):
"get_package_version",
"is_package_installed",
"deploy_process",
"data_transfers",
"LocalPathExpander",
"get_module",
]
@@ -0,0 +1,66 @@
metadata:
  NWBFile:
    lab: My Lab
    institution: My Institution

conversion_options:
  stub_test: True

data_interfaces:
  ap: SpikeGLXRecordingInterface
  lf: SpikeGLXRecordingInterface
  phy: PhySortingInterface

upload_to_dandiset: "200560"

experiments:
  ymaze:
    metadata:
      NWBFile:
        session_description: Subject navigating a Y-shaped maze.

    sessions:
      - nwbfile_name: example_converter_spec_1
        source_data:
          ap:
            file_path: spikeglx/Noise4Sam_g0/Noise4Sam_g0_imec0/Noise4Sam_g0_t0.imec0.ap.bin
        metadata:
          NWBFile:
            session_start_time: "2020-10-09T21:19:09+00:00"
            session_id: "test-yaml-1"
          Subject:
            subject_id: "yaml-1"
            sex: F
            age: P35D
            species: Mus musculus
      - nwbfile_name: example_converter_spec_2.nwb
        metadata:
          NWBFile:
            session_start_time: "2020-10-10T21:19:09+00:00"
            session_id: "test-yaml-2"
          Subject:
            subject_id: "yaml-002"
            sex: F
            age: P35D
            species: Mus musculus
        source_data:
          lf:
            file_path: spikeglx/Noise4Sam_g0/Noise4Sam_g0_imec0/Noise4Sam_g0_t0.imec0.lf.bin

  open_explore:
    sessions:
      - nwbfile_name: example_converter_spec_3
        source_data:
          lf:
            file_path: spikeglx/Noise4Sam_g0/Noise4Sam_g0_imec0/Noise4Sam_g0_t0.imec0.lf.bin
          phy:
            folder_path: phy/phy_example_0/
        metadata:
          NWBFile:
            session_start_time: "2020-10-11T21:19:09+00:00"
            session_id: test YAML 3
          Subject:
            subject_id: YAML Subject Name
            sex: F
            age: P35D
            species: Mus musculus
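
A quick way to sanity-check a specification like the fixture above before running a conversion; a sketch using PyYAML, assuming the file sits in the current working directory:

```python
import yaml

with open("GIN_conversion_specification_dandi_upload.yml") as stream:
    specification = yaml.safe_load(stream)

# The outer keyword that triggers the upload, plus the interfaces the sessions draw from.
assert specification["upload_to_dandiset"] == "200560"
assert set(specification["data_interfaces"]) == {"ap", "lf", "phy"}
assert all("sessions" in experiment for experiment in specification["experiments"].values())
```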
@@ -19,6 +19,7 @@
"fname",
[
"GIN_conversion_specification.yml",
"GIN_conversion_specification_dandi_upload.yml",
"GIN_conversion_specification_missing_nwbfile_names.yml",
"GIN_conversion_specification_no_nwbfile_name_or_other_metadata.yml",
"GIN_conversion_specification_videos.yml",
53 changes: 53 additions & 0 deletions tests/test_on_data/test_yaml/yaml_dandi_transfer_tools.py
@@ -0,0 +1,53 @@
import os
import platform
import time
from datetime import datetime, timedelta
from pathlib import Path

import dandi.dandiapi
import pytest
from packaging.version import Version

from neuroconv import run_conversion_from_yaml

from ..setup_paths import ECEPHY_DATA_PATH, OUTPUT_PATH

DANDI_API_KEY = os.getenv("DANDI_API_KEY")
HAVE_DANDI_KEY = DANDI_API_KEY is not None and DANDI_API_KEY != "" # can be "" from external forks
_PYTHON_VERSION = platform.python_version()


@pytest.mark.skipif(
    not HAVE_DANDI_KEY or Version(".".join(_PYTHON_VERSION.split(".")[:2])) != Version("3.12"),
    reason="You must set your DANDI_API_KEY to run this test!",
)
def test_run_conversion_from_yaml_with_dandi_upload():
    path_to_test_yml_files = Path(__file__).parent / "conversion_specifications"
    yaml_file_path = path_to_test_yml_files / "GIN_conversion_specification_dandi_upload.yml"
    run_conversion_from_yaml(
        specification_file_path=yaml_file_path,
        data_folder_path=ECEPHY_DATA_PATH,
        output_folder_path=OUTPUT_PATH,
        overwrite=True,
    )

    time.sleep(60)  # Give some buffer room for server to process before making assertions against DANDI API

    client = dandi.dandiapi.DandiAPIClient(api_url="https://api-staging.dandiarchive.org/api")
    dandiset = client.get_dandiset("200560")

    expected_asset_paths = [
        "sub-yaml-1/sub-yaml-1_ses-test-yaml-1_ecephys.nwb",
        "sub-yaml-002/sub-yaml-002_ses-test-yaml-2_ecephys.nwb",
        "sub-YAML-Subject-Name/sub-YAML-Subject-Name_ses-test-YAML-3_ecephys.nwb",
    ]
    for asset_path in expected_asset_paths:
        test_asset = dandiset.get_asset_by_path(path=asset_path)  # Will error if not found
        test_asset_metadata = test_asset.get_raw_metadata()

        # Past uploads may have created the same apparent file, so look at the modification time to ensure
        # this test is actually testing the most recent upload
        date_modified = datetime.fromisoformat(
            test_asset_metadata["dateModified"].split("Z")[0]  # Timezones look a little messy
        )
        assert datetime.now() - date_modified < timedelta(minutes=10)
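
The expected asset paths asserted above follow the pattern implied by the fixture's metadata: `sub-<subject_id>/sub-<subject_id>_ses-<session_id>_ecephys.nwb`, with spaces turned into dashes. A small helper written here only to make that mapping explicit (not part of the library):

```python
def expected_dandi_path(subject_id: str, session_id: str, suffix: str = "ecephys") -> str:
    subject = subject_id.replace(" ", "-")
    session = session_id.replace(" ", "-")
    return f"sub-{subject}/sub-{subject}_ses-{session}_{suffix}.nwb"


assert expected_dandi_path("yaml-1", "test-yaml-1") == "sub-yaml-1/sub-yaml-1_ses-test-yaml-1_ecephys.nwb"
assert expected_dandi_path("YAML Subject Name", "test YAML 3") == (
    "sub-YAML-Subject-Name/sub-YAML-Subject-Name_ses-test-YAML-3_ecephys.nwb"
)
```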
