Skip to content

Commit

Permalink
added option to export all, and cache of aggregate dataset metrics to…
Browse files Browse the repository at this point in the history
… speed up export
  • Loading branch information
sherwoodf committed Dec 12, 2024
1 parent 089ea7b commit 61c7a5d
Show file tree
Hide file tree
Showing 8 changed files with 4,728 additions and 82 deletions.
24 changes: 12 additions & 12 deletions bia-export/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,18 @@ Run:

This will create `bia-study-metadata.json` using the example test data for studies

Note that with -r (root directory) - local files will be used to generate the export. If using ingest to generate the files, this will usually be: ~/.cache/bia-integrator-data-sm. If no root location is passed, and a study UUID (as opposed to accession ID) is used, then the API will be called to create the files.

If no Accession ID or UUID is passed, all studies will be processed (either based on all studies in the <root-folder>/study/ directory, or by querying for studies in the api). The studies are exported in order of release date.

The two points above hold for all export commands for the api. For the website-study export only, there is an optional cache in order to avoid processing all file references every time an export is performed (as this slows down export a lot). E.g. running:

`poetry run bia-export website-study -o bia-study-metadata.json -c read_cache`

Will export all studies using the cached aggregation (when available) as the counts for images, files, and the list of different file types.



Image Export
Run:

Expand All @@ -48,15 +60,3 @@ Run:
`poetry run bia-export datasets-for-website-image S-BIADTEST -o bia-dataset-export.json -r test/input_data`
This will create `bia-dataset-export.json` using the example test data.


Commands to generate the current BIA data
-----------------------------------------

Study export:

```
poetry run bia-export website-study S-BIAD1218 S-BIAD1201 S-BIAD1034 S-BIAD1026 S-BIAD1287 S-BIAD1055 S-BIAD1185 S-BIAD957 S-BIAD748 S-BIAD850 S-BIAD1018 S-BIAD1077 S-BIAD1122 S-BIAD1163 S-BIAD618 S-BIAD620 S-BIAD650 S-BIAD814 S-BIAD825 S-BIAD954 S-BIAD963 S-BIAD1044 S-BIAD1183 S-BIAD627 S-BIAD826 S-BIAD843 S-BIAD845 S-BIAD849 S-BIAD890 S-BIAD1008 S-BIAD1024 S-BIAD1092 S-BIAD1093 S-BIAD1134 S-BIAD1135 S-BIAD1165 S-BIAD1175 S-BIAD1197 S-BIAD1215 S-BIAD1248 S-BIAD1260 S-BIAD663 S-BIAD831 S-BIAD846 S-BIAD851 S-BIAD887 S-BIAD922 S-BIAD928 S-BIAD1005 S-BIAD1009 S-BIAD1057 S-BIAD1079 S-BIAD1083 S-BIAD1088 S-BIAD1091 S-BIAD1099 S-BIAD1104 S-BIAD1168 S-BIAD1169 S-BIAD1193 S-BIAD1203 S-BIAD1268 S-BIAD1284 S-BIAD1323 S-BIAD607 S-BIAD608 S-BIAD610 S-BIAD616 S-BIAD626 S-BIAD661 S-BIAD694 S-BIAD705 S-BIAD800 S-BIAD813 S-BIAD815 S-BIAD823 S-BIAD824 S-BIAD829 S-BIAD830 S-BIAD847 S-BIAD848 S-BIAD852 S-BIAD861 S-BIAD865 S-BIAD866 S-BIAD884 S-BIAD885 S-BIAD904 S-BIAD931 S-BIAD955 S-BIAD970 S-BIAD986 S-BIAD1012 S-BIAD1019 S-BIAD1028 S-BIAD1039 S-BIAD1064 S-BIAD1078 S-BIAD1084 S-BIAD1090 S-BIAD1094 S-BIAD1095 S-BIAD1097 S-BIAD1102 S-BIAD1114 S-BIAD1116 S-BIAD1119 S-BIAD1130 S-BIAD1133 S-BIAD1151 S-BIAD1152 S-BIAD1157 S-BIAD1159 S-BIAD1162 S-BIAD1167 S-BIAD1196 S-BIAD1199 S-BIAD1200 S-BIAD1204 S-BIAD1236 S-BIAD1244 S-BIAD1245 S-BIAD1267 S-BIAD1271 S-BIAD1272 S-BIAD1274 S-BIAD1285 S-BIAD1293 S-BIAD1308 S-BIAD1327 S-BIAD612 S-BIAD633 S-BIAD646 S-BIAD678 S-BIAD680 S-BIAD684 S-BIAD700 S-BIAD720 S-BIAD725 S-BIAD728 S-BIAD770 S-BIAD855 S-BIAD860 S-BIAD862 S-BIAD864 S-BIAD882 S-BIAD886 S-BIAD915 S-BIAD916 S-BIAD965 S-BIAD967 S-BIAD985 S-BIAD987 S-BIAD992 S-BIAD993 S-BIAD1027 S-BIAD1030 S-BIAD1033 S-BIAD1045 S-BIAD1063 S-BIAD1080 S-BIAD1082 S-BIAD1086 S-BIAD1096 S-BIAD1100 S-BIAD1121 S-BIAD1155 S-BIAD1158 S-BIAD1184 S-BIAD1186 S-BIAD1194 S-BIAD1216 S-BIAD1232 S-BIAD1235 S-BIAD1237 S-BIAD1239 S-BIAD1247 S-BIAD1250 S-BIAD1259 S-BIAD1270 S-BIAD1282 S-BIAD1298 S-BIAD1300 S-BIAD1302 S-BIAD1314 
S-BIAD1316 S-BIAD1333 S-BIAD602 S-BIAD603 S-BIAD606 S-BIAD609 S-BIAD611 S-BIAD617 S-BIAD619 S-BIAD621 S-BIAD624 S-BIAD647 S-BIAD651 S-BIAD652 S-BIAD657 S-BIAD658 S-BIAD664 S-BIAD668 S-BIAD669 S-BIAD676 S-BIAD679 S-BIAD682 S-BIAD696 S-BIAD703 S-BIAD704 S-BIAD767 S-BIAD769 S-BIAD777 S-BIAD807 S-BIAD808 S-BIAD817 S-BIAD821 S-BIAD822 S-BIAD827 S-BIAD832 S-BIAD840 S-BIAD841 S-BIAD842 S-BIAD844 S-BIAD863 S-BIAD900 S-BIAD901 S-BIAD903 S-BIAD910 S-BIAD913 S-BIAD914 S-BIAD926 S-BIAD930 S-BIAD961 S-BIAD969 S-BIAD976 S-BIAD984 S-BIAD988 S-BIAD999 -o bia-study-metadata.json -r ~/.cache/bia-integrator-data-sm/
```

And similarly for the images and datasets.

47 changes: 34 additions & 13 deletions bia-export/bia_export/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
from rich.logging import RichHandler
from typing_extensions import Annotated
from pathlib import Path
from bia_export.website_export.export_all import get_study_ids
from .website_export.studies.transform import transform_study
from .website_export.studies.models import StudyCLIContext
from .website_export.studies.models import StudyCLIContext, CacheUse
from .website_export.images.transform import transform_images
from .website_export.images.models import ImageCLIContext
from .website_export.datasets_for_images.transform import transform_datasets
Expand All @@ -23,7 +24,16 @@

@app.command()
def website_study(
id_list: Annotated[List[str], typer.Argument(help="IDs of the studies to export")],
id_list: Annotated[
Optional[List[str]], typer.Argument(help="IDs of the studies to export")
] = None,
output_filename: Annotated[
Path,
typer.Option(
"--out_file",
"-o",
),
] = Path("bia-images-export.json"),
root_directory: Annotated[
Optional[Path],
typer.Option(
Expand All @@ -32,24 +42,29 @@ def website_study(
help="If root directory specified then use files there, rather than calling API",
),
] = None,
output_filename: Annotated[
Path,
cache: Annotated[
Optional[CacheUse],
typer.Option(
"--out_file",
"-o",
"--cache",
"-c",
),
] = Path("bia-images-export.json"),
] = None,
):

if root_directory:
abs_root = root_directory.resolve()

if not id_list:
id_list = get_study_ids(root_directory)

studies_map = {}
for id in id_list:
if root_directory:
context = StudyCLIContext(root_directory=abs_root, accession_id=id)
context = StudyCLIContext(
root_directory=abs_root, accession_id=id, cache_use=cache
)
else:
context = StudyCLIContext(study_uuid=id)
context = StudyCLIContext(study_uuid=id, cache_use=cache)
study = transform_study(context)
studies_map[study.accession_id] = study.model_dump(mode="json")

Expand All @@ -61,8 +76,8 @@ def website_study(
@app.command()
def website_image(
id_list: Annotated[
List[str], typer.Argument(help="Accession ID of the study to export")
],
Optional[List[str]], typer.Argument(help="Accession ID of the study to export")
] = None,
root_directory: Annotated[
Optional[Path],
typer.Option(
Expand All @@ -84,6 +99,9 @@ def website_image(
if root_directory:
abs_root = root_directory.resolve()

if not id_list:
id_list = get_study_ids(root_directory)

image_map = {}
for id in id_list:
if root_directory:
Expand All @@ -101,8 +119,8 @@ def website_image(
@app.command()
def datasets_for_website_image(
id_list: Annotated[
List[str], typer.Argument(help="Accession ID of the study to export")
],
Optional[List[str]], typer.Argument(help="Accession ID of the study to export")
] = None,
root_directory: Annotated[
Optional[Path],
typer.Option(
Expand All @@ -123,6 +141,9 @@ def datasets_for_website_image(
if root_directory:
abs_root = root_directory.resolve()

if not id_list:
id_list = get_study_ids(root_directory)

dataset_map = {}
for id in id_list:
if root_directory:
Expand Down
48 changes: 48 additions & 0 deletions bia-export/bia_export/website_export/export_all.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from glob import glob
from pathlib import Path
from typing import Optional
from bia_export.bia_client import api_client
from bia_integrator_api.models import Study
from .generic_object_retrieval import read_api_json_file
import logging

logger = logging.getLogger("__main__." + __name__)

def find_local_studies(root_path: Path) -> list[Study]:
    """Return a Study object for every JSON file under <root_path>/study/.

    The search is recursive, so study documents nested in subdirectories
    are included as well.
    """
    pattern = str(root_path.joinpath("study", "**/*.json"))
    return [
        read_api_json_file(json_path, Study)
        for json_path in glob(pattern, recursive=True)
    ]


def fetch_studies_from_api(
    fetch_size: int, agregator_list: Optional[list[Study]] = None
) -> list[Study]:
    """Fetch every study from the API, paginating fetch_size at a time.

    Pagination resumes from the uuid of the last study already collected in
    agregator_list (parameter name kept, although misspelt, for backward
    compatibility). Implemented iteratively rather than recursively so that
    an arbitrarily large number of pages cannot exhaust the interpreter's
    recursion limit.

    :param fetch_size: page size requested from the API.
    :param agregator_list: studies already fetched; pagination continues
        from its last element. A new list is created when omitted or empty.
    :return: the aggregated list of all studies.
    """
    if not agregator_list:
        agregator_list = []

    while True:
        # None on the first page makes the API start from the beginning.
        start_uuid = agregator_list[-1].uuid if agregator_list else None
        fetched_studies = api_client.get_studies(
            page_size=fetch_size, start_from_uuid=start_uuid
        )
        agregator_list += fetched_studies
        # A short (or empty) page means the collection is exhausted.
        if len(fetched_studies) != fetch_size:
            return agregator_list


def get_study_ids(root_directory: Optional[Path] = None) -> list[str]:
    """Return an identifier for every study, newest release first.

    When root_directory is given, studies are read from local files and
    accession IDs are returned; otherwise studies are fetched from the API
    (in pages of 100) and uuids are returned.

    :param root_directory: optional root of a local bia-integrator data tree.
    :return: study identifiers sorted by release_date, descending.
    """
    if root_directory:
        studies_list = find_local_studies(root_directory)
        id_attribute = "accession_id"
    else:
        studies_list = fetch_studies_from_api(100)
        id_attribute = "uuid"
    # Single sort shared by both branches; newest release first.
    sorted_studies = sorted(
        studies_list, key=lambda study: study.release_date, reverse=True
    )
    return [getattr(study, id_attribute) for study in sorted_studies]
39 changes: 23 additions & 16 deletions bia-export/bia_export/website_export/generic_object_retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from uuid import UUID
from pydantic import BaseModel
from pydantic.alias_generators import to_snake
from bia_shared_datamodels.bia_data_model import DocumentMixin
from bia_export.bia_client import api_client
from .website_models import CLIContext
import json
Expand Down Expand Up @@ -90,19 +91,25 @@ def retrieve_object(


def get_all_api_results(
uuid: UUID, api_method, page_size_setting=20
) -> List[BaseModel]:
object_list = []
added_objects = api_method(str(uuid), page_size=page_size_setting)
if len(added_objects) > 0:
object_list += added_objects
start_uuid = added_objects[-1].uuid
while page_size_setting == len(added_objects):
added_objects = api_method(
str(uuid),
page_size=page_size_setting,
start_from_uuid=start_uuid,
)
object_list += added_objects
start_uuid = added_objects[-1].uuid
return object_list
uuid: UUID,
api_method,
page_size_setting=20,
aggregator_list: list[DocumentMixin] = None,
) -> list[DocumentMixin]:
if not aggregator_list:
aggregator_list: list[DocumentMixin] = []
start_uuid = None
else:
start_uuid = aggregator_list[-1].uuid

fetched_objects = api_method(
str(uuid),
page_size=page_size_setting,
start_from_uuid=str(start_uuid) if start_uuid else None,
)
aggregator_list += fetched_objects

if len(fetched_objects) != page_size_setting:
return aggregator_list
else:
return get_all_api_results(uuid, api_method, page_size_setting, aggregator_list)
9 changes: 8 additions & 1 deletion bia-export/bia_export/website_export/studies/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
)
from bia_integrator_api import models

from pydantic import BaseModel, Field
from pydantic import Field

from enum import Enum

from typing import List, Optional, Type

Expand Down Expand Up @@ -57,6 +58,11 @@ class Study(models.Study):
)


class CacheUse(Enum):
    """How the website-study export should use the aggregate-metrics cache.

    READ_CACHE reads previously stored aggregation results when present;
    WRITE_CACHE stores freshly computed results for later runs.
    The string values are what users pass on the CLI via --cache/-c.
    """

    READ_CACHE = "read_cache"
    WRITE_CACHE = "write_cache"


class StudyCLIContext(CLIContext):
dataset_file_aggregate_data: dict = Field(
default_factory=dict,
Expand All @@ -73,3 +79,4 @@ class StudyCLIContext(CLIContext):
description="""Tracks e.g. which BioSamples have been displayed in previous dataset sections to
determine whether details should default to open or closed.""",
)
cache_use: Optional[CacheUse] = None
Loading

0 comments on commit 61c7a5d

Please sign in to comment.