Skip to content

Commit

Permalink
added option to export all, and cache of aggregate dataset metrics to…
Browse files Browse the repository at this point in the history
… speed up export
  • Loading branch information
sherwoodf committed Dec 12, 2024
1 parent 089ea7b commit 61c7a5d
Show file tree
Hide file tree
Showing 8 changed files with 4,728 additions and 82 deletions.
24 changes: 12 additions & 12 deletions bia-export/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,18 @@ Run:

This will create `bia-study-metadata.json` using the example test data for studies

Note that with -r (root directory) - local files will be used to generate the export. If using ingest to generate the files, this will usually be: ~/.cache/bia-integrator-data-sm. If no root location is passed, and a study UUID (as opposed to accession ID) is used, then the API will be called to create the files.

If no Accession ID or UUID is passed, all studies will be processed (either based on all studies in the <root-folder>/study/ directory, or by querying for studies in the api). The studies are exported in order of release date.

The two points above hold for all export commands for the api. For the website-study export only, there is an optional cache in order to avoid processing all file references every time an export is performed (as this slows down export a lot). E.g. running:

`poetry run bia-export website-study -o bia-study-metadata.json -c read_cache`

Will export all studies using the cached aggregation (when available) as the counts for images, files, and the list of different file types.



Image Export
Run:

Expand All @@ -48,15 +60,3 @@ Run:
`poetry run bia-export datasets-for-website-image S-BIADTEST -o bia-dataset-export.json -r test/input_data`
This will create `bia-dataset-export.json` using the example test data.


Commands to generate the current BIA data
-----------------------------------------

Study export:

```
poetry run bia-export website-study S-BIAD1218 S-BIAD1201 S-BIAD1034 S-BIAD1026 S-BIAD1287 S-BIAD1055 S-BIAD1185 S-BIAD957 S-BIAD748 S-BIAD850 S-BIAD1018 S-BIAD1077 S-BIAD1122 S-BIAD1163 S-BIAD618 S-BIAD620 S-BIAD650 S-BIAD814 S-BIAD825 S-BIAD954 S-BIAD963 S-BIAD1044 S-BIAD1183 S-BIAD627 S-BIAD826 S-BIAD843 S-BIAD845 S-BIAD849 S-BIAD890 S-BIAD1008 S-BIAD1024 S-BIAD1092 S-BIAD1093 S-BIAD1134 S-BIAD1135 S-BIAD1165 S-BIAD1175 S-BIAD1197 S-BIAD1215 S-BIAD1248 S-BIAD1260 S-BIAD663 S-BIAD831 S-BIAD846 S-BIAD851 S-BIAD887 S-BIAD922 S-BIAD928 S-BIAD1005 S-BIAD1009 S-BIAD1057 S-BIAD1079 S-BIAD1083 S-BIAD1088 S-BIAD1091 S-BIAD1099 S-BIAD1104 S-BIAD1168 S-BIAD1169 S-BIAD1193 S-BIAD1203 S-BIAD1268 S-BIAD1284 S-BIAD1323 S-BIAD607 S-BIAD608 S-BIAD610 S-BIAD616 S-BIAD626 S-BIAD661 S-BIAD694 S-BIAD705 S-BIAD800 S-BIAD813 S-BIAD815 S-BIAD823 S-BIAD824 S-BIAD829 S-BIAD830 S-BIAD847 S-BIAD848 S-BIAD852 S-BIAD861 S-BIAD865 S-BIAD866 S-BIAD884 S-BIAD885 S-BIAD904 S-BIAD931 S-BIAD955 S-BIAD970 S-BIAD986 S-BIAD1012 S-BIAD1019 S-BIAD1028 S-BIAD1039 S-BIAD1064 S-BIAD1078 S-BIAD1084 S-BIAD1090 S-BIAD1094 S-BIAD1095 S-BIAD1097 S-BIAD1102 S-BIAD1114 S-BIAD1116 S-BIAD1119 S-BIAD1130 S-BIAD1133 S-BIAD1151 S-BIAD1152 S-BIAD1157 S-BIAD1159 S-BIAD1162 S-BIAD1167 S-BIAD1196 S-BIAD1199 S-BIAD1200 S-BIAD1204 S-BIAD1236 S-BIAD1244 S-BIAD1245 S-BIAD1267 S-BIAD1271 S-BIAD1272 S-BIAD1274 S-BIAD1285 S-BIAD1293 S-BIAD1308 S-BIAD1327 S-BIAD612 S-BIAD633 S-BIAD646 S-BIAD678 S-BIAD680 S-BIAD684 S-BIAD700 S-BIAD720 S-BIAD725 S-BIAD728 S-BIAD770 S-BIAD855 S-BIAD860 S-BIAD862 S-BIAD864 S-BIAD882 S-BIAD886 S-BIAD915 S-BIAD916 S-BIAD965 S-BIAD967 S-BIAD985 S-BIAD987 S-BIAD992 S-BIAD993 S-BIAD1027 S-BIAD1030 S-BIAD1033 S-BIAD1045 S-BIAD1063 S-BIAD1080 S-BIAD1082 S-BIAD1086 S-BIAD1096 S-BIAD1100 S-BIAD1121 S-BIAD1155 S-BIAD1158 S-BIAD1184 S-BIAD1186 S-BIAD1194 S-BIAD1216 S-BIAD1232 S-BIAD1235 S-BIAD1237 S-BIAD1239 S-BIAD1247 S-BIAD1250 S-BIAD1259 S-BIAD1270 S-BIAD1282 S-BIAD1298 S-BIAD1300 S-BIAD1302 S-BIAD1314 
S-BIAD1316 S-BIAD1333 S-BIAD602 S-BIAD603 S-BIAD606 S-BIAD609 S-BIAD611 S-BIAD617 S-BIAD619 S-BIAD621 S-BIAD624 S-BIAD647 S-BIAD651 S-BIAD652 S-BIAD657 S-BIAD658 S-BIAD664 S-BIAD668 S-BIAD669 S-BIAD676 S-BIAD679 S-BIAD682 S-BIAD696 S-BIAD703 S-BIAD704 S-BIAD767 S-BIAD769 S-BIAD777 S-BIAD807 S-BIAD808 S-BIAD817 S-BIAD821 S-BIAD822 S-BIAD827 S-BIAD832 S-BIAD840 S-BIAD841 S-BIAD842 S-BIAD844 S-BIAD863 S-BIAD900 S-BIAD901 S-BIAD903 S-BIAD910 S-BIAD913 S-BIAD914 S-BIAD926 S-BIAD930 S-BIAD961 S-BIAD969 S-BIAD976 S-BIAD984 S-BIAD988 S-BIAD999 -o bia-study-metadata.json -r ~/.cache/bia-integrator-data-sm/
```

And similarly for the images and datasets.

47 changes: 34 additions & 13 deletions bia-export/bia_export/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
from rich.logging import RichHandler
from typing_extensions import Annotated
from pathlib import Path
from bia_export.website_export.export_all import get_study_ids
from .website_export.studies.transform import transform_study
from .website_export.studies.models import StudyCLIContext
from .website_export.studies.models import StudyCLIContext, CacheUse
from .website_export.images.transform import transform_images
from .website_export.images.models import ImageCLIContext
from .website_export.datasets_for_images.transform import transform_datasets
Expand All @@ -23,7 +24,16 @@

@app.command()
def website_study(
id_list: Annotated[List[str], typer.Argument(help="IDs of the studies to export")],
id_list: Annotated[
Optional[List[str]], typer.Argument(help="IDs of the studies to export")
] = None,
output_filename: Annotated[
Path,
typer.Option(
"--out_file",
"-o",
),
] = Path("bia-images-export.json"),
root_directory: Annotated[
Optional[Path],
typer.Option(
Expand All @@ -32,24 +42,29 @@ def website_study(
help="If root directory specified then use files there, rather than calling API",
),
] = None,
output_filename: Annotated[
Path,
cache: Annotated[
Optional[CacheUse],
typer.Option(
"--out_file",
"-o",
"--cache",
"-c",
),
] = Path("bia-images-export.json"),
] = None,
):

if root_directory:
abs_root = root_directory.resolve()

if not id_list:
id_list = get_study_ids(root_directory)

studies_map = {}
for id in id_list:
if root_directory:
context = StudyCLIContext(root_directory=abs_root, accession_id=id)
context = StudyCLIContext(
root_directory=abs_root, accession_id=id, cache_use=cache
)
else:
context = StudyCLIContext(study_uuid=id)
context = StudyCLIContext(study_uuid=id, cache_use=cache)
study = transform_study(context)
studies_map[study.accession_id] = study.model_dump(mode="json")

Expand All @@ -61,8 +76,8 @@ def website_study(
@app.command()
def website_image(
id_list: Annotated[
List[str], typer.Argument(help="Accession ID of the study to export")
],
Optional[List[str]], typer.Argument(help="Accession ID of the study to export")
] = None,
root_directory: Annotated[
Optional[Path],
typer.Option(
Expand All @@ -84,6 +99,9 @@ def website_image(
if root_directory:
abs_root = root_directory.resolve()

if not id_list:
id_list = get_study_ids(root_directory)

image_map = {}
for id in id_list:
if root_directory:
Expand All @@ -101,8 +119,8 @@ def website_image(
@app.command()
def datasets_for_website_image(
id_list: Annotated[
List[str], typer.Argument(help="Accession ID of the study to export")
],
Optional[List[str]], typer.Argument(help="Accession ID of the study to export")
] = None,
root_directory: Annotated[
Optional[Path],
typer.Option(
Expand All @@ -123,6 +141,9 @@ def datasets_for_website_image(
if root_directory:
abs_root = root_directory.resolve()

if not id_list:
id_list = get_study_ids(root_directory)

dataset_map = {}
for id in id_list:
if root_directory:
Expand Down
48 changes: 48 additions & 0 deletions bia-export/bia_export/website_export/export_all.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from glob import glob
from pathlib import Path
from typing import Optional
from bia_export.bia_client import api_client
from bia_integrator_api.models import Study
from .generic_object_retrieval import read_api_json_file
import logging

logger = logging.getLogger("__main__." + __name__)

def find_local_studies(root_path: Path) -> list[Study]:
    """Return a Study object for every JSON file under <root_path>/study/.

    The search is recursive, so study documents nested in subdirectories
    are included as well.
    """
    pattern = str(root_path.joinpath("study", "**/*.json"))
    return [
        read_api_json_file(json_path, Study)
        for json_path in glob(pattern, recursive=True)
    ]


def fetch_studies_from_api(
    fetch_size: int, agregator_list: Optional[list[Study]] = None
) -> list[Study]:
    """Fetch every study from the API, paginating fetch_size at a time.

    Pagination resumes from the uuid of the last study already collected in
    agregator_list (parameter name kept, although misspelt, for backward
    compatibility). Implemented iteratively rather than recursively so that
    an arbitrarily large number of pages cannot exhaust the interpreter's
    recursion limit.

    :param fetch_size: page size requested from the API.
    :param agregator_list: studies already fetched; pagination continues
        from its last element. A new list is created when omitted or empty.
    :return: the aggregated list of all studies.
    """
    if not agregator_list:
        agregator_list = []

    while True:
        # None on the first page makes the API start from the beginning.
        start_uuid = agregator_list[-1].uuid if agregator_list else None
        fetched_studies = api_client.get_studies(
            page_size=fetch_size, start_from_uuid=start_uuid
        )
        agregator_list += fetched_studies
        # A short (or empty) page means the collection is exhausted.
        if len(fetched_studies) != fetch_size:
            return agregator_list


def get_study_ids(root_directory: Optional[Path] = None) -> list[str]:
    """Return an identifier for every study, newest release first.

    When root_directory is given, studies are read from local files and
    accession IDs are returned; otherwise studies are fetched from the API
    (in pages of 100) and uuids are returned.

    :param root_directory: optional root of a local bia-integrator data tree.
    :return: study identifiers sorted by release_date, descending.
    """
    if root_directory:
        studies_list = find_local_studies(root_directory)
        id_attribute = "accession_id"
    else:
        studies_list = fetch_studies_from_api(100)
        id_attribute = "uuid"
    # Single sort shared by both branches; newest release first.
    sorted_studies = sorted(
        studies_list, key=lambda study: study.release_date, reverse=True
    )
    return [getattr(study, id_attribute) for study in sorted_studies]
39 changes: 23 additions & 16 deletions bia-export/bia_export/website_export/generic_object_retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from uuid import UUID
from pydantic import BaseModel
from pydantic.alias_generators import to_snake
from bia_shared_datamodels.bia_data_model import DocumentMixin
from bia_export.bia_client import api_client
from .website_models import CLIContext
import json
Expand Down Expand Up @@ -90,19 +91,25 @@ def retrieve_object(


def get_all_api_results(
uuid: UUID, api_method, page_size_setting=20
) -> List[BaseModel]:
object_list = []
added_objects = api_method(str(uuid), page_size=page_size_setting)
if len(added_objects) > 0:
object_list += added_objects
start_uuid = added_objects[-1].uuid
while page_size_setting == len(added_objects):
added_objects = api_method(
str(uuid),
page_size=page_size_setting,
start_from_uuid=start_uuid,
)
object_list += added_objects
start_uuid = added_objects[-1].uuid
return object_list
uuid: UUID,
api_method,
page_size_setting=20,
aggregator_list: list[DocumentMixin] = None,
) -> list[DocumentMixin]:
if not aggregator_list:
aggregator_list: list[DocumentMixin] = []
start_uuid = None
else:
start_uuid = aggregator_list[-1].uuid

fetched_objects = api_method(
str(uuid),
page_size=page_size_setting,
start_from_uuid=str(start_uuid) if start_uuid else None,
)
aggregator_list += fetched_objects

if len(fetched_objects) != page_size_setting:
return aggregator_list
else:
return get_all_api_results(uuid, api_method, page_size_setting, aggregator_list)
9 changes: 8 additions & 1 deletion bia-export/bia_export/website_export/studies/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
)
from bia_integrator_api import models

from pydantic import BaseModel, Field
from pydantic import Field

from enum import Enum

from typing import List, Optional, Type

Expand Down Expand Up @@ -57,6 +58,11 @@ class Study(models.Study):
)


class CacheUse(Enum):
    """How the website-study export should use the aggregate-metrics cache.

    READ_CACHE reads previously stored aggregation results when present;
    WRITE_CACHE stores freshly computed results for later runs.
    The string values are what users pass on the CLI via --cache/-c.
    """

    READ_CACHE = "read_cache"
    WRITE_CACHE = "write_cache"


class StudyCLIContext(CLIContext):
dataset_file_aggregate_data: dict = Field(
default_factory=dict,
Expand All @@ -73,3 +79,4 @@ class StudyCLIContext(CLIContext):
description="""Tracks e.g. which BioSamples have been displayed in previous dataset sections to
determine whether details should default to open or closed.""",
)
cache_use: Optional[CacheUse] = None
Loading

0 comments on commit 61c7a5d

Please sign in to comment.