From 5ffe7bd099b936ad6b7640885d470d9410d608b1 Mon Sep 17 00:00:00 2001 From: Vincent Emonet Date: Wed, 10 Apr 2024 17:24:10 +0200 Subject: [PATCH] store airlock info at upload time --- README.md | 91 +++++++++++++---------- backend/pyproject.toml | 2 +- backend/src/decentriq.py | 5 +- backend/src/models.py | 1 + backend/src/upload.py | 13 +++- backend/src/utils.py | 2 + frontend/src/components/Nav.tsx | 4 +- frontend/src/pages/_app.tsx | 2 +- frontend/src/pages/cohorts/[cohortId].tsx | 4 + frontend/src/pages/index.tsx | 16 ++++ frontend/src/pages/upload.tsx | 2 +- 11 files changed, 92 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index b9ce9b9..37fb904 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ -# ๐Ÿซ€ iCare4CVD Cohort Explorer +# ๐Ÿซ€ iCARE4CVD Cohort Explorer -Webapp built for the [iCare4CVD project](https://icare4cvd.eu). +Webapp built for the [iCARE4CVD project](https://icare4cvd.eu). It aims to enable data owners and data scientists to: * ๐Ÿ” Login with their [Decentriq](https://www.decentriq.com/) account (OAuth based authentication, can be easily switch to other providers). Only accounts with the required permissions will be able to access the webapp. - * โœ‰๏ธ Contact [Decentriq](https://www.decentriq.com/) to request an account if you are part of the iCare4CVD project + * โœ‰๏ธ Contact [Decentriq](https://www.decentriq.com/) to request an account if you are part of the iCARE4CVD project * ๐Ÿ“ค Data owners upload CSV cohort metadata files describing the variables of a study cohort * ๐Ÿ”Ž Data scientists explore available cohorts and their variables through a web app: * Full text search across all cohorts and variables @@ -14,7 +14,7 @@ It aims to enable data owners and data scientists to: * ๐Ÿ”— Data owners can map each variable of their cohorts to standard concepts, sourced from [OHDSI Athena](https://athena.ohdsi.org/search-terms/terms?query=) API (SNOMEDCT, LOINC...) through the web app. 
* Mapping variables will help with data processing and exploration (โš ๏ธ work in progress) * We use namespaces from the [Bioregistry](https://bioregistry.io) to convert concepts CURIEs to URIs. -* ๐Ÿ›’ Data scientists can add the cohorts they need to perform their analysis to a Data Clean Room (DCR) +* ๐Ÿ›’ Data scientists can add the cohorts they need to perform their analysis to a [Data Clean Room](https://www.decentriq.com/) (DCR) on the Decentriq platform. * Once complete, the data scientists can publish their DCR to Decentriq in one click. * The DCR will be automatically created with a data schema corresponding to the selected cohorts, generated from the metadata provided by the data owners. * The data scientist can then access their DCR in Decentriq, write the code for their analysis, and request computation of this code on the provisioned cohorts. @@ -113,7 +113,7 @@ pnpm dev ### ๐Ÿงน Code formatting and linting -Automatically format Python code with ruff and black, and TypeScript code with prettier. +Automatically format Python code with ruff and black, and TypeScript code with prettier: ```bash ./scripts/fmt.sh @@ -125,47 +125,33 @@ Deploy on a server in production with docker compose. Put the excel spreadsheet with all cohorts metadata in `data/iCARE4CVD_Cohorts.xlsx`. Uploaded cohorts will go to separated folders in `data/cohorts/` -Generate a secret key used to encode/decode JWT token for a secure authentication system: +1. 
Generate a secret key used to encode/decode JWT tokens for a secure authentication system: -```bash -python -c "import secrets ; print(secrets.token_urlsafe(32))" -``` - -Create a `.env` file with secret configuration: - -```bash -AUTH_ENDPOINT=https://auth0.com -CLIENT_ID=AAA -CLIENT_SECRET=BBB -DECENTRIQ_EMAIL=ccc@ddd.com -DECENTRIQ_TOKEN=EEE -JWT_SECRET=vCitcsPBwH4BMCwEqlO1aHJSIn--usrcyxPPRbeYdHM -ADMINS=admin1@email.com,admin2@email.com -``` - -Deploy: - -```bash -docker compose -f docker-compose.prod.yml up -d -``` - -## ๐Ÿช„ Administration + ```bash + python -c "import secrets ; print(secrets.token_urlsafe(32))" + ``` -### โœจ Automatically generate variables metadata +2. Create a `.env` file with secret configuration: -You can use the [`csvw-ontomap`](https://github.com/vemonet/csvw-ontomap) python package to automatically generate a CSV metadata file for your data file, with the format expected by iCARE4CVD. It will automatically fill the following columns: var name, var type, categorical, min, max + ```bash + AUTH_ENDPOINT=https://auth0.com + CLIENT_ID=AAA + CLIENT_SECRET=BBB + DECENTRIQ_EMAIL=ccc@ddd.com + DECENTRIQ_TOKEN=EEE + JWT_SECRET=vCitcsPBwH4BMCwEqlO1aHJSIn--usrcyxPPRbeYdHM + ADMINS=admin1@email.com,admin2@email.com + ``` -Install the package: +3. Deploy the stack for production: -```bash -pip install git+https://github.com/vemonet/csvw-ontomap.git -``` + ```bash + docker compose -f docker-compose.prod.yml up -d + ``` -Run profiling, supports `.csv`, `.xlsx`, `.sav`: +We currently use [nginx-proxy](https://github.com/nginx-proxy/nginx-proxy) for routing through environment variables in the `docker-compose.yml` file; you can swap it for a proxy of your choice. 
-```bash -csvw-ontomap data/COHORT_data.sav -o data/COHORT_datadictionary.csv -``` +## ๐Ÿช„ Administration ### ๐Ÿ—‘๏ธ Reset database @@ -175,6 +161,15 @@ Reset the database by deleting the `data/db` folder: rm -rf data/db ``` +On the next restart of the application, the database will be re-populated using the data dictionary CSV files stored on the server. + +> [!WARNING] +> +> Reset the database only if really necessary, as it will cause the loss of: +> +> - All concept mappings added from the Cohort Explorer +> - The info about Decentriq airlock data preview for cohorts that have been uploaded (it will default to false when recreating the database; admins can update it by downloading and reuploading the cohorts with the right airlock setting) + ### ๐Ÿ’พ Backup database It can be convenient to dump the content of the triplestore database to create a backup. @@ -218,3 +213,21 @@ docker compose exec backend curl -X POST -T /data/triplestore_dump_20240225.nq - ### ๐Ÿšš Move the app If you need to move the app to a different server, just copy the whole `data/` folder. + +### โœจ Automatically generate variables metadata + +Experimental: you can use the [`csvw-ontomap`](https://github.com/vemonet/csvw-ontomap) python package to automatically generate a CSV metadata file for your data file, with the format expected by iCARE4CVD. It will automatically fill the following columns: var name, var type, categorical, min, max. However, it does not properly extract datetime data types. 
+ +Install the package: + +```bash +pip install git+https://github.com/vemonet/csvw-ontomap.git +``` + +Run profiling, supports `.csv`, `.xlsx`, `.sav`: + +```bash +csvw-ontomap data/COHORT_data.sav -o data/COHORT_datadictionary.csv +``` + +### diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 6187556..c4ee843 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "hatchling.build" requires-python = ">=3.8" version = "0.0.1" name = "cohort-explorer-backend" -description = "Backend for the iCare4CVD Cohort Explorer." +description = "Backend for the iCARE4CVD Cohort Explorer." license = "MIT" authors = [ { name = "Vincent Emonet", email = "vincent.emonet@gmail.com" }, diff --git a/backend/src/decentriq.py b/backend/src/decentriq.py index 133910f..49c45c9 100644 --- a/backend/src/decentriq.py +++ b/backend/src/decentriq.py @@ -116,7 +116,6 @@ def pandas_script_merge_cohorts(merged_cohorts: dict[str, list[str]], all_cohort ) async def create_compute_dcr( cohorts_request: dict[str, Any], - airlock: bool = True, user: Any = Depends(get_current_user), ) -> dict[str, Any]: """Create a Data Clean Room for computing with the cohorts requested using Decentriq SDK""" @@ -168,7 +167,9 @@ async def create_compute_dcr( builder.add_node_definition(TableDataNodeDefinition(name=data_node_id, columns=get_cohort_schema(cohort), is_required=True)) data_nodes.append(data_node_id) - if airlock: + # TODO: made airlock always True for testing + # if cohort.airlock: + if True: # Add airlock node to make it easy to access small part of the dataset preview_node_id = f"preview-{data_node_id}" builder.add_node_definition(PreviewComputeNodeDefinition( diff --git a/backend/src/models.py b/backend/src/models.py index 9624eb5..01c5fc3 100644 --- a/backend/src/models.py +++ b/backend/src/models.py @@ -50,6 +50,7 @@ class Cohort: study_population: Optional[str] = None study_objective: Optional[str] = None variables: Dict[str, CohortVariable] 
= field(default_factory=dict) + airlock: bool = False can_edit: bool = False def dict(self): diff --git a/backend/src/upload.py b/backend/src/upload.py index 71d0fc8..293fb78 100644 --- a/backend/src/upload.py +++ b/backend/src/upload.py @@ -185,7 +185,7 @@ def to_camelcase(s: str) -> str: s = sub(r"(_|-)+", " ", s).title().replace(" ", "") return "".join([s[0].lower(), s[1:]]) -def load_cohort_dict_file(dict_path: str, cohort_id: str, user_email: str) -> Dataset: +def load_cohort_dict_file(dict_path: str, cohort_id: str, airlock: bool) -> Dataset: """Parse the cohort dictionary uploaded as excel or CSV spreadsheet, and load it to the triplestore""" # print(f"Loading dictionary {dict_path}") # df = pd.read_csv(dict_path) if dict_path.endswith(".csv") else pd.read_excel(dict_path) @@ -220,6 +220,7 @@ def load_cohort_dict_file(dict_path: str, cohort_id: str, user_email: str) -> Da g = init_graph() g.add((cohort_uri, RDF.type, ICARE.Cohort, cohort_uri)) g.add((cohort_uri, DC.identifier, Literal(cohort_id), cohort_uri)) + g.add((cohort_uri, ICARE.previewEnabled, Literal(str(airlock).lower()), cohort_uri)) # Record all errors and raise them at the end errors = [] @@ -310,6 +311,7 @@ async def upload_cohort( cohort_id: str = Form(...), cohort_dictionary: UploadFile = File(...), cohort_data: UploadFile | None = None, + airlock: bool = True, ) -> dict[str, Any]: """Upload a cohort metadata file to the server and add its variables to the triplestore.""" user_email = user["email"] @@ -326,6 +328,8 @@ async def upload_cohort( detail=f"User {user_email} cannot edit cohort {cohort_id}", ) + cohort_info.airlock = airlock + # Create directory named after cohort_id cohorts_folder = os.path.join(settings.data_folder, "cohorts", cohort_id) os.makedirs(cohorts_folder, exist_ok=True) @@ -354,7 +358,7 @@ async def upload_cohort( shutil.copyfileobj(cohort_dictionary.file, buffer) try: - g = load_cohort_dict_file(metadata_path, cohort_id, user_email) + g = 
load_cohort_dict_file(metadata_path, cohort_id, airlock) # Delete previous graph for this file from triplestore delete_existing_triples(get_cohort_uri(cohort_id)) publish_graph_to_endpoint(g) @@ -463,8 +467,9 @@ def init_triplestore() -> None: folder_path = os.path.join(settings.data_folder, "cohorts", folder) if os.path.isdir(folder_path): for file in glob.glob(os.path.join(folder_path, "*_datadictionary.*")): - # TODO: currently when we reset all existing cohorts default to the main admin - g = load_cohort_dict_file(file, folder, settings.decentriq_email) + # NOTE: default airlock preview to false if we ever need to reset cohorts, + # admins can easily ddl and reupload the cohorts with the correct airlock value + g = load_cohort_dict_file(file, folder, False) g.serialize(f"{settings.data_folder}/cohort_explorer_triplestore.trig", format="trig") if publish_graph_to_endpoint(g): print(f"๐Ÿ’พ Triplestore initialization: added {len(g)} triples for cohorts {file}.") diff --git a/backend/src/utils.py b/backend/src/utils.py index a8a780c..21789b7 100644 --- a/backend/src/utils.py +++ b/backend/src/utils.py @@ -55,6 +55,7 @@ def run_query(query: str) -> dict[str, Any]: OPTIONAL { ?cohort icare:studyOngoing ?study_ongoing . } OPTIONAL { ?cohort icare:studyPopulation ?study_population . } OPTIONAL { ?cohort icare:studyObjective ?study_objective . } + OPTIONAL { ?cohort icare:previewEnabled ?airlock . 
} } OPTIONAL { @@ -139,6 +140,7 @@ def retrieve_cohorts_metadata(user_email: str) -> dict[str, Cohort]: study_population=get_value("study_population", row), study_objective=get_value("study_objective", row), variables={}, # You might want to populate this separately, depending on your data structure + airlock=get_value("airlock", row), can_edit=user_email in [*settings.admins_list, get_value("cohortEmail", row)], ) elif get_value("cohortEmail", row) not in target_dict[cohort_id].cohort_email: diff --git a/frontend/src/components/Nav.tsx b/frontend/src/components/Nav.tsx index 545883b..8bc73fb 100644 --- a/frontend/src/components/Nav.tsx +++ b/frontend/src/components/Nav.tsx @@ -130,7 +130,7 @@ export function Nav() {
- iCare4CVD Cohort Explorer + iCARE4CVD Cohort Explorer
@@ -213,7 +213,7 @@ export function Nav() {

โœ… Data Clean Room{' '} - + {publishedDCR['dcr_title']} {' '} published in Decentriq. diff --git a/frontend/src/pages/_app.tsx b/frontend/src/pages/_app.tsx index a452702..d7df5ce 100644 --- a/frontend/src/pages/_app.tsx +++ b/frontend/src/pages/_app.tsx @@ -12,7 +12,7 @@ export default function App({Component, pageProps}: AppProps) { - + Cohort Explorer diff --git a/frontend/src/pages/cohorts/[cohortId].tsx b/frontend/src/pages/cohorts/[cohortId].tsx index 46353f5..d7eb3b9 100644 --- a/frontend/src/pages/cohorts/[cohortId].tsx +++ b/frontend/src/pages/cohorts/[cohortId].tsx @@ -1,5 +1,9 @@ 'use client'; +// NOTE: this page is not really used in the current version of the app +// All cohorts are accessed from the cohorts.tsx page +// We keep it here as a placeholder in case we want to add pages for each cohort + import React, {useState} from 'react'; import {useRouter} from 'next/router'; import {useCohorts} from '@/components/CohortsContext'; diff --git a/frontend/src/pages/index.tsx b/frontend/src/pages/index.tsx index bdcd28f..040ab55 100644 --- a/frontend/src/pages/index.tsx +++ b/frontend/src/pages/index.tsx @@ -48,6 +48,22 @@ export default function Home() { Explore and search data dictionaries of available Cohorts

+ + +

+ Technical details{' '} + + -> + +

+

+ View the documentation and source code on GitHub +

+
); diff --git a/frontend/src/pages/upload.tsx b/frontend/src/pages/upload.tsx index 4da368f..4b37d36 100644 --- a/frontend/src/pages/upload.tsx +++ b/frontend/src/pages/upload.tsx @@ -223,7 +223,7 @@ export default function UploadPage() {

โœ… Data Clean Room{' '} - + {publishedDCR['dcr_title']} {' '} published.