From 9c3d26c95837054630f7445655cf979f503f8d62 Mon Sep 17 00:00:00 2001 From: Vincent Emonet Date: Fri, 12 Apr 2024 11:55:50 +0200 Subject: [PATCH] fix concept categories --- README.md | 17 +++++++++------ backend/src/models.py | 1 + backend/src/upload.py | 25 +++++++++++------------ backend/src/utils.py | 4 +++- frontend/src/components/VariablesList.tsx | 4 ++-- frontend/src/types.ts | 10 +++++---- 6 files changed, 35 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 9051e8e..a14e7d5 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,6 @@ It aims to enable *data custodians* and *data scientists* to: * The DCR will be automatically created with a data schema corresponding to the selected cohorts, generated from the metadata provided by the data custodians. * The data scientist can then access their DCR in Decentriq, write the code for their analysis, and request computation of this code on the provisioned cohorts. -> [!WARNING] -> -> If you logged in with a Decentriq user that does not have access to the Cohort Explorer, and need to re-login with another user: you will need to clear cache and cookies. Because Auth0 will keep your login in mind for some time, and it can be quite tricky to reset (they don't give the tools for managing that properly). - > [!IMPORTANT] > > Only the owner of the cohort (as described in the spreadsheet holding all cohorts generic metadata), and the platform admins, can upload the data dictionary or edit mappings for a cohort. @@ -33,6 +29,13 @@ It aims to enable *data custodians* and *data scientists* to: > > You can reupload a cohort dictionary that have been already uploaded (in case you need to fix something). The mappings defined via the Cohort Explorer will be kept, as long as the variables names do not change. 
+## ⚠️ Known issues + +Here are some known "issues" with the Cohort Explorer, and how to fix them: + +- [ ] If you logged in with a Decentriq user that does not have access to the Cohort Explorer, and need to re-login with another user: you will need to clear cache and cookies. Because Auth0 will keep your login in mind for some time, and it can be quite tricky to reset (they don't give the tools for managing that properly). +- [ ] After a period of inactivity you might see a black screen with an error message; in this case, just reload the page. + ## 🗺️ Technical overview This platform is composed of 3 main components: @@ -48,7 +51,7 @@ This platform is composed of 3 main components: 🔐 Authentication is done through the Decentriq OAuth provider, but it could be replaced by any other OAuth provider easily. Once the user logged in through the external OAuth provider, the backend generates an encrypted JWT token, which is passed to the frontend using HTTP-only cookies. -> \[!NOTE] +> [!NOTE] > > All metadata about cohorts and variables are retrieved by one mighty SPARQL query, and passed to the frontend as one big dictionary. Filtering and searching is then done in TypeScript on this cohorts dictionary. > @@ -56,7 +59,9 @@ This platform is composed of 3 main components: ## ☑️ To do -* [ ] Integrate LUCE blockchain component. Should it be deployed separately, or as a service in the `docker-compose.yml`? +* [ ] Integrate the LUCE blockchain component for data sharing consent: + * [ ] We will store blockchain addresses, handle authentication, and add the UI elements directly in the Cohort Explorer (we can even store private keys or do wallet stuff there too if needed) + * [ ] But we need to be able to query the blockchain easily through an API from our system (a basic HTTP OpenAPI would suffice, e.g. 
built with [FastAPI](https://fastapi.tiangolo.com)) ## 🧑‍💻 Development diff --git a/backend/src/models.py b/backend/src/models.py index 01c5fc3..8902506 100644 --- a/backend/src/models.py +++ b/backend/src/models.py @@ -8,6 +8,7 @@ class VariableCategory: value: str label: str + concept_id: Optional[str] = None mapped_id: Optional[str] = None mapped_label: Optional[str] = None diff --git a/backend/src/upload.py b/backend/src/upload.py index aff1ccd..c0a069a 100644 --- a/backend/src/upload.py +++ b/backend/src/upload.py @@ -211,7 +211,7 @@ def load_cohort_dict_file(dict_path: str, cohort_id: str) -> Dataset: ) df["categories"] = df["CATEGORICAL"].apply(parse_categorical_string) if "Label Concept Code" in df.columns: - df["concept_id"] = str(df["Label Concept Code"]).strip() + df["concept_id"] = df.apply(lambda row: str(row["Label Concept Code"]).strip(), axis=1) else: # Try to get IDs from old format multiple columns df["concept_id"] = df.apply(lambda row: get_id_from_multi_columns(row), axis=1) @@ -245,35 +245,34 @@ def load_cohort_dict_file(dict_path: str, cohort_id: str) -> Dataset: categories_codes = [] if row.get("Categorical Value Concept Code"): categories_codes = row["Categorical Value Concept Code"].split(",") - # Add properties - for column, value in row.items(): - # if value and column not in ["categories"]: - if column not in ["categories"] and value: + for column, col_value in row.items(): + if column not in ["categories"] and col_value: + # NOTE: we literally use the column name as the property URI in camelcase (that's what I call lazy loading!) 
property_uri = ICARE[to_camelcase(column)] if ( - isinstance(value, str) - and (value.startswith("http://") or value.startswith("https://")) - and " " not in value + isinstance(col_value, str) + and (col_value.startswith("http://") or col_value.startswith("https://")) + and " " not in col_value ): - g.add((variable_uri, property_uri, URIRef(value), cohort_uri)) + g.add((variable_uri, property_uri, URIRef(col_value), cohort_uri)) else: - g.add((variable_uri, property_uri, Literal(value), cohort_uri)) + g.add((variable_uri, property_uri, Literal(col_value), cohort_uri)) # Handle Category if column in ["categories"]: - if len(value) == 1: + if len(col_value) == 1: errors.append( f"Row {i+2} for variable `{row['VARIABLE NAME']}` has only one category `{row['categories'][0]['value']}`. It should have at least two." ) continue - for index, category in enumerate(value): + for index, category in enumerate(col_value): cat_uri = get_category_uri(variable_uri, index) g.add((variable_uri, ICARE.categories, cat_uri, cohort_uri)) g.add((cat_uri, RDF.type, ICARE.VariableCategory, cohort_uri)) g.add((cat_uri, RDF.value, Literal(category["value"]), cohort_uri)) g.add((cat_uri, RDFS.label, Literal(category["label"]), cohort_uri)) try: - if categories_codes: + if categories_codes and str(categories_codes[index]).strip() != "na": cat_code_uri = converter.expand(str(categories_codes[index]).strip()) if not cat_code_uri: errors.append( diff --git a/backend/src/utils.py b/backend/src/utils.py index edc5d6e..3df25d8 100644 --- a/backend/src/utils.py +++ b/backend/src/utils.py @@ -41,7 +41,7 @@ def run_query(query: str) -> dict[str, Any]: SELECT DISTINCT ?cohortId ?cohortInstitution ?cohortType ?cohortEmail ?study_type ?study_participants ?study_duration ?study_ongoing ?study_population ?study_objective ?airlock ?variable ?varName ?varLabel ?varType ?index ?count ?na ?max ?min ?units ?formula ?definition - ?omopDomain ?conceptId ?mappedId ?mappedLabel ?visits ?categoryValue ?categoryLabel 
?categoryMappedId ?categoryMappedLabel + ?omopDomain ?conceptId ?mappedId ?mappedLabel ?visits ?categoryValue ?categoryLabel ?categoryConceptId ?categoryMappedId ?categoryMappedLabel WHERE { GRAPH ?cohortMetadataGraph { ?cohort a icare:Cohort ; @@ -79,6 +79,7 @@ def run_query(query: str) -> dict[str, Any]: ?variable icare:categories ?category. ?category rdfs:label ?categoryLabel ; rdf:value ?categoryValue . + OPTIONAL { ?category icare:conceptId ?categoryConceptId } } } } @@ -180,6 +181,7 @@ def retrieve_cohorts_metadata(user_email: str) -> dict[str, Cohort]: VariableCategory( value=str(row["categoryValue"]["value"]), label=str(row["categoryLabel"]["value"]), + concept_id=get_curie_value("categoryConceptId", row), mapped_id=get_curie_value("categoryMappedId", row), mapped_label=get_value("categoryMappedLabel", row), ) diff --git a/frontend/src/components/VariablesList.tsx b/frontend/src/components/VariablesList.tsx index 84298f0..f04ae87 100644 --- a/frontend/src/components/VariablesList.tsx +++ b/frontend/src/components/VariablesList.tsx @@ -314,8 +314,8 @@ const VariablesList = ({cohortId, searchFilters = {searchQuery: ''}}: any) => { handleConceptSelect(variable.var_name, concept, index)} canEdit={cohortsData[cohortId].can_edit} /> diff --git a/frontend/src/types.ts b/frontend/src/types.ts index 5be53e7..da55cd6 100644 --- a/frontend/src/types.ts +++ b/frontend/src/types.ts @@ -25,19 +25,21 @@ export interface Variable { visits: string; formula: string; definition: string; - concept_id: string; omop_domain: string; index: number; + concept_id: string; + mapped_id: string | null; + mapped_label: string | null; categories: Category[]; - mapped_concept: string | null; [key: string]: any; } export interface Category { value: string; label: string; - concept_id: string; - mapped_concept: string | null; + concept_id: string | null; + mapped_id: string | null; + mapped_label: string | null; } export interface Concept {