Merge branch 'script-updates-for-cohort-matching'

UoM-Data-Science-Platforms · Oct 3, 2024 · d0032cf · d0032cf
2 parents 106ceb6 + ac08eb7
commit d0032cf
Show file tree

Hide file tree

Showing 3 changed files with 358 additions and 3 deletions.
diff --git a/scripts/generate-sql.js b/scripts/generate-sql.js
@@ -199,11 +199,13 @@ function processNextOutputTable(sql, templateName, config, projectNameChunked) {
   //{{create-output-table-no-gmpseudo-ids::"1_Patients"}
   const outputTableRegex = /\{\{create-output-table::([^}:]*)\}\}/;
   const outputTableNoIdsRegex = /\{\{create-output-table-no-gmpseudo-ids::([^}:]*)\}\}/;
+  const outputTableMatchedCohortRegex = /\{\{create-output-table-matched-cohort::([^}:]*)\}\}/;
 
   let isMatch = false;
 
   const outputTableMatch = outputTableRegex.exec(sql);
   const outputTableNoIdsMatch = outputTableNoIdsRegex.exec(sql);
+  const outputTableMatchedCohortMatch = outputTableMatchedCohortRegex.exec(sql);
   if (outputTableMatch) {
     isMatch = true;
     let needSemiColon = false;
@@ -273,17 +275,103 @@ FROM "AllPseudos_${projectNameChunked.join('_')}";
 -- created in the 0.code-sets.sql file
 DROP TABLE IF EXISTS ${config.PROJECT_SPECIFIC_SCHEMA_FOR_DATA}.${tableName};
 CREATE TABLE ${config.PROJECT_SPECIFIC_SCHEMA_FOR_DATA}.${tableName} AS
-SELECT ${config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES}.gm_pseudo_hash_${projectNameChunked.join(
+SELECT ${config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES}.gm_pseudo_hash_${projectNameChunked.join('_')}("GmPseudo") AS "PatientID",
+	* EXCLUDE "GmPseudo"
+FROM ${config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES}."${tableNameNoQuotes}_WITH_PSEUDO_IDS";`;
+
+    sql =
+      sql.substring(0, indexOfOutputTable) +
+      replacedSql +
+      finalSql +
+      sql.substring(indexOfFinalSemiColon + 1);
+  } else if (outputTableMatchedCohortMatch) {
+    isMatch = true;
+    let needSemiColon = false;
+
+    const indexOfOutputTable = sql.indexOf('{{create-output-table-matched-cohort');
+    let indexOfFinalSemiColon = sql.indexOf(';', indexOfOutputTable);
+    if (indexOfFinalSemiColon < 0) {
+      needSemiColon = true;
+      indexOfFinalSemiColon = sql.length - 1;
+    }
+
+    const [, tableName] = outputTableMatchedCohortMatch;
+    const tableNameNoQuotes = tableName.match(/^"?([^"]+)"?$/)[1];
+
+    const replacedSql =
+      sql.substring(indexOfOutputTable, indexOfFinalSemiColon + 1).replace(
+        outputTableMatchedCohortRegex,
+        `
+-- ... processing ${outputTableMatchedCohortMatch[0].replace(/\{/g, '[').replace(/\}/g, ']')} ... 
+-- ... Need to create an output table called ${tableName} and replace 
+-- ... the GmPseudo column with a study-specific random patient id.
+
+-- First we create a table in an area only visible to the RDEs which contains
+-- the GmPseudos. THESE CANNOT BE RELEASED TO END USERS.
+DROP TABLE IF EXISTS ${
+          config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES
+        }."${tableNameNoQuotes}_WITH_PSEUDO_IDS";
+CREATE TABLE ${
+          config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES
+        }."${tableNameNoQuotes}_WITH_PSEUDO_IDS" AS`
+      ) + (needSemiColon ? ';' : '');
+
+    const finalSql = `
+
+-- Then we check to see if there are any new GmPseudo ids. We do this by making a temp table 
+-- of all "new" GmPseudo ids from either the main column or the matched column. I.e. any GmPseudo ids that 
+-- we've already got a unique id for for this study are excluded
+
+DROP TABLE IF EXISTS "AllPseudos_${projectNameChunked.join('_')}";
+CREATE TEMPORARY TABLE "AllPseudos_${projectNameChunked.join('_')}" AS
+(
+SELECT DISTINCT "GmPseudo" FROM ${
+      config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES
+    }."${tableNameNoQuotes}_WITH_PSEUDO_IDS"
+UNION 
+SELECT DISTINCT "MainCohortMatchedGmPseudo" FROM ${
+      config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES
+    }."${tableNameNoQuotes}_WITH_PSEUDO_IDS"
+)
+EXCEPT
+SELECT "GmPseudo" FROM "Patient_ID_Mapping_${projectNameChunked.join('_')}";
+
+-- Find the highest currently assigned id. Ids are given incrementally, so now ones
+-- need to start at +1 of the current highest
+SET highestPatientId = (
+    SELECT IFNULL(MAX("StudyPatientPseudoId"),0) FROM "Patient_ID_Mapping_${projectNameChunked.join(
       '_'
-    )}("GmPseudo") AS "PatientID", * EXCLUDE "GmPseudo"
+    )}"
+);
+
+-- Make a study specific hash for each new GmPseudo and insert it
+-- into the patient lookup table
+INSERT INTO "Patient_ID_Mapping_${projectNameChunked.join('_')}"
+SELECT
+    "GmPseudo", -- the GM SDE patient ids for patients in this cohort
+    SHA2(CONCAT('${projectNameChunked.join(
+      '_'
+    )}', "GmPseudo")) AS "Hash", -- used to provide a random (study-specific) ordering for the patient ids we provide
+    $highestPatientId + ROW_NUMBER() OVER (ORDER BY "Hash") -- the patient id that we provide to the analysts
+FROM "AllPseudos_${projectNameChunked.join('_')}";
+
+-- Finally, we select from the output table which includes the GmPseudos, in order
+-- to populate the table for the end users where the GmPseudo fields are redacted via a function
+-- created in the 0.code-sets.sql file
+DROP TABLE IF EXISTS ${config.PROJECT_SPECIFIC_SCHEMA_FOR_DATA}.${tableName};
+CREATE TABLE ${config.PROJECT_SPECIFIC_SCHEMA_FOR_DATA}.${tableName} AS
+SELECT ${config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES}.gm_pseudo_hash_${projectNameChunked.join('_')}("GmPseudo") AS "PatientID",
+	${config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES}.gm_pseudo_hash_${projectNameChunked.join('_')}("MainCohortMatchedGmPseudo") AS "MainCohortMatchedPatientID",
+	* EXCLUDE ("GmPseudo", "MainCohortMatchedGmPseudo")
 FROM ${config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES}."${tableNameNoQuotes}_WITH_PSEUDO_IDS";`;
 
     sql =
       sql.substring(0, indexOfOutputTable) +
       replacedSql +
       finalSql +
       sql.substring(indexOfFinalSemiColon + 1);
-  } else if (outputTableNoIdsMatch) {
+
+} else if (outputTableNoIdsMatch) {
     isMatch = true;
     let needSemiColon = false;
 

diff --git a/shared/Reusable queries for data extraction/query-cohort-matching-yob-sex-ethnicity.sql b/shared/Reusable queries for data extraction/query-cohort-matching-yob-sex-ethnicity.sql
@@ -0,0 +1,212 @@
+--┌────────────────────────────────────────────────────┐
+--│ Cohort matching on year of birth / sex 					   │
+--└────────────────────────────────────────────────────┘
+
+-- OBJECTIVE: To take a primary cohort and find a 1:n matched cohort based on year of birth and sex.
+
+-- INPUT: Takes two parameters
+--  - yob-flex: integer - number of years each way that still allow a year of birth match
+--  - num-matches: integer - number of matches for each patient in the cohort
+-- Requires two temp tables to exist as follows:
+-- MainCohort (FK_Patient_Link_ID, Sex, YearOfBirth)
+-- 	- FK_Patient_Link_ID - unique patient id
+--	- Sex - M/F
+--	- YearOfBirth - Integer
+-- PotentialMatches (FK_Patient_Link_ID, Sex, YearOfBirth)
+-- 	- FK_Patient_Link_ID - unique patient id
+--	- Sex - M/F
+--	- YearOfBirth - Integer
+
+-- OUTPUT: A temp table as follows:
+-- #CohortStore (FK_Patient_Link_ID, YearOfBirth, Sex, MatchingPatientId, MatchingYearOfBirth)
+--  - FK_Patient_Link_ID - unique patient id for primary cohort patient
+--  - YearOfBirth - of the primary cohort patient
+--  - Sex - of the primary cohort patient
+--  - MatchingPatientId - id of the matched patient
+--  - MatchingYearOfBirth - year of birth of the matched patient
+
+-- TODO 
+-- A few things to consider when doing matching:
+--  - Consider removing "ghost patients" e.g. people without a primary care record
+--  - Consider matching on practice. Patients in different locations might have different outcomes. Also
+--    for primary care based diagnosing, practices might have different thoughts on severity, timing etc.
+--  - For instances where lots of cases have no matches, consider allowing matching to occur with replacement.
+--    I.e. a patient can match more than one person in the main cohort.
+
+-- First we extend the PrimaryCohort table to give each age-sex combo a unique number
+-- and to avoid polluting the MainCohort table
+
+DROP TABLE IF EXISTS Cases;
+CREATE TEMPORARY TABLE Cases AS
+SELECT "GmPseudo" AS PatientId, 
+	YearOfBirth, 
+	Sex, 
+	EthnicCategory,
+		Row_Number() OVER(PARTITION BY YearOfBirth, Sex, EthnicCategory ORDER BY "GmPseudo") AS CaseRowNumber
+FROM MainCohort;
+
+
+-- Then we do the same with the PotentialMatches table
+DROP TABLE IF EXISTS Matches;
+CREATE TEMPORARY TABLE Matches AS
+SELECT "GmPseudo" AS PatientId, 
+	YearOfBirth, 
+	Sex, 
+	EthnicCategory,
+	Row_Number() OVER(PARTITION BY YearOfBirth, Sex, EthnicCategory ORDER BY "GmPseudo") AS AssignedPersonNumber
+FROM PotentialMatches;
+
+-- Find the number of people with each characteristic in the main cohort
+DROP TABLE IF EXISTS CharacteristicCount;
+CREATE TEMPORARY TABLE CharacteristicCount AS
+SELECT YearOfBirth, Sex, EthnicCategory, COUNT(*) AS "Count" 
+FROM Cases 
+GROUP BY YearOfBirth, Sex, EthnicCategory;
+
+-- Find the number of potential matches for each Age/Sex combination
+-- The output of this is useful for seeing how many matches you can get
+-- SELECT A.YearOfBirth, A.Sex, B.Count / A.Count AS NumberOfPotentialMatchesPerCohortPatient FROM (SELECT * FROM #CharacteristicCount) A LEFT OUTER JOIN (SELECT YearOfBirth, Sex, COUNT(*) AS [Count] FROM #Matches GROUP BY YearOfBirth, Sex) B ON B.YearOfBirth = A.YearOfBirth AND B.Sex = A.Sex ORDER BY NumberOfPotentialMatches,A.YearOfBirth,A.Sex;
+
+-- The final table contains a row for each match, so e.g. if patient 1 has 4
+-- matches then there will be 4 rows in the table for this.
+DROP TABLE IF EXISTS CohortStore;
+CREATE TEMPORARY TABLE CohortStore ( 
+  PatientId BIGINT, 
+  YearOfBirth INT, 
+  Sex nchar(1), 
+  EthnicCategory varchar(50),
+  MatchingPatientId BIGINT,
+  MatchingYearOfBirth INT
+);
+
+--1. First match try to match people exactly. We do this as follows:
+--    - For each YOB/Sex/EthnicCategory combination we find all potential matches. E.g. all patients
+--    - in the potential matches with sex='F' and yob=1957 and EthnicCategory = 'White British'
+--    - We then try to assign a single match to all cohort members with sex='F' and yob=1957 and
+--    - EthnicCategory = 'White British'. If there are still matches unused, we then assign
+--    - a second match to all cohort members. This continues until we either run out of matches,
+--    - or successfully match everyone with the desired number of matches.
+
+DECLARE 
+    counter INT;
+
+BEGIN 
+    counter := 1; 
+
+    WHILE (counter <= {param:num-matches}) DO 
+
+        INSERT INTO CohortStore
+          SELECT c.PatientId, c.YearOfBirth, c.Sex, c.EthnicCategory, p.PatientId AS MatchedPatientId, c.YearOfBirth
+          FROM Cases c
+            INNER JOIN CharacteristicCount cc on cc.YearOfBirth = c.YearOfBirth and cc.Sex = c.Sex and cc.EthnicCategory = c.EthnicCategory
+            INNER JOIN Matches p 
+              ON p.Sex = c.Sex 
+              AND p.YearOfBirth = c.YearOfBirth 
+			  AND p.EthnicCategory = c.EthnicCategory
+              -- This next line is the trick to only matching each person once
+              AND p.AssignedPersonNumber = CaseRowNumber + (:counter - 1) * cc."Count";
+
+           -- We might not need this, but to be extra sure let's delete any patients who 
+           -- we're already using to match people
+           DELETE FROM Matches WHERE PatientId IN (SELECT MatchingPatientId FROM CohortStore);
+
+        counter := counter + 1; 
+
+    END WHILE; 
+
+END; 
+
+--2. Now relax the yob restriction to get extra matches for people with no matches
+
+DECLARE 
+    lastrowinsert1 INT;
+    CohortStoreRowsAtStart1 INT;
+
+BEGIN 
+    lastrowinsert1 := 1; 
+
+    WHILE (lastrowinsert1 > 0) DO 
+    CohortStoreRowsAtStart1 := (SELECT COUNT(*) FROM CohortStore);
+
+		INSERT INTO CohortStore
+		SELECT sub.PatientId, sub.YearOfBirth, sub.Sex, sub.EthnicCategory, MatchedPatientId, MAX(m.YearOfBirth) FROM (
+		SELECT c.PatientId, c.YearOfBirth, c.Sex, c.EthnicCategory, MAX(p.PatientId) AS MatchedPatientId, Row_Number() OVER(PARTITION BY MAX(p.PatientId) ORDER BY p.PatientId) AS AssignedPersonNumber
+		FROM Cases c
+		INNER JOIN Matches p 
+			ON p.Sex = c.Sex 
+			AND p.EthnicCategory = c.EthnicCategory
+			AND p.YearOfBirth >= c.YearOfBirth - {param:yob-flex}
+			AND p.YearOfBirth <= c.YearOfBirth + {param:yob-flex}
+		WHERE c.PatientId in (
+			-- find patients who aren't currently matched
+			select PatientId from Cases except select PatientId from CohortStore
+		)
+		GROUP BY c.PatientId, c.YearOfBirth, c.Sex, c.EthnicCategory, p.PatientId) sub
+		INNER JOIN Matches m 
+			ON m.Sex = sub.Sex 
+			AND m.EthnicCategory = sub.EthnicCategory
+			AND m.PatientId = sub.MatchedPatientId
+			AND m.YearOfBirth >= sub.YearOfBirth - {param:yob-flex}
+			AND m.YearOfBirth <= sub.YearOfBirth + {param:yob-flex}
+		WHERE sub.AssignedPersonNumber = 1
+		GROUP BY sub.PatientId, sub.YearOfBirth, sub.Sex, sub.EthnicCategory, MatchedPatientId;
+
+        lastrowinsert1 := CohortStoreRowsAtStart1 - (SELECT COUNT(*) FROM CohortStore);
+
+		DELETE FROM Matches WHERE PatientId IN (SELECT MatchingPatientId FROM CohortStore);
+
+	END WHILE;
+
+END;
+
+--3. Now relax the yob restriction to get extra matches for people with only 1, 2, 3, ... n-1 matches
+
+DECLARE
+    Counter2 INT;
+    CohortStoreRowsAtStart INT;
+    LastRowInsert INT;
+
+BEGIN
+    Counter2 := 1;
+
+    WHILE (Counter2 < {param:num-matches}) DO
+            LastRowInsert:= 1;
+
+            WHILE (LastRowInsert > 0) DO
+            CohortStoreRowsAtStart := (SELECT COUNT(*) FROM CohortStore);
+
+                DROP TABLE IF EXISTS CohortPatientForEachMatchingPatient;
+                CREATE TEMPORARY TABLE CohortPatientForEachMatchingPatient AS
+                SELECT p.PatientId AS MatchedPatientId, c.PatientId, Row_Number() OVER(PARTITION BY p.PatientId ORDER BY p.PatientId) AS MatchedPatientNumber
+                FROM Matches p
+                INNER JOIN Cases c
+                  ON p.Sex = c.Sex 
+				  AND p.EthnicCategory = c.EthnicCategory
+                  AND p.YearOfBirth >= c.YearOfBirth - {param:yob-flex}
+                  AND p.YearOfBirth <= c.YearOfBirth + {param:yob-flex}
+                WHERE c.PatientId IN (
+                  -- find patients who only have @Counter2 matches
+                  SELECT PatientId FROM CohortStore GROUP BY PatientId HAVING count(*) = :Counter2
+                );
+
+                DROP TABLE IF EXISTS CohortPatientForEachMatchingPatientWithCohortNumbered;
+                CREATE TEMPORARY TABLE CohortPatientForEachMatchingPatientWithCohortNumbered AS
+                SELECT PatientId, MatchedPatientId, Row_Number() OVER(PARTITION BY PatientId ORDER BY MatchedPatientId) AS PatientNumber
+                FROM CohortPatientForEachMatchingPatient
+                WHERE MatchedPatientNumber = 1;
+
+                INSERT INTO CohortStore
+                SELECT s.PatientId, c.YearOfBirth, c.Sex, c.EthnicCategory, MatchedPatientId, m.YearOfBirth FROM CohortPatientForEachMatchingPatientWithCohortNumbered s
+                LEFT OUTER JOIN Cases c ON c.PatientId = s.PatientId
+                LEFT OUTER JOIN Matches m ON m.PatientId = MatchedPatientId
+                WHERE PatientNumber = 1;
+
+                lastrowinsert := CohortStoreRowsAtStart - (SELECT COUNT(*) FROM CohortStore);
+
+                DELETE FROM Matches WHERE PatientId IN (SELECT MatchingPatientId FROM CohortStore);
+
+            END WHILE;
+
+    Counter2 := Counter2  + 1;
+    END WHILE;
+END;
diff --git a/shared/Reusable queries for data extraction/query-get-possible-patientsSDE.sql b/shared/Reusable queries for data extraction/query-get-possible-patientsSDE.sql
@@ -0,0 +1,55 @@
+
+--┌─────────────────────────────────────────────────────────────────┐
+--│ Create table of patients who were alive at the study start date │
+--└─────────────────────────────────────────────────────────────────┘
+
+-- ** any patients opted out of sharing GP data would not appear in the final table
+
+-- this script requires an input of StudyStartDate
+
+-- takes one parameter: 
+-- minimum-age : integer - The minimum age of the group of patients. Typically this would be 0 (all patients) or 18 (all adults)
+
+--ALL DEATHS 
+
+DROP TABLE IF EXISTS Death;
+CREATE TEMPORARY TABLE Death AS
+SELECT 
+    DEATH."GmPseudo",
+    TO_DATE(DEATH."RegisteredDateOfDeath") AS DeathDate,
+	OM."DiagnosisOriginalMentionCode",
+    OM."DiagnosisOriginalMentionDesc",
+    OM."DiagnosisOriginalMentionChapterCode",
+    OM."DiagnosisOriginalMentionChapterDesc",
+    OM."DiagnosisOriginalMentionCategory1Code",
+    OM."DiagnosisOriginalMentionCategory1Desc"
+FROM PRESENTATION.NATIONAL_FLOWS_PCMD."DS1804_Pcmd" DEATH
+LEFT JOIN PRESENTATION.NATIONAL_FLOWS_PCMD."DS1804_PcmdDiagnosisOriginalMentions" OM 
+        ON OM."XSeqNo" = DEATH."XSeqNo" AND OM."DiagnosisOriginalMentionNumber" = 1;
+
+-- GET LATEST SNAPSHOT OF DEMOGRAPHICS TABLE
+
+DROP TABLE IF EXISTS LatestSnapshot;
+CREATE TEMPORARY TABLE LatestSnapshot AS
+SELECT 
+    p.*
+FROM PRESENTATION.GP_RECORD."DemographicsProtectedCharacteristics_SecondaryUses" p 
+INNER JOIN (
+    SELECT "GmPseudo", MAX("Snapshot") AS LatestSnapshot
+    FROM PRESENTATION.GP_RECORD."DemographicsProtectedCharacteristics_SecondaryUses" p 
+	WHERE DATEDIFF(YEAR, TO_DATE("DateOfBirth"), $StudyStartDate) >= {param:minimum-age} -- adults only
+    GROUP BY "GmPseudo"
+    ) t2
+ON t2."GmPseudo" = p."GmPseudo" AND t2.LatestSnapshot = p."Snapshot";
+
+-- FIND ALL ADULT PATIENTS ALIVE AT STUDY START DATE
+
+DROP TABLE IF EXISTS AlivePatientsAtStart;
+CREATE TEMPORARY TABLE AlivePatientsAtStart AS 
+SELECT  
+    dem.*, 
+    Death.DeathDate
+FROM LatestSnapshot dem
+LEFT JOIN Death ON Death."GmPseudo" = dem."GmPseudo"
+WHERE 
+    (DeathDate IS NULL OR DeathDate > $StudyStartDate); -- alive on study start date