Skip to content

Commit

Permalink
Merge branch 'script-updates-for-cohort-matching'
Browse files Browse the repository at this point in the history
  • Loading branch information
Georgetilston committed Oct 3, 2024
2 parents 106ceb6 + ac08eb7 commit d0032cf
Show file tree
Hide file tree
Showing 3 changed files with 358 additions and 3 deletions.
94 changes: 91 additions & 3 deletions scripts/generate-sql.js
Original file line number Diff line number Diff line change
Expand Up @@ -199,11 +199,13 @@ function processNextOutputTable(sql, templateName, config, projectNameChunked) {
//{{create-output-table-no-gmpseudo-ids::"1_Patients"}
const outputTableRegex = /\{\{create-output-table::([^}:]*)\}\}/;
const outputTableNoIdsRegex = /\{\{create-output-table-no-gmpseudo-ids::([^}:]*)\}\}/;
const outputTableMatchedCohortRegex = /\{\{create-output-table-matched-cohort::([^}:]*)\}\}/;

let isMatch = false;

const outputTableMatch = outputTableRegex.exec(sql);
const outputTableNoIdsMatch = outputTableNoIdsRegex.exec(sql);
const outputTableMatchedCohortMatch = outputTableMatchedCohortRegex.exec(sql);
if (outputTableMatch) {
isMatch = true;
let needSemiColon = false;
Expand Down Expand Up @@ -273,17 +275,103 @@ FROM "AllPseudos_${projectNameChunked.join('_')}";
-- created in the 0.code-sets.sql file
DROP TABLE IF EXISTS ${config.PROJECT_SPECIFIC_SCHEMA_FOR_DATA}.${tableName};
CREATE TABLE ${config.PROJECT_SPECIFIC_SCHEMA_FOR_DATA}.${tableName} AS
SELECT ${config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES}.gm_pseudo_hash_${projectNameChunked.join(
SELECT ${config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES}.gm_pseudo_hash_${projectNameChunked.join('_')}("GmPseudo") AS "PatientID",
* EXCLUDE "GmPseudo"
FROM ${config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES}."${tableNameNoQuotes}_WITH_PSEUDO_IDS";`;

sql =
sql.substring(0, indexOfOutputTable) +
replacedSql +
finalSql +
sql.substring(indexOfFinalSemiColon + 1);
} else if (outputTableMatchedCohortMatch) {
isMatch = true;
let needSemiColon = false;

const indexOfOutputTable = sql.indexOf('{{create-output-table-matched-cohort');
let indexOfFinalSemiColon = sql.indexOf(';', indexOfOutputTable);
if (indexOfFinalSemiColon < 0) {
needSemiColon = true;
indexOfFinalSemiColon = sql.length - 1;
}

const [, tableName] = outputTableMatchedCohortMatch;
const tableNameNoQuotes = tableName.match(/^"?([^"]+)"?$/)[1];

const replacedSql =
sql.substring(indexOfOutputTable, indexOfFinalSemiColon + 1).replace(
outputTableMatchedCohortRegex,
`
-- ... processing ${outputTableMatchedCohortMatch[0].replace(/\{/g, '[').replace(/\}/g, ']')} ...
-- ... Need to create an output table called ${tableName} and replace
-- ... the GmPseudo column with a study-specific random patient id.
-- First we create a table in an area only visible to the RDEs which contains
-- the GmPseudos. THESE CANNOT BE RELEASED TO END USERS.
DROP TABLE IF EXISTS ${
config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES
}."${tableNameNoQuotes}_WITH_PSEUDO_IDS";
CREATE TABLE ${
config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES
}."${tableNameNoQuotes}_WITH_PSEUDO_IDS" AS`
) + (needSemiColon ? ';' : '');

const finalSql = `
-- Then we check to see if there are any new GmPseudo ids. We do this by making a temp table
-- of all "new" GmPseudo ids from either the main column or the matched column. I.e. any GmPseudo ids that
-- we've already got a unique id for for this study are excluded
DROP TABLE IF EXISTS "AllPseudos_${projectNameChunked.join('_')}";
CREATE TEMPORARY TABLE "AllPseudos_${projectNameChunked.join('_')}" AS
(
SELECT DISTINCT "GmPseudo" FROM ${
config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES
}."${tableNameNoQuotes}_WITH_PSEUDO_IDS"
UNION
SELECT DISTINCT "MainCohortMatchedGmPseudo" FROM ${
config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES
}."${tableNameNoQuotes}_WITH_PSEUDO_IDS"
)
EXCEPT
SELECT "GmPseudo" FROM "Patient_ID_Mapping_${projectNameChunked.join('_')}";
-- Find the highest currently assigned id. Ids are given incrementally, so now ones
-- need to start at +1 of the current highest
SET highestPatientId = (
SELECT IFNULL(MAX("StudyPatientPseudoId"),0) FROM "Patient_ID_Mapping_${projectNameChunked.join(
'_'
)}("GmPseudo") AS "PatientID", * EXCLUDE "GmPseudo"
)}"
);
-- Make a study specific hash for each new GmPseudo and insert it
-- into the patient lookup table
INSERT INTO "Patient_ID_Mapping_${projectNameChunked.join('_')}"
SELECT
"GmPseudo", -- the GM SDE patient ids for patients in this cohort
SHA2(CONCAT('${projectNameChunked.join(
'_'
)}', "GmPseudo")) AS "Hash", -- used to provide a random (study-specific) ordering for the patient ids we provide
$highestPatientId + ROW_NUMBER() OVER (ORDER BY "Hash") -- the patient id that we provide to the analysts
FROM "AllPseudos_${projectNameChunked.join('_')}";
-- Finally, we select from the output table which includes the GmPseudos, in order
-- to populate the table for the end users where the GmPseudo fields are redacted via a function
-- created in the 0.code-sets.sql file
DROP TABLE IF EXISTS ${config.PROJECT_SPECIFIC_SCHEMA_FOR_DATA}.${tableName};
CREATE TABLE ${config.PROJECT_SPECIFIC_SCHEMA_FOR_DATA}.${tableName} AS
SELECT ${config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES}.gm_pseudo_hash_${projectNameChunked.join('_')}("GmPseudo") AS "PatientID",
${config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES}.gm_pseudo_hash_${projectNameChunked.join('_')}("MainCohortMatchedGmPseudo") AS "MainCohortMatchedPatientID",
* EXCLUDE ("GmPseudo", "MainCohortMatchedGmPseudo")
FROM ${config.PROJECT_SPECIFIC_SCHEMA_PRIVATE_TO_RDES}."${tableNameNoQuotes}_WITH_PSEUDO_IDS";`;

sql =
sql.substring(0, indexOfOutputTable) +
replacedSql +
finalSql +
sql.substring(indexOfFinalSemiColon + 1);
} else if (outputTableNoIdsMatch) {

} else if (outputTableNoIdsMatch) {
isMatch = true;
let needSemiColon = false;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
--┌────────────────────────────────────────────────────┐
--│ Cohort matching on year of birth / sex / ethnicity │
--└────────────────────────────────────────────────────┘

-- OBJECTIVE: To take a primary cohort and find a 1:n matched cohort based on year of birth, sex and ethnic category.

-- INPUT: Takes two parameters
-- - yob-flex: integer - number of years each way that still allow a year of birth match
-- - num-matches: integer - number of matches for each patient in the cohort
-- Requires two temp tables to exist as follows:
-- MainCohort ("GmPseudo", Sex, YearOfBirth, EthnicCategory)
-- - "GmPseudo" - unique patient id
-- - Sex - M/F
-- - YearOfBirth - Integer
-- - EthnicCategory - ethnic category of the patient
-- PotentialMatches ("GmPseudo", Sex, YearOfBirth, EthnicCategory)
-- - "GmPseudo" - unique patient id
-- - Sex - M/F
-- - YearOfBirth - Integer
-- - EthnicCategory - ethnic category of the patient

-- OUTPUT: A temp table as follows:
-- CohortStore (PatientId, YearOfBirth, Sex, EthnicCategory, MatchingPatientId, MatchingYearOfBirth)
-- - PatientId - unique patient id ("GmPseudo") of the primary cohort patient
-- - YearOfBirth - of the primary cohort patient
-- - Sex - of the primary cohort patient
-- - EthnicCategory - of the primary cohort patient
-- - MatchingPatientId - id ("GmPseudo") of the matched patient
-- - MatchingYearOfBirth - year of birth of the matched patient

-- TODO
-- A few things to consider when doing matching:
-- - Consider removing "ghost patients" e.g. people without a primary care record
-- - Consider matching on practice. Patients in different locations might have different outcomes. Also
-- for primary care based diagnosing, practices might have different thoughts on severity, timing etc.
-- - For instances where lots of cases have no matches, consider allowing matching to occur with replacement.
-- I.e. a patient can match more than one person in the main cohort.

-- Copy MainCohort into a working table (so MainCohort itself is left untouched)
-- and number the patients within each YearOfBirth / Sex / EthnicCategory group.
-- CaseRowNumber runs 1..N inside each group, ordered by "GmPseudo".

DROP TABLE IF EXISTS Cases;
CREATE TEMPORARY TABLE Cases AS
SELECT
    mc."GmPseudo" AS PatientId,
    mc.YearOfBirth,
    mc.Sex,
    mc.EthnicCategory,
    ROW_NUMBER() OVER (
        PARTITION BY mc.YearOfBirth, mc.Sex, mc.EthnicCategory
        ORDER BY mc."GmPseudo"
    ) AS CaseRowNumber
FROM MainCohort mc;


-- Number the PotentialMatches table in the same way: AssignedPersonNumber runs
-- 1..M inside each YearOfBirth / Sex / EthnicCategory group, ordered by "GmPseudo".
DROP TABLE IF EXISTS Matches;
CREATE TEMPORARY TABLE Matches AS
SELECT
    pm."GmPseudo" AS PatientId,
    pm.YearOfBirth,
    pm.Sex,
    pm.EthnicCategory,
    ROW_NUMBER() OVER (
        PARTITION BY pm.YearOfBirth, pm.Sex, pm.EthnicCategory
        ORDER BY pm."GmPseudo"
    ) AS AssignedPersonNumber
FROM PotentialMatches pm;

-- Count how many main-cohort patients share each
-- YearOfBirth / Sex / EthnicCategory combination.
DROP TABLE IF EXISTS CharacteristicCount;
CREATE TEMPORARY TABLE CharacteristicCount AS
SELECT
    cs.YearOfBirth,
    cs.Sex,
    cs.EthnicCategory,
    COUNT(*) AS "Count"
FROM Cases cs
GROUP BY cs.YearOfBirth, cs.Sex, cs.EthnicCategory;

-- Optional diagnostic (kept commented out): number of potential matches available
-- per cohort patient for each characteristic combination. Useful for seeing how
-- many matches you can realistically ask for.
-- SELECT A.YearOfBirth, A.Sex, A.EthnicCategory, B."Count" / A."Count" AS NumberOfPotentialMatchesPerCohortPatient
-- FROM CharacteristicCount A
-- LEFT OUTER JOIN (SELECT YearOfBirth, Sex, EthnicCategory, COUNT(*) AS "Count" FROM Matches GROUP BY YearOfBirth, Sex, EthnicCategory) B
--   ON B.YearOfBirth = A.YearOfBirth AND B.Sex = A.Sex AND B.EthnicCategory = A.EthnicCategory
-- ORDER BY NumberOfPotentialMatchesPerCohortPatient, A.YearOfBirth, A.Sex;

-- Output table: one row per (cohort patient, matched patient) pair.
-- E.g. a cohort patient with 4 matches appears in 4 rows.
DROP TABLE IF EXISTS CohortStore;
CREATE TEMPORARY TABLE CohortStore (
    PatientId           BIGINT,       -- primary cohort patient
    YearOfBirth         INT,          -- of the primary cohort patient
    Sex                 NCHAR(1),     -- of the primary cohort patient
    EthnicCategory      VARCHAR(50),  -- of the primary cohort patient
    MatchingPatientId   BIGINT,       -- the matched patient
    MatchingYearOfBirth INT           -- of the matched patient
);

--1. Exact matching pass. For every YOB/Sex/EthnicCategory combination
--   (e.g. sex='F', yob=1957, EthnicCategory='White British') the potential
--   matches with exactly the same characteristics are handed out round by round:
--   - round 1 gives every cohort member in the group their first match,
--   - round 2 gives every cohort member a second match from the unused ones,
--   - ... and so on, until {param:num-matches} rounds have run. Groups that run
--     out of potential matches simply stop receiving rows.

DECLARE
    matchRound INT;

BEGIN
    matchRound := 1;

    WHILE (matchRound <= {param:num-matches}) DO

        INSERT INTO CohortStore
        SELECT
            cs.PatientId,
            cs.YearOfBirth,
            cs.Sex,
            cs.EthnicCategory,
            pm.PatientId AS MatchedPatientId,
            cs.YearOfBirth -- exact match, so the match has the same year of birth
        FROM Cases cs
        INNER JOIN CharacteristicCount grp
            ON grp.YearOfBirth = cs.YearOfBirth
            AND grp.Sex = cs.Sex
            AND grp.EthnicCategory = cs.EthnicCategory
        INNER JOIN Matches pm
            ON pm.Sex = cs.Sex
            AND pm.YearOfBirth = cs.YearOfBirth
            AND pm.EthnicCategory = cs.EthnicCategory
            -- The trick that uses each potential match only once: in round k, case
            -- number r of a group of size N takes potential match number r + (k-1)*N.
            AND pm.AssignedPersonNumber = cs.CaseRowNumber + (:matchRound - 1) * grp."Count";

        -- Probably not needed given the numbering above, but to be extra sure
        -- remove every potential match that has already been used.
        DELETE FROM Matches WHERE PatientId IN (SELECT MatchingPatientId FROM CohortStore);

        matchRound := matchRound + 1;

    END WHILE;

END;

--2. Now relax the yob restriction to get extra matches for people with no matches.
--   A potential match qualifies if Sex and EthnicCategory are equal and its
--   YearOfBirth is within +/- {param:yob-flex} years of the cohort patient.
--   The pass is repeated until it stops adding rows to CohortStore.

DECLARE
    lastrowinsert1 INT;          -- rows added to CohortStore by the most recent pass
    CohortStoreRowsAtStart1 INT; -- CohortStore row count before the current pass

BEGIN
    -- Start above zero so the loop body runs at least once
    lastrowinsert1 := 1;

    WHILE (lastrowinsert1 > 0) DO
        CohortStoreRowsAtStart1 := (SELECT COUNT(*) FROM CohortStore);

        INSERT INTO CohortStore
        SELECT sub.PatientId, sub.YearOfBirth, sub.Sex, sub.EthnicCategory, MatchedPatientId, MAX(m.YearOfBirth) FROM (
            -- One row per (unmatched cohort patient, candidate match) pair. The row number is
            -- partitioned by the candidate, so keeping AssignedPersonNumber = 1 below offers
            -- each candidate to a single cohort patient.
            SELECT c.PatientId, c.YearOfBirth, c.Sex, c.EthnicCategory, MAX(p.PatientId) AS MatchedPatientId, Row_Number() OVER(PARTITION BY MAX(p.PatientId) ORDER BY p.PatientId) AS AssignedPersonNumber
            FROM Cases c
            INNER JOIN Matches p
                ON p.Sex = c.Sex
                AND p.EthnicCategory = c.EthnicCategory
                AND p.YearOfBirth >= c.YearOfBirth - {param:yob-flex}
                AND p.YearOfBirth <= c.YearOfBirth + {param:yob-flex}
            WHERE c.PatientId in (
                -- find patients who aren't currently matched
                select PatientId from Cases except select PatientId from CohortStore
            )
            GROUP BY c.PatientId, c.YearOfBirth, c.Sex, c.EthnicCategory, p.PatientId) sub
        INNER JOIN Matches m
            ON m.Sex = sub.Sex
            AND m.EthnicCategory = sub.EthnicCategory
            AND m.PatientId = sub.MatchedPatientId
            AND m.YearOfBirth >= sub.YearOfBirth - {param:yob-flex}
            AND m.YearOfBirth <= sub.YearOfBirth + {param:yob-flex}
        WHERE sub.AssignedPersonNumber = 1
        GROUP BY sub.PatientId, sub.YearOfBirth, sub.Sex, sub.EthnicCategory, MatchedPatientId;

        -- Rows added by this pass = row count now minus row count at the start.
        -- (Previously the operands were the other way round, which is never positive
        -- after an INSERT, so the loop always stopped after a single pass.)
        lastrowinsert1 := (SELECT COUNT(*) FROM CohortStore) - CohortStoreRowsAtStart1;

        -- Used matches must not be handed out again
        DELETE FROM Matches WHERE PatientId IN (SELECT MatchingPatientId FROM CohortStore);

    END WHILE;

END;

--3. Now relax the yob restriction to get extra matches for people with only 1, 2, 3, ... n-1 matches.
--   The outer loop walks Counter2 = 1 .. {param:num-matches} - 1. For each value the
--   inner loop repeatedly gives one additional match to cohort patients who currently
--   have exactly Counter2 matches, until a pass adds no rows to CohortStore.

DECLARE
    Counter2 INT;               -- number of matches a cohort patient currently has
    CohortStoreRowsAtStart INT; -- CohortStore row count before the current inner pass
    LastRowInsert INT;          -- rows added to CohortStore by the most recent inner pass

BEGIN
    Counter2 := 1;

    WHILE (Counter2 < {param:num-matches}) DO
        -- Start above zero so the inner loop runs at least once per Counter2 value
        LastRowInsert := 1;

        WHILE (LastRowInsert > 0) DO
            CohortStoreRowsAtStart := (SELECT COUNT(*) FROM CohortStore);

            -- Pair every remaining potential match with the eligible cohort patients and
            -- number the pairs per potential match; keeping number 1 (below) assigns each
            -- potential match to a single cohort patient.
            DROP TABLE IF EXISTS CohortPatientForEachMatchingPatient;
            CREATE TEMPORARY TABLE CohortPatientForEachMatchingPatient AS
            SELECT p.PatientId AS MatchedPatientId, c.PatientId, Row_Number() OVER(PARTITION BY p.PatientId ORDER BY p.PatientId) AS MatchedPatientNumber
            FROM Matches p
            INNER JOIN Cases c
                ON p.Sex = c.Sex
                AND p.EthnicCategory = c.EthnicCategory
                AND p.YearOfBirth >= c.YearOfBirth - {param:yob-flex}
                AND p.YearOfBirth <= c.YearOfBirth + {param:yob-flex}
            WHERE c.PatientId IN (
                -- find patients who currently have exactly :Counter2 matches
                SELECT PatientId FROM CohortStore GROUP BY PatientId HAVING count(*) = :Counter2
            );

            -- Number the surviving pairs per cohort patient so that each cohort patient
            -- receives just one new match in this pass (PatientNumber = 1 below).
            DROP TABLE IF EXISTS CohortPatientForEachMatchingPatientWithCohortNumbered;
            CREATE TEMPORARY TABLE CohortPatientForEachMatchingPatientWithCohortNumbered AS
            SELECT PatientId, MatchedPatientId, Row_Number() OVER(PARTITION BY PatientId ORDER BY MatchedPatientId) AS PatientNumber
            FROM CohortPatientForEachMatchingPatient
            WHERE MatchedPatientNumber = 1;

            INSERT INTO CohortStore
            SELECT s.PatientId, c.YearOfBirth, c.Sex, c.EthnicCategory, MatchedPatientId, m.YearOfBirth FROM CohortPatientForEachMatchingPatientWithCohortNumbered s
            LEFT OUTER JOIN Cases c ON c.PatientId = s.PatientId
            LEFT OUTER JOIN Matches m ON m.PatientId = MatchedPatientId
            WHERE PatientNumber = 1;

            -- Rows added by this pass = row count now minus row count at the start.
            -- (Previously the operands were the other way round, which is never positive
            -- after an INSERT, so the inner loop always stopped after a single pass.)
            LastRowInsert := (SELECT COUNT(*) FROM CohortStore) - CohortStoreRowsAtStart;

            -- Used matches must not be handed out again
            DELETE FROM Matches WHERE PatientId IN (SELECT MatchingPatientId FROM CohortStore);

        END WHILE;

        Counter2 := Counter2 + 1;
    END WHILE;
END;
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@

--┌─────────────────────────────────────────────────────────────────┐
--│ Create table of patients who were alive at the study start date │
--└─────────────────────────────────────────────────────────────────┘

-- ** any patients opted out of sharing GP data would not appear in the final table

-- this script requires an input of StudyStartDate

-- takes one parameter:
-- minimum-age : integer - The minimum age of the group of patients. Typically this would be 0 (all patients) or 18 (all adults)

-- ALL DEATHS
-- One row per death record, with the registered date of death and the details of
-- the diagnosis "original mention" numbered 1 (where one exists).

DROP TABLE IF EXISTS Death;
CREATE TEMPORARY TABLE Death AS
SELECT
    pcmd."GmPseudo",
    TO_DATE(pcmd."RegisteredDateOfDeath") AS DeathDate,
    mention."DiagnosisOriginalMentionCode",
    mention."DiagnosisOriginalMentionDesc",
    mention."DiagnosisOriginalMentionChapterCode",
    mention."DiagnosisOriginalMentionChapterDesc",
    mention."DiagnosisOriginalMentionCategory1Code",
    mention."DiagnosisOriginalMentionCategory1Desc"
FROM PRESENTATION.NATIONAL_FLOWS_PCMD."DS1804_Pcmd" pcmd
LEFT JOIN PRESENTATION.NATIONAL_FLOWS_PCMD."DS1804_PcmdDiagnosisOriginalMentions" mention
    ON mention."XSeqNo" = pcmd."XSeqNo"
    AND mention."DiagnosisOriginalMentionNumber" = 1;

-- GET LATEST SNAPSHOT OF DEMOGRAPHICS TABLE
-- For each patient keep the demographics row(s) from their most recent "Snapshot",
-- considering only snapshots where the patient meets the minimum-age threshold
-- at $StudyStartDate.

DROP TABLE IF EXISTS LatestSnapshot;
CREATE TEMPORARY TABLE LatestSnapshot AS
SELECT demo.*
FROM PRESENTATION.GP_RECORD."DemographicsProtectedCharacteristics_SecondaryUses" demo
INNER JOIN (
    SELECT
        src."GmPseudo",
        MAX(src."Snapshot") AS LatestSnapshot
    FROM PRESENTATION.GP_RECORD."DemographicsProtectedCharacteristics_SecondaryUses" src
    -- Age filter: at least {param:minimum-age} (0 = everyone, 18 = adults only).
    -- NOTE(review): DATEDIFF(YEAR, ...) counts calendar-year boundaries rather than
    -- completed years, so it can overstate age by up to a year - confirm this is acceptable.
    WHERE DATEDIFF(YEAR, TO_DATE(src."DateOfBirth"), $StudyStartDate) >= {param:minimum-age}
    GROUP BY src."GmPseudo"
) latest
    ON latest."GmPseudo" = demo."GmPseudo"
    AND latest.LatestSnapshot = demo."Snapshot";

-- FIND ALL PATIENTS (MEETING THE MINIMUM AGE) WHO WERE ALIVE AT STUDY START DATE
-- Keeps patients with no death record, or whose death date is after $StudyStartDate.

DROP TABLE IF EXISTS AlivePatientsAtStart;
CREATE TEMPORARY TABLE AlivePatientsAtStart AS
SELECT
    snap.*,
    dth.DeathDate
FROM LatestSnapshot snap
LEFT JOIN Death dth
    ON dth."GmPseudo" = snap."GmPseudo"
WHERE dth.DeathDate IS NULL          -- no recorded death
    OR dth.DeathDate > $StudyStartDate; -- died after the study start date

0 comments on commit d0032cf

Please sign in to comment.