Merge pull request #245 from UoM-Data-Science-Platforms/Update-LH003-…

…scripts-Mai Update lh003 scripts
UoM-Data-Science-Platforms · Nov 11, 2024 · 5cdc62e · 5cdc62e
2 parents 58c5386 + 7c0b7aa
commit 5cdc62e
Show file tree

Hide file tree

Showing 15 changed files with 323 additions and 155 deletions.
diff --git a/projects/SDE Lighthouse 03 - Kontopantelis/README.html b/projects/SDE Lighthouse 03 - Kontopantelis/README.html
@@ -123,7 +123,22 @@ <h2>Methodology</h2>
 The RDE has access to a library of reusable SQL queries for common tasks, and sets of clinical codes for different phenotypes, built up from previous studies.
 Prior to data extraction, the code is checked and signed off by another RDE.</p>
 <h2>Reusable queries</h2>
-<p>This project did not require any reusable queries from the local library <a href="https://github.com/rw251/gm-idcr/tree/master/shared/Reusable%20queries%20for%20data%20extraction">https://github.com/rw251/gm-idcr/tree/master/shared/Reusable queries for data extraction</a>.## Clinical code sets</p>
+<p>This project required the following reusable queries:</p>
+<ul>
+<li>Create table of patients who were alive at the study start date</li>
+</ul>
+<p>Further details for each query can be found below.</p>
+<h3>Create table of patients who were alive at the study start date</h3>
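+<p>Creates a temporary table, AlivePatientsAtStart, of patients of at least the minimum age who were alive at the study start date and who did not leave GM (i.e. whose GP data feed did not stop) before the study end date.</p>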
+<p><em>Input</em></p>
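+<pre><code>Requires the StudyStartDate and StudyEndDate variables to be set. Takes one parameter: minimum-age (integer), typically 0 (all patients) or 18 (all adults).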
+</code></pre>
+<p><em>Output</em></p>
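+<pre><code>A temp table: AlivePatientsAtStart - the latest demographics snapshot for each patient, plus DeathDate and leftGMDate.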
+</code></pre>
+<p><em>File</em>: <code>query-get-possible-patients.sql</code></p>
+<p><em>Link</em>: <a href="https://github.com/rw251/gm-idcr/tree/master/shared/Reusable%20queries%20for%20data%20extraction/query-get-possible-patients.sql">https://github.com/rw251/.../query-get-possible-patients.sql</a></p>
+<h2>Clinical code sets</h2>
 <p>This project required the following clinical code sets:</p>
 <ul>
 <li>delirium v1</li>

diff --git a/projects/SDE Lighthouse 03 - Kontopantelis/README.md b/projects/SDE Lighthouse 03 - Kontopantelis/README.md
@@ -37,7 +37,28 @@ Prior to data extraction, the code is checked and signed off by another RDE.
 
 ## Reusable queries
 
-This project did not require any reusable queries from the local library [https://github.com/rw251/gm-idcr/tree/master/shared/Reusable queries for data extraction](https://github.com/rw251/gm-idcr/tree/master/shared/Reusable%20queries%20for%20data%20extraction).## Clinical code sets
+This project required the following reusable queries:
+
+- Create table of patients who were alive at the study start date
+
+Further details for each query can be found below.
+
+### Create table of patients who were alive at the study start date
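+Creates a temporary table, AlivePatientsAtStart, of patients of at least the minimum age who were alive at the study start date and who did not leave GM (i.e. whose GP data feed did not stop) before the study end date.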
+
+_Input_
+```
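+Requires the StudyStartDate and StudyEndDate variables to be set. Takes one parameter: minimum-age (integer), typically 0 (all patients) or 18 (all adults).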
+```
+
+_Output_
+```
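+A temp table: AlivePatientsAtStart - the latest demographics snapshot for each patient, plus DeathDate and leftGMDate.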
+```
+_File_: `query-get-possible-patients.sql`
+
+_Link_: [https://github.com/rw251/.../query-get-possible-patients.sql](https://github.com/rw251/gm-idcr/tree/master/shared/Reusable%20queries%20for%20data%20extraction/query-get-possible-patients.sql)
+## Clinical code sets
 
 This project required the following clinical code sets:
 

diff --git a/projects/SDE Lighthouse 03 - Kontopantelis/extraction-sql/1.patients.sql b/projects/SDE Lighthouse 03 - Kontopantelis/extraction-sql/1.patients.sql
@@ -10,24 +10,112 @@ USE SCHEMA SDE_REPOSITORY.SHARED_UTILITIES;
 --		- Sex
 --		- YearOfBirth
 --		- Ethnicity
---		- YearAndMonthOfDeath
+--		- EthnicityCategory
+--		- IMD2019Decile1IsMostDeprived10IsLeastDeprived
+--		- FirstDementiaDate
+--		- DeathYearAndMonth
 
 -- NB1 PI did not request date of dementia diagnosis, but it seems likely
 -- that they will need it, so including as well.
 
 -- NB2 Date of death was requested in a separate file, but including it here
 -- for brevity, and because it has a one-to-one relationship with patient.
 
+-- Study window; referenced further down as $StudyStartDate and $StudyEndDate.
+SET StudyStartDate = TO_DATE('2006-01-01');
+SET StudyEndDate = TO_DATE('2024-06-30');
+
+
+--┌─────────────────────────────────────────────────────────────────┐
+--│ Create table of patients who were alive at the study start date │
+--└─────────────────────────────────────────────────────────────────┘
+
+-- ** any patients opted out of sharing GP data would not appear in the final table
+
+-- this script requires StudyStartDate and StudyEndDate to be set before it runs
+
+-- takes one parameter: 
+-- minimum-age : integer - The minimum age of the group of patients. Typically this would be 0 (all patients) or 18 (all adults). Fixed at 18 in this script.
+
+-- ALL DEATHS
+-- Registered date of death for each PCMD record, joined to the original-mention
+-- diagnosis numbered 1 (the diagnosis columns are NULL when there is none).
+
+DROP TABLE IF EXISTS Death;
+CREATE TEMPORARY TABLE Death AS
+SELECT
+    pcmd."GmPseudo",
+    TO_DATE(pcmd."RegisteredDateOfDeath") AS DeathDate,
+    mention."DiagnosisOriginalMentionCode",
+    mention."DiagnosisOriginalMentionDesc",
+    mention."DiagnosisOriginalMentionChapterCode",
+    mention."DiagnosisOriginalMentionChapterDesc",
+    mention."DiagnosisOriginalMentionCategory1Code",
+    mention."DiagnosisOriginalMentionCategory1Desc"
+FROM PRESENTATION.NATIONAL_FLOWS_PCMD."DS1804_Pcmd" AS pcmd
+LEFT JOIN PRESENTATION.NATIONAL_FLOWS_PCMD."DS1804_PcmdDiagnosisOriginalMentions" AS mention
+    ON mention."XSeqNo" = pcmd."XSeqNo"
+    AND mention."DiagnosisOriginalMentionNumber" = 1;
+
+-- GET LATEST SNAPSHOT OF DEMOGRAPHICS TABLE
+-- Keeps, for each GmPseudo, the demographics row(s) from that patient's most
+-- recent snapshot, restricted to patients aged 18 or over on $StudyStartDate.
+
+DROP TABLE IF EXISTS LatestSnapshot;
+CREATE TEMPORARY TABLE LatestSnapshot AS
+SELECT
+    p.*
+FROM PRESENTATION.GP_RECORD."DemographicsProtectedCharacteristics_SecondaryUses" p
+INNER JOIN (
+    SELECT adults."GmPseudo", MAX(adults."Snapshot") AS LatestSnapshot
+    FROM PRESENTATION.GP_RECORD."DemographicsProtectedCharacteristics_SecondaryUses" adults
+    -- Adults only: the 18th birthday must fall on or before the study start date.
+    -- DATEDIFF(YEAR, ...) is deliberately not used here: it counts calendar-year
+    -- boundaries crossed, so it returns 18 for someone who is still 17 at study start.
+    WHERE DATEADD(YEAR, 18, TO_DATE(adults."DateOfBirth")) <= $StudyStartDate
+    GROUP BY adults."GmPseudo"
+    ) latest
+ON latest."GmPseudo" = p."GmPseudo" AND latest.LatestSnapshot = p."Snapshot";
+
+-- CREATE A PATIENT SUMMARY TABLE TO WORK OUT WHICH PATIENTS HAVE LEFT GM
+-- AND THEREFORE THEIR DATA FEED STOPPED
+-- One row per GmPseudo: the earliest ("min") and latest ("max") snapshot in the
+-- demographics table, plus the latest matching date of death, if any.
+
+DROP TABLE IF EXISTS PatientSummary;
+CREATE TEMPORARY TABLE PatientSummary AS
+SELECT
+    demo."GmPseudo",
+    MIN(demo."Snapshot") AS "min",
+    MAX(demo."Snapshot") AS "max",
+    MAX(Death.DeathDate) AS DeathDate
+FROM PRESENTATION.GP_RECORD."DemographicsProtectedCharacteristics_SecondaryUses" AS demo
+LEFT JOIN Death
+    ON Death."GmPseudo" = demo."GmPseudo"
+GROUP BY demo."GmPseudo";
+
+-- FIND THE DATE THAT PATIENT LEFT GM
+-- A patient with no recorded death whose last snapshot is earlier than the
+-- latest snapshot seen for anyone is treated as having left GM on that last
+-- snapshot; everyone else gets a NULL "leftGMDate".
+
+DROP TABLE IF EXISTS leftGMDate;
+CREATE TEMPORARY TABLE leftGMDate AS
+WITH LatestFeed AS (
+    -- single row holding the most recent snapshot across all patients
+    SELECT MAX("max") AS "latest"
+    FROM PatientSummary
+)
+SELECT
+    ps.*,
+    CASE
+        WHEN ps.DeathDate IS NULL AND ps."max" < lf."latest" THEN ps."max"
+    END AS "leftGMDate"
+FROM PatientSummary AS ps
+CROSS JOIN LatestFeed AS lf;
+
+-- FIND ALL ADULT PATIENTS ALIVE AT STUDY START DATE
+-- Starts from LatestSnapshot (already restricted to adults) and attaches each
+-- patient's date of death and the date they left GM, where known.
+
+DROP TABLE IF EXISTS AlivePatientsAtStart;
+CREATE TEMPORARY TABLE AlivePatientsAtStart AS
+SELECT
+    snap.*,
+    died.DeathDate AS "DeathDate",
+    gone."leftGMDate"
+FROM LatestSnapshot AS snap
+LEFT JOIN leftGMDate AS gone
+    ON gone."GmPseudo" = snap."GmPseudo"
+LEFT JOIN Death AS died
+    ON died."GmPseudo" = snap."GmPseudo"
+WHERE
+    -- alive on study start date
+    (died.DeathDate IS NULL OR died.DeathDate > $StudyStartDate)
+    -- if patient left GM (therefore we stop receiving their data), ensure it is after study end date
+    AND (gone."leftGMDate" IS NULL OR gone."leftGMDate" > $StudyEndDate);
+
+
 DROP TABLE IF EXISTS SDE_REPOSITORY.SHARED_UTILITIES."Cohort_SDE_Lighthouse_03_Kontopantelis";
 CREATE TABLE SDE_REPOSITORY.SHARED_UTILITIES."Cohort_SDE_Lighthouse_03_Kontopantelis" (
 	"GmPseudo" NUMBER(38,0),
 	"FK_Patient_ID" NUMBER(38,0),
 	"FirstDementiaDate" DATE
 ) AS
 SELECT "GmPseudo", "FK_Patient_ID", MIN("Dementia_DiagnosisDate") AS FirstDementiaDate
-FROM PRESENTATION.GP_RECORD."LongTermConditionRegister_SecondaryUses"
+FROM INTERMEDIATE.GP_RECORD."LongTermConditionRegister_SecondaryUses"
 WHERE "Dementia_DiagnosisDate" IS NOT NULL
-AND "Age" >= 18
+AND "FK_Patient_ID" IN (SELECT "FK_Patient_ID" FROM AlivePatientsAtStart)
 GROUP BY "GmPseudo", "FK_Patient_ID";
 
 
@@ -47,13 +135,11 @@ SELECT
 	"EthnicityLatest_Category" AS "EthnicityCategory",
 	"IMD_Decile" AS "IMD2019Decile1IsMostDeprived10IsLeastDeprived",
 	"FirstDementiaDate",
-	CAST("RegisteredDateOfDeath" AS DATE) AS "RegisteredDateOfDeath"
+	DATE_TRUNC(month, alive."DeathDate") AS "DeathYearAndMonth"
 FROM SDE_REPOSITORY.SHARED_UTILITIES."Cohort_SDE_Lighthouse_03_Kontopantelis" cohort
-LEFT OUTER JOIN PRESENTATION.GP_RECORD."DemographicsProtectedCharacteristics_SecondaryUses" demo
-	ON demo."GmPseudo" = cohort."GmPseudo"
-LEFT OUTER JOIN PRESENTATION.NATIONAL_FLOWS_PCMD."DS1804_Pcmd" mortality
-	ON mortality."GmPseudo" = cohort."GmPseudo"
-QUALIFY row_number() OVER (PARTITION BY demo."GmPseudo" ORDER BY "Snapshot" DESC) = 1;
+LEFT OUTER JOIN AlivePatientsAtStart alive
+	ON alive."GmPseudo" = cohort."GmPseudo"
+QUALIFY row_number() OVER (PARTITION BY alive."GmPseudo" ORDER BY "Snapshot" DESC) = 1;
 
 -- Then we check to see if there are any new GmPseudo ids. We do this by making a temp table 
 -- of all "new" GmPseudo ids. I.e. any GmPseudo ids that we've already got a unique id for
@@ -84,5 +170,6 @@ FROM "AllPseudos_SDE_Lighthouse_03_Kontopantelis";
 -- created in the 0.code-sets.sql file
 DROP TABLE IF EXISTS SDE_REPOSITORY.SHARED_UTILITIES."LH003-1_Patients";
 CREATE TABLE SDE_REPOSITORY.SHARED_UTILITIES."LH003-1_Patients" AS
-SELECT SDE_REPOSITORY.SHARED_UTILITIES.gm_pseudo_hash_SDE_Lighthouse_03_Kontopantelis("GmPseudo") AS "PatientID", * EXCLUDE "GmPseudo"
+SELECT SDE_REPOSITORY.SHARED_UTILITIES.gm_pseudo_hash_SDE_Lighthouse_03_Kontopantelis("GmPseudo") AS "PatientID",
+	* EXCLUDE "GmPseudo"
 FROM SDE_REPOSITORY.SHARED_UTILITIES."LH003-1_Patients_WITH_PSEUDO_IDS";
diff --git a/projects/SDE Lighthouse 03 - Kontopantelis/extraction-sql/2a.lifestyle-bmi.sql b/projects/SDE Lighthouse 03 - Kontopantelis/extraction-sql/2a.lifestyle-bmi.sql
@@ -7,10 +7,9 @@ USE SCHEMA SDE_REPOSITORY.SHARED_UTILITIES;
 -- From application:
 --	Table 2: Lifestyle factors (from 2006 to present)
 --		- PatientID
---		- TestName ( smoking status, BMI, alcohol consumption)
 --		- TestDate
 --		- TestResult
---		- TestUnit
+
 
 
 -- ... processing [[create-output-table::"LH003-2a_Lifestyl_BMI"]] ... 
@@ -25,7 +24,7 @@ SELECT
   "GmPseudo",
 	"EventDate" AS "TestDate",
 	"BMI" AS "TestResult"
-FROM INTERMEDIATE.GP_RECORD."Readings_BMI"
+FROM INTERMEDIATE.GP_RECORD."Readings_BMI_SecondaryUses"
 WHERE "GmPseudo" IN (SELECT "GmPseudo" FROM SDE_REPOSITORY.SHARED_UTILITIES."Cohort_SDE_Lighthouse_03_Kontopantelis")
 AND YEAR("EventDate") >= 2006;
 
@@ -58,5 +57,6 @@ FROM "AllPseudos_SDE_Lighthouse_03_Kontopantelis";
 -- created in the 0.code-sets.sql file
 DROP TABLE IF EXISTS SDE_REPOSITORY.SHARED_UTILITIES."LH003-2a_Lifestyl_BMI";
 CREATE TABLE SDE_REPOSITORY.SHARED_UTILITIES."LH003-2a_Lifestyl_BMI" AS
-SELECT SDE_REPOSITORY.SHARED_UTILITIES.gm_pseudo_hash_SDE_Lighthouse_03_Kontopantelis("GmPseudo") AS "PatientID", * EXCLUDE "GmPseudo"
+SELECT SDE_REPOSITORY.SHARED_UTILITIES.gm_pseudo_hash_SDE_Lighthouse_03_Kontopantelis("GmPseudo") AS "PatientID",
+	* EXCLUDE "GmPseudo"
 FROM SDE_REPOSITORY.SHARED_UTILITIES."LH003-2a_Lifestyl_BMI_WITH_PSEUDO_IDS";
diff --git a/projects/SDE Lighthouse 03 - Kontopantelis/extraction-sql/2b.lifestyle-alcohol-smoking.sql b/projects/SDE Lighthouse 03 - Kontopantelis/extraction-sql/2b.lifestyle-alcohol-smoking.sql
@@ -7,10 +7,13 @@ USE SCHEMA SDE_REPOSITORY.SHARED_UTILITIES;
 -- From application:
 --	Table 2: Lifestyle factors (from 2006 to present)
 --		- PatientID
---		- TestName ( smoking status, BMI, alcohol consumption)
+--		- TestName ( Alcohol, Smoking)
 --		- TestDate
+--		- Description
 --		- TestResult
---		- TestUnit
+--		- TestUnits
+--		- Status
+--		- Consumption
 
 -- NB1 - I'm only restricting BMI values to 2006 to present.
 -- NB2 - The PI confirmed that instead of raw values of when statuses were
@@ -35,7 +38,7 @@ SELECT
 	"Units" AS "TestUnits",
 	"AlcoholStatus" AS "Status",
 	"AlcoholConsumption" AS "Consumption"
-FROM INTERMEDIATE.GP_RECORD."Readings_Alcohol"
+FROM INTERMEDIATE.GP_RECORD."Readings_Alcohol_SecondaryUses"
 WHERE "GmPseudo" IN (SELECT "GmPseudo" FROM SDE_REPOSITORY.SHARED_UTILITIES."Cohort_SDE_Lighthouse_03_Kontopantelis")
 UNION
 SELECT
@@ -50,7 +53,7 @@ SELECT
 		WHEN "SmokingConsumption_Date" = "SmokingStatus_Date" THEN "SmokingConsumption"
 		ELSE NULL
 	END -- "Consumption"
-FROM INTERMEDIATE.GP_RECORD."Readings_Smoking"
+FROM INTERMEDIATE.GP_RECORD."Readings_Smoking_SecondaryUses"
 WHERE "GmPseudo" IN (SELECT "GmPseudo" FROM SDE_REPOSITORY.SHARED_UTILITIES."Cohort_SDE_Lighthouse_03_Kontopantelis");
 
 -- Then we check to see if there are any new GmPseudo ids. We do this by making a temp table 
@@ -82,5 +85,6 @@ FROM "AllPseudos_SDE_Lighthouse_03_Kontopantelis";
 -- created in the 0.code-sets.sql file
 DROP TABLE IF EXISTS SDE_REPOSITORY.SHARED_UTILITIES."LH003-2b_Lifestyle_Alcohol_Smoking";
 CREATE TABLE SDE_REPOSITORY.SHARED_UTILITIES."LH003-2b_Lifestyle_Alcohol_Smoking" AS
-SELECT SDE_REPOSITORY.SHARED_UTILITIES.gm_pseudo_hash_SDE_Lighthouse_03_Kontopantelis("GmPseudo") AS "PatientID", * EXCLUDE "GmPseudo"
+SELECT SDE_REPOSITORY.SHARED_UTILITIES.gm_pseudo_hash_SDE_Lighthouse_03_Kontopantelis("GmPseudo") AS "PatientID",
+	* EXCLUDE "GmPseudo"
 FROM SDE_REPOSITORY.SHARED_UTILITIES."LH003-2b_Lifestyle_Alcohol_Smoking_WITH_PSEUDO_IDS";
diff --git a/projects/SDE Lighthouse 03 - Kontopantelis/extraction-sql/3.comorbidities.sql b/projects/SDE Lighthouse 03 - Kontopantelis/extraction-sql/3.comorbidities.sql
@@ -7,10 +7,7 @@ USE SCHEMA SDE_REPOSITORY.SHARED_UTILITIES;
 -- From application:
 --	Table 3: Comorbidities (using full date range available)
 --		- PatientID
---		- Condition
---		- FirstDate
---		- LatestDate
---		- ConditionOccurences (number of times appeared)
+--		- All available comorbidity diagnosis date columns
 
 -- NB1 - just using all the existing comorbidity data in the GP_Record schema.
 -- NB2 - this is not the format initially requested, but likely what the team
@@ -40,10 +37,8 @@ SELECT
 	"NonDiabeticHyperglycemia_DiagnosisDate", "Obesity_DiagnosisDate", "Osteoporosis_DiagnosisDate", "PainfulCondition_DiagnosisDate",
 	"PalliativeCare_DiagnosisDate", "ParkinsonsDisease_DiagnosisDate", "PepticUlcerDisease_DiagnosisDate",
 	"PeripheralArterialDisease_DiagnosisDate", "ProstateDisorder_DiagnosisDate", "Psoriasis_DiagnosisDate",
-	"RheumatoidArthritis_DiagnosisDate", "Stroke_DiagnosisDate", "ThyroidDisorder_DiagnosisDate", "TIA_DiagnosisDate",
-	"FirstLTC", "FirstLTC_DiagnosisDate", "SecondLTC", "SecondLTC_DiagnosisDate", "ThirdLTC",
-	"ThirdLTC_DiagnosisDate", "FourthLTC", "FourthLTC_DiagnosisDate", "FifthLTC", "FifthLTC_DiagnosisDate"
-FROM INTERMEDIATE.GP_RECORD."LongTermConditionRegister_Diagnosis"
+	"RheumatoidArthritis_DiagnosisDate", "Stroke_DiagnosisDate", "ThyroidDisorder_DiagnosisDate", "TIA_DiagnosisDate"
+FROM INTERMEDIATE.GP_RECORD."LongTermConditionRegister_SecondaryUses"
 WHERE "GmPseudo" IN (SELECT "GmPseudo" FROM SDE_REPOSITORY.SHARED_UTILITIES."Cohort_SDE_Lighthouse_03_Kontopantelis")
 QUALIFY row_number() OVER (PARTITION BY "GmPseudo" ORDER BY "Snapshot" DESC) = 1;
 
@@ -76,5 +71,6 @@ FROM "AllPseudos_SDE_Lighthouse_03_Kontopantelis";
 -- created in the 0.code-sets.sql file
 DROP TABLE IF EXISTS SDE_REPOSITORY.SHARED_UTILITIES."LH003-3_Comorbidities";
 CREATE TABLE SDE_REPOSITORY.SHARED_UTILITIES."LH003-3_Comorbidities" AS
-SELECT SDE_REPOSITORY.SHARED_UTILITIES.gm_pseudo_hash_SDE_Lighthouse_03_Kontopantelis("GmPseudo") AS "PatientID", * EXCLUDE "GmPseudo"
+SELECT SDE_REPOSITORY.SHARED_UTILITIES.gm_pseudo_hash_SDE_Lighthouse_03_Kontopantelis("GmPseudo") AS "PatientID",
+	* EXCLUDE "GmPseudo"
 FROM SDE_REPOSITORY.SHARED_UTILITIES."LH003-3_Comorbidities_WITH_PSEUDO_IDS"; -- this brings back the values from the most recent snapshot