Merge pull request #52 from ccb-hms/issue-50

Issue 50
ccb-hms · Jun 29, 2023 · a5b7430 · a5b7430
2 parents e0ad987 + de059f7
commit a5b7430
Show file tree

Hide file tree

Showing 8 changed files with 263 additions and 78 deletions.
diff --git a/Code/R/download.R b/Code/R/download.R
@@ -318,8 +318,12 @@ for (i in i:length(dataTypes)) {
       # change DOUBLE to float
       createTableQuery = gsub(createTableQuery, pattern = "\" DOUBLE", replace = "\" float", fixed = TRUE)
 
-      # we know that SEQN should always be an INT
+      # we know that SEQN and other primary keys should always be an INT
       createTableQuery = gsub(createTableQuery, pattern = "\"SEQN\" float", replace = "\"SEQN\" INT", fixed = TRUE) # nolint
+      createTableQuery = gsub(createTableQuery, pattern = "\"SAMPLEID\" float", replace = "\"SAMPLEID\" INT", fixed = TRUE) # nolint
+      createTableQuery = gsub(createTableQuery, pattern = "\"DRXFDCD\" float", replace = "\"DRXFDCD\" INT", fixed = TRUE) # nolint
+      createTableQuery = gsub(createTableQuery, pattern = "\"DRXMC\" float", replace = "\"DRXMC\" INT", fixed = TRUE) # nolint
+      createTableQuery = gsub(createTableQuery, pattern = "\"POOLID\" float", replace = "\"POOLID\" INT", fixed = TRUE) # nolint
 
       # remove double quotes, which interferes with the schema specification
       createTableQuery = gsub(createTableQuery, pattern = '"', replace = "", fixed = TRUE)
@@ -339,7 +343,7 @@ for (i in i:length(dataTypes)) {
       SqlTools::dbSendUpdate(cn, insertStatement)
 
       indexStatement = paste(sep="",
-        "CREATE CLUSTERED COLUMNSTORE INDEX idxSeqn ON Raw.", currDataType)
+        "CREATE CLUSTERED COLUMNSTORE INDEX ccix ON Raw.", currDataType)
 
       SqlTools::dbSendUpdate(cn, indexStatement)
 

diff --git a/Code/R/translateRawTables.R b/Code/R/translateRawTables.R
@@ -45,3 +45,8 @@ for (i in 1:nrow(tableList)) {
     SqlTools::dbSendUpdate(cn, stmt)
 }
 
+# shrink transaction log
+SqlTools::dbSendUpdate(cn, "DBCC SHRINKFILE(NhanesLandingZone_log)")
+
+# issue checkpoint
+SqlTools::dbSendUpdate(cn, "CHECKPOINT")
diff --git a/Code/Sql/RerunTableTranslateOnMissingTables.sql b/Code/Sql/RerunTableTranslateOnMissingTables.sql
@@ -0,0 +1,42 @@
+/*
+    This is a debugging tool to help identify tables that fail to translate and re-run 
+    their translations once a bugfix has been proposed.
+*/
+
+DROP TABLE IF EXISTS #tmpRawTables
+DROP TABLE IF EXISTS #tmpTranslatedTables
+
+-- loop over all missing tables in the last release
+ SELECT TABLE_NAME
+ INTO #tmpRawTables
+    FROM INFORMATION_SCHEMA.TABLES
+    WHERE TABLE_SCHEMA = 'Raw'
+
+ SELECT TABLE_NAME
+ INTO #tmpTranslatedTables
+    FROM INFORMATION_SCHEMA.TABLES
+    WHERE TABLE_SCHEMA = 'Translated'
+
+DROP TABLE IF EXISTS #tmpDebugTables
+SELECT R.TABLE_NAME INTO #tmpDebugTables FROM #tmpRawTables R LEFT OUTER JOIN #tmpTranslatedTables T ON R.TABLE_NAME = T.TABLE_NAME WHERE T.TABLE_NAME IS NULL ORDER BY R.TABLE_NAME
+
+SELECT * FROM #tmpDebugTables
+
+DECLARE cTableNames CURSOR FOR SELECT * FROM #tmpDebugTables ORDER BY TABLE_NAME
+DECLARE @currTableName varchar(256)
+OPEN cTableNames
+
+FETCH NEXT FROM cTableNames INTO @currTableName
+WHILE @@FETCH_STATUS = 0  
+BEGIN
+    PRINT @currTableName
+    EXECUTE dbo.spTranslateTable
+        'Raw'
+        ,@currTableName
+        ,'Translated'
+        ,@currTableName
+    FETCH NEXT FROM cTableNames INTO @currTableName
+END
+
+CLOSE cTableNames
+DEALLOCATE cTableNames
diff --git a/Code/Sql/RunTableTranslateOnSingleTable.sql b/Code/Sql/RunTableTranslateOnSingleTable.sql
@@ -0,0 +1,24 @@
+-- Manually run spTranslateTable for debugging
+
+DECLARE @RC int
+DECLARE @SourceTableSchema varchar(8000)
+DECLARE @SourceTableName varchar(128)
+DECLARE @DestinationTableSchema varchar(8000)
+DECLARE @DestinationTableName varchar(128)
+
+-- TODO: Set parameter values here.
+-- CAFE_G
+SET @SourceTableSchema  = 'Raw'
+SET @SourceTableName  = 'POOLTF_G'
+SET @DestinationTableSchema  = 'Translated'
+
+EXECUTE @RC = [dbo].[spTranslateTable] 
+   @SourceTableSchema
+  ,@SourceTableName
+  ,@DestinationTableSchema
+  ,@DestinationTableName = @SourceTableName
+
+EXEC ('SELECT TOP 10 * FROM ' + @DestinationTableSchema + '.' + @SourceTableName + ' ORDER BY SEQN')
+EXEC ('SELECT TOP 10 * FROM ' + @SourceTableSchema + '.' + @SourceTableName + ' ORDER BY SEQN')
+
+-- SELECT * FROM Metadata.VariableCodebook V WHERE V.TableName='POOLTF_G'
diff --git a/Code/Sql/spTranslateTable.sql b/Code/Sql/spTranslateTable.sql
@@ -1,14 +1,35 @@
-    /*
-        Stored procedure to translate variable responses in an NHANES questionnaire table
-    */
+/*
+    Stored procedure to translate variable responses in an NHANES questionnaire table
+*/
 
 CREATE PROC spTranslateTable 
-    @SourceTableSchema varchar(8000),
-    @SourceTableName varchar(128),
-    @DestinationTableSchema varchar(8000),
-    @DestinationTableName varchar(128)
+    @SourceTableSchema varchar(MAX),
+    @SourceTableName varchar(MAX),
+    @DestinationTableSchema varchar(MAX),
+    @DestinationTableName varchar(MAX)
 AS
 
+
+    -- check that the variable codebook actually has data for this table
+    DECLARE @variableTranslationCount INT
+    SELECT @variableTranslationCount = COUNT(*) FROM Metadata.VariableCodebook C WHERE C.TableName = @SourceTableName
+
+    -- if there are no translatable variables for this table, just copy it over
+    IF @variableTranslationCount = 0
+        BEGIN
+
+            PRINT 'There are no variables available to translate for ' + @SourceTableSchema + '.' + @SourceTableName + '.  Copying table as-is.'
+            DECLARE @CopyTableStatement varchar(MAX)
+            SET @CopyTableStatement = '
+                DROP TABLE IF EXISTS ' + @DestinationTableSchema + '.' + @DestinationTableName + '
+                SELECT * INTO ' + @DestinationTableSchema + '.' + @DestinationTableName + ' FROM ' + @SourceTableSchema + '.' + @SourceTableName + '
+                CREATE CLUSTERED COLUMNSTORE INDEX idxSeqn ON ' + @DestinationTableSchema + '.' + @DestinationTableName
+            EXEC(@CopyTableStatement)
+
+            -- we can't do any of the steps below, so just return
+            RETURN
+        END
+
     -- drop the destination table if it exists
     DECLARE @DropDestinationStmt varchar(8000)
     SET @DropDestinationStmt = 'DROP TABLE IF EXISTS ' + @DestinationTableSchema + '.' + @DestinationTableName
@@ -24,34 +45,110 @@ AS
 
     -- get all column names of the source table from the information schema
     DROP TABLE IF EXISTS #tmpColNames
-    SELECT COLUMN_NAME
+    SELECT COLUMN_NAME, ORDINAL_POSITION
     INTO #tmpColNames
     FROM INFORMATION_SCHEMA.COLUMNS
-    WHERE TABLE_NAME = @SourceTableName AND TABLE_SCHEMA = @SourceTableSchema
+    WHERE 
+        TABLE_NAME = @SourceTableName 
+        AND TABLE_SCHEMA = @SourceTableSchema
+
+    -- debugging
+    -- SELECT * FROM #tmpColNames
+
+    -- enumerate possible primary key column names
+    DROP TABLE IF EXISTS #tmpPkColNames
+    CREATE TABLE #tmpPkColNames (ColumnName varchar(8000), Priority int)
+    INSERT INTO #tmpPkColNames VALUES ('SEQN', 1)
+    INSERT INTO #tmpPkColNames VALUES ('SAMPLEID', 2)
+    INSERT INTO #tmpPkColNames VALUES ('DRXFDCD', 3)
+    INSERT INTO #tmpPkColNames VALUES ('DRXMC', 4)
+    INSERT INTO #tmpPkColNames VALUES ('POOLID', 5)
+
+    -- figure out which primary key column this table has
+    DECLARE @pkColName varchar(MAX)
+    SELECT TOP 1 @pkColName = P.ColumnName FROM #tmpColNames C INNER JOIN #tmpPkColNames P ON C.COLUMN_NAME = P.ColumnName ORDER BY P.Priority
+
+    -- debugging
+    -- PRINT @pkColName
+
+    -- Figure out which columns should remain numeric and not get translated
+    DROP TABLE IF EXISTS #tmpNumericColumns
+
+    SELECT C.COLUMN_NAME, ORDINAL_POSITION
+    INTO #tmpNumericColumns
+    FROM 
+        #tmpColNames C 
+        INNER JOIN Metadata.VariableCodebook V ON
+            V.TableName = @SourceTableName
+            AND C.COLUMN_NAME = V.Variable
+            AND V.ValueDescription = 'Range of Values'
+    WHERE C.COLUMN_NAME != @pkColName
+    GROUP BY C.COLUMN_NAME, ORDINAL_POSITION
+
+    -- debugging
+    -- SELECT * FROM #tmpNumericColumns
+
+    -- remove the numeric column names from the table that contains all columns to be translated
+    DELETE FROM #tmpColNames WHERE COLUMN_NAME IN (SELECT COLUMN_NAME FROM #tmpNumericColumns)
 
-    -- create comma delimited list of columns to be selected from the source table
-    DECLARE @SourceSelectColNames varchar(8000)
-    SELECT @SourceSelectColNames=STRING_AGG('[' + COLUMN_NAME + ']', ', ') 
+    -- check whether there are any categorical columns left
+    DECLARE @categoricalVariableCount INT
+    SELECT @categoricalVariableCount = COUNT(*) FROM #tmpColNames WHERE COLUMN_NAME != @pkColName
+
+    -- debugging
+    -- PRINT 'number categorical cols left:'
+    -- PRINT @categoricalVariableCount
+
+    -- if there are no translatable variables for this table, just copy it over
+    IF @categoricalVariableCount = 0
+        BEGIN
+            EXEC('
+                DROP TABLE IF EXISTS ' + @DestinationTableSchema + '.' + @DestinationTableName + '
+                SELECT * INTO ' + @DestinationTableSchema + '.' + @DestinationTableName + ' FROM ' + @SourceTableSchema + '.' + @SourceTableName + '
+                CREATE CLUSTERED COLUMNSTORE INDEX idxSeqn ON ' + @DestinationTableSchema + '.' + @DestinationTableName)
+
+            -- we can't do any of the steps below, so just return
+            RETURN
+        END
+
+     -- create comma delimited list of the numeric columns, will be used later to retrieve them from the source table
+    DECLARE @NumericSelectColNames varchar(MAX)
+    SELECT 
+        @NumericSelectColNames = STRING_AGG(CAST('[' + COLUMN_NAME + ']' AS varchar(MAX)), ', ') WITHIN GROUP (ORDER BY ORDINAL_POSITION)
+    FROM #tmpNumericColumns
+
+    -- debugging
+    -- PRINT 'Numeric cols:'
+    -- PRINT @NumericSelectColNames
+
+    -- create comma delimited list of columns to be selected from the source table, including casting to varchar
+    DECLARE @SourceSelectColNames varchar(MAX)
+    SELECT 
+        @SourceSelectColNames = STRING_AGG(CAST('CAST([' + COLUMN_NAME + '] AS varchar(MAX)) AS [' + COLUMN_NAME + ']' AS varchar(MAX)), ', ')
     FROM #tmpColNames
+    WHERE COLUMN_NAME != @pkColName
+
+    SET @SourceSelectColNames = CAST('[' AS varchar(MAX)) + @pkColName + CAST('], ' AS varchar(MAX)) + @SourceSelectColNames
 
+    -- debugging
+    -- PRINT 'source select col names:' 
     -- PRINT @SourceSelectColNames
 
-    -- create comma delimited list of columns to be unpivoted
-    DECLARE @UnpivotColNames varchar(8000)
-    SELECT @UnpivotColNames=STRING_AGG('[' + COLUMN_NAME + ']', ', ') 
+    -- create comma delimited list of columns to be unpivoted, excluding primary key column
+    DECLARE @UnpivotColNames varchar(MAX)
+    SELECT @UnpivotColNames=STRING_AGG(CAST('[' + COLUMN_NAME + ']' AS varchar(MAX)), ', ') WITHIN GROUP (ORDER BY ORDINAL_POSITION)
     FROM #tmpColNames
     WHERE 
-        COLUMN_NAME != 'SEQN'
-        AND COLUMN_NAME != 'DownloadUrl'
-        AND COLUMN_NAME != 'Questionnaire'
-        AND COLUMN_NAME != 'Description'
+        COLUMN_NAME != @pkColName
 
+    -- debugging
+    -- PRINT 'unpivot col names:'
     -- PRINT @UnpivotColNames
 
     -- assemble dynamic SQL to unpivot the original table
-    DECLARE @unpivotStmt varchar(8000)
+    DECLARE @unpivotStmt varchar(MAX)
     SET @unpivotStmt = '
-        SELECT SEQN, Variable, Response 
+        SELECT ' + @pkColName + ', Variable, Response 
         INTO ' + @UnpivotTempTableName + '
         FROM (
             SELECT 
@@ -64,16 +161,17 @@ AS
             )
         ) AS unpvt'
 
+    -- debugging
     -- PRINT @unpivotStmt
     EXEC ('DROP TABLE IF EXISTS ' + @UnpivotTempTableName)
     EXEC(@unpivotStmt)
 
     -- assemble SQL to join the unpivoted table to the variable codebook
     -- to decode the responses
-    DECLARE @TranslateStmt varchar(8000)
+    DECLARE @TranslateStmt varchar(MAX)
     SET @TranslateStmt = '
         SELECT 
-            T.SEQN,
+            T.' + @pkColName + ',
             T.Variable,
             COALESCE(CAST(V.ValueDescription AS VARCHAR), CAST(T.Response AS VARCHAR)) AS ValueDescription
         INTO 
@@ -85,45 +183,87 @@ AS
                 AND CAST(T.Response AS VARCHAR) = CAST(V.CodeOrValue AS VARCHAR)
                 AND V.TableName = ''' + @SourceTableName + '''        
     '
+
+    -- debugging
     -- PRINT @TranslateStmt
     EXEC (@TranslateStmt)
 
     -- assemble SQL to pivot the translated table back into the original schema
-    DECLARE @PivotStmt varchar(8000)
-    SET @PivotStmt = '
-        SELECT * INTO ' + @DestinationTableSchema + '.' + @DestinationTableName + ' FROM (
-            SELECT 
-                SEQN, 
-                Variable, 
-                ValueDescription 
-            FROM ' + @TranslatedTempTableName + '
-        ) AS SourceTable
-        PIVOT (
-            MAX(ValueDescription)
-            FOR Variable IN (
-                    ' + @UnpivotColNames + '
-                ) 
-        ) AS PivotTable
-    '
+    DECLARE @PivotStmt varchar(MAX)
+
+    -- if there are numeric columns that need to be merged with the translated categotical columns
+    IF @NumericSelectColNames IS NOT NULL
+        BEGIN
+            -- build the query to include a join against the source table
+            SET @PivotStmt = '
+                WITH PivotTable AS(
+                    SELECT * 
+                    FROM (
+                        SELECT 
+                            ' + @pkColName + ', 
+                            Variable, 
+                            ValueDescription 
+                        FROM ' + @TranslatedTempTableName + '
+                    ) AS SourceTable
+                    PIVOT (
+                        MAX(ValueDescription)
+                        FOR Variable IN (
+                                ' + @UnpivotColNames + '
+                            ) 
+                    ) AS PivotTable
+                )
+                SELECT P.*, ' + @NumericSelectColNames + ' INTO ' + @DestinationTableSchema + '.' + @DestinationTableName + ' 
+                FROM PivotTable P INNER JOIN ' + @SourceTableSchema + '.' + @SourceTableName + ' S ON S.[' + @pkColName + '] = P.[' + @pkColName + ']
+            '
+        END
+    ELSE
+        BEGIN
+            -- since we have no columns to merge from the original source table, just pivot the 
+            -- translated variables and insert into the destination table
+            SET @PivotStmt = '
+                WITH PivotTable AS(
+                    SELECT * 
+                    FROM (
+                        SELECT 
+                            ' + @pkColName + ', 
+                            Variable, 
+                            ValueDescription 
+                        FROM ' + @TranslatedTempTableName + '
+                    ) AS SourceTable
+                    PIVOT (
+                        MAX(ValueDescription)
+                        FOR Variable IN (
+                                ' + @UnpivotColNames + '
+                            ) 
+                    ) AS PivotTable
+                )
+                SELECT P.* INTO ' + @DestinationTableSchema + '.' + @DestinationTableName + ' 
+                FROM PivotTable P
+            '
+        END
 
+    -- debugging
+    -- PRINT 'Pivot statement:'
     -- PRINT @PivotStmt
     EXEC(@PivotStmt) 
 
     -- insert rows where all of the variable responses were NULL in the original data
     DECLARE @InsertNullStmt varchar(8000)
     SET @InsertNullStmt = '
-        INSERT INTO ' + @DestinationTableSchema + '.' + @DestinationTableName + ' (SEQN)
-        SELECT A.SEQN FROM ' + @SourceTableSchema + '.' + @SourceTableName + ' A LEFT OUTER JOIN ' + @DestinationTableSchema + '.' + @DestinationTableName + ' T ON A.SEQN = T.SEQN WHERE T.SEQN IS NULL
+        INSERT INTO ' + @DestinationTableSchema + '.' + @DestinationTableName + ' (' + @pkColName + ')
+        SELECT A.' + @pkColName + ' FROM ' + @SourceTableSchema + '.' + @SourceTableName + ' A LEFT OUTER JOIN ' + @DestinationTableSchema + '.' + @DestinationTableName + ' T ON A.' + @pkColName + ' = T.' + @pkColName + ' WHERE T.' + @pkColName + ' IS NULL
     '
 
     -- PRINT @InsertNullStmt
     EXEC(@InsertNullStmt)
 
     -- create a clustered index on the destination table with compression
     DECLARE @IndexStmt varchar(8000)
-    SET @IndexStmt = 'CREATE CLUSTERED COLUMNSTORE INDEX idxSeqn ON ' + @DestinationTableSchema + '.' + @DestinationTableName
+    SET @IndexStmt = 'CREATE CLUSTERED COLUMNSTORE INDEX ccix ON ' + @DestinationTableSchema + '.' + @DestinationTableName
 
-    --EXEC('SELECT * FROM ' + @DestinationTableName)
+    -- debugging
+    -- EXEC('SELECT TOP 5 * FROM ' + @DestinationTableSchema + '.' + @DestinationTableName + ' ORDER BY SEQN')
+    -- EXEC('SELECT TOP 5 * FROM ' + @SourceTableSchema + '.' + @SourceTableName + ' ORDER BY SEQN')
 
     DECLARE @GarbageCollectionStmt varchar(8000)
     SET @GarbageCollectionStmt = 'DROP TABLE IF EXISTS ' + @UnpivotTempTableName