Skip to content

Commit

Permalink
Merge pull request #52 from ccb-hms/issue-50
Browse files Browse the repository at this point in the history
Issue 50
  • Loading branch information
nathan-palmer authored Jun 29, 2023
2 parents e0ad987 + de059f7 commit a5b7430
Show file tree
Hide file tree
Showing 8 changed files with 263 additions and 78 deletions.
8 changes: 6 additions & 2 deletions Code/R/download.R
Original file line number Diff line number Diff line change
Expand Up @@ -318,8 +318,12 @@ for (i in i:length(dataTypes)) {
# change DOUBLE to float
createTableQuery = gsub(createTableQuery, pattern = "\" DOUBLE", replace = "\" float", fixed = TRUE)

# we know that SEQN should always be an INT
# we know that SEQN and other primary keys should always be an INT
createTableQuery = gsub(createTableQuery, pattern = "\"SEQN\" float", replace = "\"SEQN\" INT", fixed = TRUE) # nolint
createTableQuery = gsub(createTableQuery, pattern = "\"SAMPLEID\" float", replace = "\"SAMPLEID\" INT", fixed = TRUE) # nolint
createTableQuery = gsub(createTableQuery, pattern = "\"DRXFDCD\" float", replace = "\"DRXFDCD\" INT", fixed = TRUE) # nolint
createTableQuery = gsub(createTableQuery, pattern = "\"DRXMC\" float", replace = "\"DRXMC\" INT", fixed = TRUE) # nolint
createTableQuery = gsub(createTableQuery, pattern = "\"POOLID\" float", replace = "\"POOLID\" INT", fixed = TRUE) # nolint

# remove double quotes, which interferes with the schema specification
createTableQuery = gsub(createTableQuery, pattern = '"', replace = "", fixed = TRUE)
Expand All @@ -339,7 +343,7 @@ for (i in i:length(dataTypes)) {
SqlTools::dbSendUpdate(cn, insertStatement)

indexStatement = paste(sep="",
"CREATE CLUSTERED COLUMNSTORE INDEX idxSeqn ON Raw.", currDataType)
"CREATE CLUSTERED COLUMNSTORE INDEX ccix ON Raw.", currDataType)

SqlTools::dbSendUpdate(cn, indexStatement)

Expand Down
5 changes: 5 additions & 0 deletions Code/R/translateRawTables.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,8 @@ for (i in 1:nrow(tableList)) {
SqlTools::dbSendUpdate(cn, stmt)
}

# shrink transaction log
SqlTools::dbSendUpdate(cn, "DBCC SHRINKFILE(NhanesLandingZone_log)")

# issue checkpoint
SqlTools::dbSendUpdate(cn, "CHECKPOINT")
42 changes: 42 additions & 0 deletions Code/Sql/RerunTableTranslateOnMissingTables.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/*
This is a debugging tool to help identify tables that fail to translate and re-run
their translations once a bugfix has been proposed.
*/

DROP TABLE IF EXISTS #tmpRawTables
DROP TABLE IF EXISTS #tmpTranslatedTables

-- loop over all missing tables in the last release
SELECT TABLE_NAME
INTO #tmpRawTables
FROM INFORMATION_SCHEMA.TABLES
WHERE TABLE_SCHEMA = 'Raw'

SELECT TABLE_NAME
INTO #tmpTranslatedTables
FROM INFORMATION_SCHEMA.TABLES
WHERE TABLE_SCHEMA = 'Translated'

DROP TABLE IF EXISTS #tmpDebugTables
SELECT R.TABLE_NAME INTO #tmpDebugTables FROM #tmpRawTables R LEFT OUTER JOIN #tmpTranslatedTables T ON R.TABLE_NAME = T.TABLE_NAME WHERE T.TABLE_NAME IS NULL ORDER BY R.TABLE_NAME

SELECT * FROM #tmpDebugTables

DECLARE cTableNames CURSOR FOR SELECT * FROM #tmpDebugTables ORDER BY TABLE_NAME
DECLARE @currTableName varchar(256)
OPEN cTableNames

FETCH NEXT FROM cTableNames INTO @currTableName
WHILE @@FETCH_STATUS = 0
BEGIN
PRINT @currTableName
EXECUTE dbo.spTranslateTable
'Raw'
,@currTableName
,'Translated'
,@currTableName
FETCH NEXT FROM cTableNames INTO @currTableName
END

CLOSE cTableNames
DEALLOCATE cTableNames
24 changes: 24 additions & 0 deletions Code/Sql/RunTableTranslateOnSingleTable.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
-- Manually run spTranslateTable for debugging

DECLARE @RC int
DECLARE @SourceTableSchema varchar(8000)
DECLARE @SourceTableName varchar(128)
DECLARE @DestinationTableSchema varchar(8000)
DECLARE @DestinationTableName varchar(128)

-- TODO: Set parameter values here.
-- CAFE_G
SET @SourceTableSchema = 'Raw'
SET @SourceTableName = 'POOLTF_G'
SET @DestinationTableSchema = 'Translated'

EXECUTE @RC = [dbo].[spTranslateTable]
@SourceTableSchema
,@SourceTableName
,@DestinationTableSchema
,@DestinationTableName = @SourceTableName

EXEC ('SELECT TOP 10 * FROM ' + @DestinationTableSchema + '.' + @SourceTableName + ' ORDER BY SEQN')
EXEC ('SELECT TOP 10 * FROM ' + @SourceTableSchema + '.' + @SourceTableName + ' ORDER BY SEQN')

-- SELECT * FROM Metadata.VariableCodebook V WHERE V.TableName='POOLTF_G'
226 changes: 183 additions & 43 deletions Code/Sql/spTranslateTable.sql
Original file line number Diff line number Diff line change
@@ -1,14 +1,35 @@
/*
Stored procedure to translate variable responses in an NHANES questionnaire table
*/
/*
Stored procedure to translate variable responses in an NHANES questionnaire table
*/

CREATE PROC spTranslateTable
@SourceTableSchema varchar(8000),
@SourceTableName varchar(128),
@DestinationTableSchema varchar(8000),
@DestinationTableName varchar(128)
@SourceTableSchema varchar(MAX),
@SourceTableName varchar(MAX),
@DestinationTableSchema varchar(MAX),
@DestinationTableName varchar(MAX)
AS


-- check that the variable codebook actually has data for this table
DECLARE @variableTranslationCount INT
SELECT @variableTranslationCount = COUNT(*) FROM Metadata.VariableCodebook C WHERE C.TableName = @SourceTableName

-- if there are no translatable variables for this table, just copy it over
IF @variableTranslationCount = 0
BEGIN

PRINT 'There are no variables available to translate for ' + @SourceTableSchema + '.' + @SourceTableName + '. Copying table as-is.'
DECLARE @CopyTableStatement varchar(MAX)
SET @CopyTableStatement = '
DROP TABLE IF EXISTS ' + @DestinationTableSchema + '.' + @DestinationTableName + '
SELECT * INTO ' + @DestinationTableSchema + '.' + @DestinationTableName + ' FROM ' + @SourceTableSchema + '.' + @SourceTableName + '
CREATE CLUSTERED COLUMNSTORE INDEX idxSeqn ON ' + @DestinationTableSchema + '.' + @DestinationTableName
EXEC(@CopyTableStatement)

-- we can't do any of the steps below, so just return
RETURN
END

-- drop the destination table if it exists
DECLARE @DropDestinationStmt varchar(8000)
SET @DropDestinationStmt = 'DROP TABLE IF EXISTS ' + @DestinationTableSchema + '.' + @DestinationTableName
Expand All @@ -24,34 +45,110 @@ AS

-- get all column names of the source table from the information schema
DROP TABLE IF EXISTS #tmpColNames
SELECT COLUMN_NAME
SELECT COLUMN_NAME, ORDINAL_POSITION
INTO #tmpColNames
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = @SourceTableName AND TABLE_SCHEMA = @SourceTableSchema
WHERE
TABLE_NAME = @SourceTableName
AND TABLE_SCHEMA = @SourceTableSchema

-- debugging
-- SELECT * FROM #tmpColNames

-- enumerate possible primary key column names
DROP TABLE IF EXISTS #tmpPkColNames
CREATE TABLE #tmpPkColNames (ColumnName varchar(8000), Priority int)
INSERT INTO #tmpPkColNames VALUES ('SEQN', 1)
INSERT INTO #tmpPkColNames VALUES ('SAMPLEID', 2)
INSERT INTO #tmpPkColNames VALUES ('DRXFDCD', 3)
INSERT INTO #tmpPkColNames VALUES ('DRXMC', 4)
INSERT INTO #tmpPkColNames VALUES ('POOLID', 5)

-- figure out which primary key column this table has
DECLARE @pkColName varchar(MAX)
SELECT TOP 1 @pkColName = P.ColumnName FROM #tmpColNames C INNER JOIN #tmpPkColNames P ON C.COLUMN_NAME = P.ColumnName ORDER BY P.Priority

-- debugging
-- PRINT @pkColName

-- Figure out which columns should remain numeric and not get translated
DROP TABLE IF EXISTS #tmpNumericColumns

SELECT C.COLUMN_NAME, ORDINAL_POSITION
INTO #tmpNumericColumns
FROM
#tmpColNames C
INNER JOIN Metadata.VariableCodebook V ON
V.TableName = @SourceTableName
AND C.COLUMN_NAME = V.Variable
AND V.ValueDescription = 'Range of Values'
WHERE C.COLUMN_NAME != @pkColName
GROUP BY C.COLUMN_NAME, ORDINAL_POSITION

-- debugging
-- SELECT * FROM #tmpNumericColumns

-- remove the numeric column names from the table that contains all columns to be translated
DELETE FROM #tmpColNames WHERE COLUMN_NAME IN (SELECT COLUMN_NAME FROM #tmpNumericColumns)

-- create comma delimited list of columns to be selected from the source table
DECLARE @SourceSelectColNames varchar(8000)
SELECT @SourceSelectColNames=STRING_AGG('[' + COLUMN_NAME + ']', ', ')
-- check whether there are any categorical columns left
DECLARE @categoricalVariableCount INT
SELECT @categoricalVariableCount = COUNT(*) FROM #tmpColNames WHERE COLUMN_NAME != @pkColName

-- debugging
-- PRINT 'number categorical cols left:'
-- PRINT @categoricalVariableCount

-- if there are no translatable variables for this table, just copy it over
IF @categoricalVariableCount = 0
BEGIN
EXEC('
DROP TABLE IF EXISTS ' + @DestinationTableSchema + '.' + @DestinationTableName + '
SELECT * INTO ' + @DestinationTableSchema + '.' + @DestinationTableName + ' FROM ' + @SourceTableSchema + '.' + @SourceTableName + '
CREATE CLUSTERED COLUMNSTORE INDEX idxSeqn ON ' + @DestinationTableSchema + '.' + @DestinationTableName)

-- we can't do any of the steps below, so just return
RETURN
END

-- create comma delimited list of the numeric columns, will be used later to retrieve them from the source table
DECLARE @NumericSelectColNames varchar(MAX)
SELECT
@NumericSelectColNames = STRING_AGG(CAST('[' + COLUMN_NAME + ']' AS varchar(MAX)), ', ') WITHIN GROUP (ORDER BY ORDINAL_POSITION)
FROM #tmpNumericColumns

-- debugging
-- PRINT 'Numeric cols:'
-- PRINT @NumericSelectColNames

-- create comma delimited list of columns to be selected from the source table, including casting to varchar
DECLARE @SourceSelectColNames varchar(MAX)
SELECT
@SourceSelectColNames = STRING_AGG(CAST('CAST([' + COLUMN_NAME + '] AS varchar(MAX)) AS [' + COLUMN_NAME + ']' AS varchar(MAX)), ', ')
FROM #tmpColNames
WHERE COLUMN_NAME != @pkColName

SET @SourceSelectColNames = CAST('[' AS varchar(MAX)) + @pkColName + CAST('], ' AS varchar(MAX)) + @SourceSelectColNames

-- debugging
-- PRINT 'source select col names:'
-- PRINT @SourceSelectColNames

-- create comma delimited list of columns to be unpivoted
DECLARE @UnpivotColNames varchar(8000)
SELECT @UnpivotColNames=STRING_AGG('[' + COLUMN_NAME + ']', ', ')
-- create comma delimited list of columns to be unpivoted, excluding primary key column
DECLARE @UnpivotColNames varchar(MAX)
SELECT @UnpivotColNames=STRING_AGG(CAST('[' + COLUMN_NAME + ']' AS varchar(MAX)), ', ') WITHIN GROUP (ORDER BY ORDINAL_POSITION)
FROM #tmpColNames
WHERE
COLUMN_NAME != 'SEQN'
AND COLUMN_NAME != 'DownloadUrl'
AND COLUMN_NAME != 'Questionnaire'
AND COLUMN_NAME != 'Description'
COLUMN_NAME != @pkColName

-- debugging
-- PRINT 'unpivot col names:'
-- PRINT @UnpivotColNames

-- assemble dynamic SQL to unpivot the original table
DECLARE @unpivotStmt varchar(8000)
DECLARE @unpivotStmt varchar(MAX)
SET @unpivotStmt = '
SELECT SEQN, Variable, Response
SELECT ' + @pkColName + ', Variable, Response
INTO ' + @UnpivotTempTableName + '
FROM (
SELECT
Expand All @@ -64,16 +161,17 @@ AS
)
) AS unpvt'

-- debugging
-- PRINT @unpivotStmt
EXEC ('DROP TABLE IF EXISTS ' + @UnpivotTempTableName)
EXEC(@unpivotStmt)

-- assemble SQL to join the unpivoted table to the variable codebook
-- to decode the responses
DECLARE @TranslateStmt varchar(8000)
DECLARE @TranslateStmt varchar(MAX)
SET @TranslateStmt = '
SELECT
T.SEQN,
T.' + @pkColName + ',
T.Variable,
COALESCE(CAST(V.ValueDescription AS VARCHAR), CAST(T.Response AS VARCHAR)) AS ValueDescription
INTO
Expand All @@ -85,45 +183,87 @@ AS
AND CAST(T.Response AS VARCHAR) = CAST(V.CodeOrValue AS VARCHAR)
AND V.TableName = ''' + @SourceTableName + '''
'

-- debugging
-- PRINT @TranslateStmt
EXEC (@TranslateStmt)

-- assemble SQL to pivot the translated table back into the original schema
DECLARE @PivotStmt varchar(8000)
SET @PivotStmt = '
SELECT * INTO ' + @DestinationTableSchema + '.' + @DestinationTableName + ' FROM (
SELECT
SEQN,
Variable,
ValueDescription
FROM ' + @TranslatedTempTableName + '
) AS SourceTable
PIVOT (
MAX(ValueDescription)
FOR Variable IN (
' + @UnpivotColNames + '
)
) AS PivotTable
'
DECLARE @PivotStmt varchar(MAX)

-- if there are numeric columns that need to be merged with the translated categotical columns
IF @NumericSelectColNames IS NOT NULL
BEGIN
-- build the query to include a join against the source table
SET @PivotStmt = '
WITH PivotTable AS(
SELECT *
FROM (
SELECT
' + @pkColName + ',
Variable,
ValueDescription
FROM ' + @TranslatedTempTableName + '
) AS SourceTable
PIVOT (
MAX(ValueDescription)
FOR Variable IN (
' + @UnpivotColNames + '
)
) AS PivotTable
)
SELECT P.*, ' + @NumericSelectColNames + ' INTO ' + @DestinationTableSchema + '.' + @DestinationTableName + '
FROM PivotTable P INNER JOIN ' + @SourceTableSchema + '.' + @SourceTableName + ' S ON S.[' + @pkColName + '] = P.[' + @pkColName + ']
'
END
ELSE
BEGIN
-- since we have no columns to merge from the original source table, just pivot the
-- translated variables and insert into the destination table
SET @PivotStmt = '
WITH PivotTable AS(
SELECT *
FROM (
SELECT
' + @pkColName + ',
Variable,
ValueDescription
FROM ' + @TranslatedTempTableName + '
) AS SourceTable
PIVOT (
MAX(ValueDescription)
FOR Variable IN (
' + @UnpivotColNames + '
)
) AS PivotTable
)
SELECT P.* INTO ' + @DestinationTableSchema + '.' + @DestinationTableName + '
FROM PivotTable P
'
END

-- debugging
-- PRINT 'Pivot statement:'
-- PRINT @PivotStmt
EXEC(@PivotStmt)

-- insert rows where all of the variable responses were NULL in the original data
DECLARE @InsertNullStmt varchar(8000)
SET @InsertNullStmt = '
INSERT INTO ' + @DestinationTableSchema + '.' + @DestinationTableName + ' (SEQN)
SELECT A.SEQN FROM ' + @SourceTableSchema + '.' + @SourceTableName + ' A LEFT OUTER JOIN ' + @DestinationTableSchema + '.' + @DestinationTableName + ' T ON A.SEQN = T.SEQN WHERE T.SEQN IS NULL
INSERT INTO ' + @DestinationTableSchema + '.' + @DestinationTableName + ' (' + @pkColName + ')
SELECT A.' + @pkColName + ' FROM ' + @SourceTableSchema + '.' + @SourceTableName + ' A LEFT OUTER JOIN ' + @DestinationTableSchema + '.' + @DestinationTableName + ' T ON A.' + @pkColName + ' = T.' + @pkColName + ' WHERE T.' + @pkColName + ' IS NULL
'

-- PRINT @InsertNullStmt
EXEC(@InsertNullStmt)

-- create a clustered index on the destination table with compression
DECLARE @IndexStmt varchar(8000)
SET @IndexStmt = 'CREATE CLUSTERED COLUMNSTORE INDEX idxSeqn ON ' + @DestinationTableSchema + '.' + @DestinationTableName
SET @IndexStmt = 'CREATE CLUSTERED COLUMNSTORE INDEX ccix ON ' + @DestinationTableSchema + '.' + @DestinationTableName

--EXEC('SELECT * FROM ' + @DestinationTableName)
-- debugging
-- EXEC('SELECT TOP 5 * FROM ' + @DestinationTableSchema + '.' + @DestinationTableName + ' ORDER BY SEQN')
-- EXEC('SELECT TOP 5 * FROM ' + @SourceTableSchema + '.' + @SourceTableName + ' ORDER BY SEQN')

DECLARE @GarbageCollectionStmt varchar(8000)
SET @GarbageCollectionStmt = 'DROP TABLE IF EXISTS ' + @UnpivotTempTableName
Expand Down
Loading

0 comments on commit a5b7430

Please sign in to comment.