diff --git a/sql-network/sql-network-01-nhanes-p1.ipynb b/sql-network/sql-network-01-nhanes-p1.ipynb new file mode 100644 index 0000000..536b151 --- /dev/null +++ b/sql-network/sql-network-01-nhanes-p1.ipynb @@ -0,0 +1,1744 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%defaultDatasource jdbc:h2:mem:db" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Reference values for NHANES for the 2005-2006 survey\n", + "\n", + "* Extracted from data of the NHANES Web site (https://wwwn.cdc.gov/nchs/nhanes/).\n", + "\n", + "## Importing normal ranges of values indicated in the NHANES documentation\n", + "\n", + "The following query imports the file `reference-ranges.csv` that contains reference ranges from NHANES.\n", + "\n", + "* For each variable, the file indicates:\n", + " - applicable gender\n", + " - age range (ageStart until ageEnd)\n", + "\n", + "* The range is indicated in the form of minimum and maximum values considered normal." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a1be122a-b5b2-42ae-a324-96699334b482", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0ad46312-825e-429c-8770-9f1e135f113b", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP TABLE IF EXISTS ReferenceRanges;\n", + "CREATE TABLE ReferenceRanges (\n", + " variable VARCHAR(8),\n", + " gender VARCHAR(1),\n", + " ageStart SMALLINT,\n", + " ageEnd SMALLINT,\n", + " min DECIMAL(7,1),\n", + " max DECIMAL(7,1),\n", + " PRIMARY KEY(variable,gender,ageStart,ageEnd)\n", + ") AS SELECT\n", + " variable,gender,ageStart,ageEnd,min,max\n", + "FROM CSVREAD('../data/nhanes2005-2006/reference-ranges.csv');\n", + "\n", + "SELECT DISTINCT variable FROM ReferenceRanges;\n", + "SELECT * FROM ReferenceRanges;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Survey NHANES 2005-2006\n", + "\n", + "* Extracted from data of the NHANES Web site (https://wwwn.cdc.gov/nchs/nhanes/).\n", + "\n", + "## Importing data from the survey NHANES 2005-2006\n", + "\n", + "* The following query imports the `combined-selected-variables.csv` that contains a tuple for each individual, with a selected set of variables that are used to diagnose anemia, as mentioned in Figure 1. Only individuals with values for all fields were kept.\n", + "\n", + "![evaluation of anemia](evaluation-of-anemia.gif \"Figure 1\")\n", + "*Figure 1*: Evaluation of anemia in the adult according to the mean corpuscular volume. 
CBC: complete blood count; MCV: mean corpuscular volume; RBCs: red blood cells; Fe: iron; TIBC: total iron-binding capacity (transferrin); LDH: lactate dehydrogenase [6]." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f1267ae5-4d61-431b-9292-12a6202cba98", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "526dafd3-1d6e-40ba-bc26-c8eeefa9c603", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP TABLE IF EXISTS Survey;\n", + "CREATE TABLE Survey (\n", + " SEQN VARCHAR(8),\n", + " RIAGENDR VARCHAR(1),\n", + " RIDAGEYR SMALLINT,\n", + " LBXIRN DECIMAL(7,1),\n", + " LBXTIB DECIMAL(7,1),\n", + " LBXSLDSI DECIMAL(7,1),\n", + " LBXWBCSI DECIMAL(7,1),\n", + " LBXLYPCT DECIMAL(7,1),\n", + " LBXMOPCT DECIMAL(7,1),\n", + " LBXNEPCT DECIMAL(7,1),\n", + " LBXEOPCT DECIMAL(7,1),\n", + " LBXBAPCT DECIMAL(7,1),\n", + " LBXRBCSI DECIMAL(7,1),\n", + " LBXHGB DECIMAL(7,1),\n", + " LBXHCT DECIMAL(7,1),\n", + " LBXMCVSI DECIMAL(7,1),\n", + " LBXMCHSI DECIMAL(7,1),\n", + " LBXMC DECIMAL(7,1),\n", + " LBXRDW DECIMAL(7,1),\n", + " LBXPLTSI DECIMAL(7,1),\n", + " LBXMPSI DECIMAL(7,1),\n", + " PRIMARY KEY(SEQN)\n", + ") AS SELECT\n", + " SEQN,RIAGENDR,RIDAGEYR,LBXIRN,LBXTIB,LBXSLDSI,LBXWBCSI,LBXLYPCT,LBXMOPCT,LBXNEPCT,LBXEOPCT,LBXBAPCT,LBXRBCSI,LBXHGB,LBXHCT,LBXMCVSI,LBXMCHSI,LBXMC,LBXRDW,LBXPLTSI,LBXMPSI\n", + "FROM CSVREAD('../data/nhanes2005-2006/combined-selected-variables.csv');\n", + "\n", + "SELECT COUNT(*) FROM Survey;\n", + "SELECT * FROM Survey;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Codes and description of NHANES variables\n", + "\n", + "* The 
following query imports the `reference-ranges-variables.csv` that contains codes and descriptions of the variables adopted in this study." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3f8a660d-60e6-417b-8fad-0e9098d253c1", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP TABLE IF EXISTS VariableDescription;\n", + "CREATE TABLE VariableDescription (\n", + " variable VARCHAR(8),\n", + " acronym VARCHAR(8),\n", + " name VARCHAR(50),\n", + " unit VARCHAR(30),\n", + " file VARCHAR(20),\n", + " ranges VARCHAR(100),\n", + " PRIMARY KEY(variable)\n", + ") AS SELECT\n", + " variable,acronym,name,unit,file,ranges\n", + "FROM CSVREAD('../data/nhanes2005-2006/reference-ranges-variables.csv');\n", + "\n", + "SELECT * FROM VariableDescription;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Binary evaluation of individuals out of the normal ranges\n", + "\n", + "* For each variable, this table defines an extra binary column _b which is initialized with 0 and will receive 1 if the variable is out of the NHANES range." 
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Generation of the starting matrix initialized with 0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d494781b-8a69-485c-931c-8f106e53d9ab",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "fdef6be5-3279-4153-a856-f6e1afef3fff",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "DROP TABLE IF EXISTS SurveyB;\n",
+ "CREATE TABLE SurveyB (\n",
+ " SEQN VARCHAR(8),\n",
+ " RIAGENDR VARCHAR(1),\n",
+ " RIDAGEYR SMALLINT,\n",
+ " LBXIRN DECIMAL(7,1),\n",
+ " LBXIRN_b SMALLINT DEFAULT 0,\n",
+ " LBXTIB DECIMAL(7,1),\n",
+ " LBXTIB_b SMALLINT DEFAULT 0,\n",
+ " LBXSLDSI DECIMAL(7,1),\n",
+ " LBXSLDSI_b SMALLINT DEFAULT 0,\n",
+ " LBXWBCSI DECIMAL(7,1),\n",
+ " LBXWBCSI_b SMALLINT DEFAULT 0,\n",
+ " LBXLYPCT DECIMAL(7,1),\n",
+ " LBXLYPCT_b SMALLINT DEFAULT 0,\n",
+ " LBXMOPCT DECIMAL(7,1),\n",
+ " LBXMOPCT_b SMALLINT DEFAULT 0,\n",
+ " LBXNEPCT DECIMAL(7,1),\n",
+ " LBXNEPCT_b SMALLINT DEFAULT 0,\n",
+ " LBXEOPCT DECIMAL(7,1),\n",
+ " LBXEOPCT_b SMALLINT DEFAULT 0,\n",
+ " LBXBAPCT DECIMAL(7,1),\n",
+ " LBXBAPCT_b SMALLINT DEFAULT 0,\n",
+ " LBXRBCSI DECIMAL(7,1),\n",
+ " LBXRBCSI_b SMALLINT DEFAULT 0,\n",
+ " LBXHGB DECIMAL(7,1),\n",
+ " LBXHGB_b SMALLINT DEFAULT 0,\n",
+ " LBXHCT DECIMAL(7,1),\n",
+ " LBXHCT_b SMALLINT DEFAULT 0,\n",
+ " LBXMCVSI DECIMAL(7,1),\n",
+ " LBXMCVSI_b SMALLINT DEFAULT 0,\n",
+ " LBXMCHSI DECIMAL(7,1),\n",
+ " LBXMCHSI_b SMALLINT DEFAULT 0,\n",
+ " LBXMC DECIMAL(7,1),\n",
+ " LBXMC_b SMALLINT DEFAULT 0,\n",
+ " LBXRDW DECIMAL(7,1),\n",
+ " LBXRDW_b SMALLINT DEFAULT 0,\n",
+ " LBXPLTSI DECIMAL(7,1),\n",
+ " LBXPLTSI_b SMALLINT DEFAULT 0,\n",
+ " LBXMPSI DECIMAL(7,1),\n",
+ " LBXMPSI_b SMALLINT DEFAULT 0,\n",
+ " PRIMARY KEY(SEQN)\n",
+ ") AS SELECT\n",
+ " SEQN,RIAGENDR,RIDAGEYR,LBXIRN,0,LBXTIB,0,LBXSLDSI,0,LBXWBCSI,0,LBXLYPCT,0,LBXMOPCT,0,LBXNEPCT,0,LBXEOPCT,0,LBXBAPCT,0,LBXRBCSI,0,LBXHGB,0,LBXHCT,0,LBXMCVSI,0,LBXMCHSI,0,LBXMC,0,LBXRDW,0,LBXPLTSI,0,LBXMPSI,0\n",
+ "FROM CSVREAD('../data/nhanes2005-2006/combined-selected-variables.csv');\n",
+ "\n",
+ "SELECT COUNT(*) FROM SurveyB;\n",
+ "SELECT * FROM SurveyB;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Matrix building\n",
+ "\n",
+ "* Each variable is compared with the limits of the NHANES ranges, and the binary _b columns are updated."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "-- Each statement below sets the _b column of one variable to 1 when the\n",
+ "-- individual's value lies outside the [min, max] interval of the reference\n",
+ "-- range matching the individual's gender and age (ageStart to ageEnd, inclusive).\n",
+ "-- Rows without a matching reference range keep the initial value 0.\n",
+ "-- NOT BETWEEN means: value below min OR value above max.\n",
+ "\n",
+ "-- Computing LBXIRN\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXIRN_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXIRN' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXIRN NOT BETWEEN RRb.min AND RRb.max);\n",
+ "\n",
+ "-- Computing LBXTIB\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXTIB_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXTIB' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXTIB NOT BETWEEN RRb.min AND RRb.max);\n",
+ "\n",
+ "-- Computing LBXSLDSI\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXSLDSI_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXSLDSI' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXSLDSI NOT BETWEEN RRb.min AND RRb.max);\n",
+ "\n",
+ "-- Computing LBXWBCSI\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXWBCSI_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXWBCSI' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXWBCSI NOT BETWEEN RRb.min AND RRb.max);\n",
+ "\n",
+ "-- Computing LBXLYPCT\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXLYPCT_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXLYPCT' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXLYPCT NOT BETWEEN RRb.min AND RRb.max);\n",
+ "\n",
+ "-- Computing LBXMOPCT\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXMOPCT_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXMOPCT' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXMOPCT NOT BETWEEN RRb.min AND RRb.max);\n",
+ "\n",
+ "-- Computing LBXNEPCT\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXNEPCT_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXNEPCT' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXNEPCT NOT BETWEEN RRb.min AND RRb.max);\n",
+ "\n",
+ "-- Computing LBXEOPCT\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXEOPCT_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXEOPCT' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXEOPCT NOT BETWEEN RRb.min AND RRb.max);\n",
+ "\n",
+ "-- Computing LBXBAPCT\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXBAPCT_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXBAPCT' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXBAPCT NOT BETWEEN RRb.min AND RRb.max);\n",
+ "\n",
+ "-- Computing LBXRBCSI\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXRBCSI_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXRBCSI' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXRBCSI NOT BETWEEN RRb.min AND RRb.max);\n",
+ "\n",
+ "-- Computing LBXHGB\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXHGB_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXHGB' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXHGB NOT BETWEEN RRb.min AND RRb.max);\n",
+ "\n",
+ "-- Computing LBXHCT\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXHCT_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXHCT' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXHCT NOT BETWEEN RRb.min AND RRb.max);\n",
+ "\n",
+ "-- Computing LBXMCVSI\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXMCVSI_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXMCVSI' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXMCVSI NOT BETWEEN RRb.min AND RRb.max);\n",
+ "\n",
+ "-- Computing LBXMCHSI\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXMCHSI_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXMCHSI' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXMCHSI NOT BETWEEN RRb.min AND RRb.max);\n",
+ "\n",
+ "-- Computing LBXMC\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXMC_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXMC' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXMC NOT BETWEEN RRb.min AND RRb.max);\n",
+ "\n",
+ "-- Computing LBXRDW\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXRDW_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXRDW' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXRDW NOT BETWEEN RRb.min AND RRb.max);\n",
+ "\n",
+ "-- Computing LBXPLTSI\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXPLTSI_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXPLTSI' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXPLTSI NOT BETWEEN RRb.min AND RRb.max);\n",
+ "\n",
+ "-- Computing LBXMPSI\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXMPSI_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT 1\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXMPSI' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND SB.LBXMPSI NOT BETWEEN RRb.min AND RRb.max);"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Final Matrix\n",
+ "\n",
+ "* Building of the final matrix that has the identification of the person, a binary _b matrix, and a profile built by the concatenation of the columns of the binary matrix.\n",
+ "* The profile represents, in binary form, which variables are out of range for each person.\n",
+ "* Only persons with at least one out-of-range value are kept.\n",
+ "\n",
+ "* The resulting matrix produces a CSV file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7c5c8635-ad3f-4e3b-9acb-0a37f2ce5180",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "e6df3da0-d319-4a14-80da-38f3cfe76c34",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "ee22d94b-f472-469b-898f-f387da519831",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "-- DeviationProfiles is defined on top of CorrelationMatrix, so it is dropped first.\n",
+ "DROP VIEW IF EXISTS DeviationProfiles;\n",
+ "DROP VIEW IF EXISTS CorrelationMatrix;\n",
+ "\n",
+ "-- One row per individual (SEQN is the primary key of SurveyB) having at least one\n",
+ "-- out-of-range flag. No join with ReferenceRanges is needed: a _b flag is only set\n",
+ "-- to 1 by an UPDATE that already found a reference range matching gender and age.\n",
+ "-- profile is the concatenation of the 18 binary flags, in column order.\n",
+ "CREATE VIEW CorrelationMatrix AS\n",
+ "SELECT SB.SEQN,\n",
+ " CONCAT(SB.LBXIRN_b, SB.LBXTIB_b, SB.LBXSLDSI_b, SB.LBXWBCSI_b, SB.LBXLYPCT_b, SB.LBXMOPCT_b, SB.LBXNEPCT_b, SB.LBXEOPCT_b, SB.LBXBAPCT_b, SB.LBXRBCSI_b, SB.LBXHGB_b, SB.LBXHCT_b, SB.LBXMCVSI_b, SB.LBXMCHSI_b, SB.LBXMC_b, SB.LBXRDW_b, SB.LBXPLTSI_b, SB.LBXMPSI_b) AS profile,\n",
+ " SB.LBXIRN_b, SB.LBXTIB_b, SB.LBXSLDSI_b, SB.LBXWBCSI_b, SB.LBXLYPCT_b, SB.LBXMOPCT_b, SB.LBXNEPCT_b, SB.LBXEOPCT_b, SB.LBXBAPCT_b, SB.LBXRBCSI_b, SB.LBXHGB_b, SB.LBXHCT_b, SB.LBXMCVSI_b, SB.LBXMCHSI_b, SB.LBXMC_b, SB.LBXRDW_b, SB.LBXPLTSI_b, SB.LBXMPSI_b\n",
+ "FROM SurveyB SB\n",
+ "WHERE SB.LBXIRN_b>0 OR SB.LBXTIB_b>0 OR SB.LBXSLDSI_b>0 OR SB.LBXWBCSI_b>0 OR SB.LBXLYPCT_b>0 OR SB.LBXMOPCT_b>0 OR SB.LBXNEPCT_b>0 OR SB.LBXEOPCT_b>0 OR SB.LBXBAPCT_b>0 OR SB.LBXRBCSI_b>0 OR SB.LBXHGB_b>0 OR SB.LBXHCT_b>0 OR SB.LBXMCVSI_b>0 OR SB.LBXMCHSI_b>0 OR SB.LBXMC_b>0 OR SB.LBXRDW_b>0 OR SB.LBXPLTSI_b>0 OR SB.LBXMPSI_b>0;\n",
+ "\n",
+ "SELECT COUNT(*) FROM CorrelationMatrix;\n",
+ "SELECT * FROM CorrelationMatrix;\n",
+ "\n",
+ "CALL CSVWRITE('../data/nhanes2005-2006/correlation-matrix.csv', 'SELECT * FROM CorrelationMatrix');"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Profiles network\n",
+ "\n",
+ "* Persons are here related from their binary profiles, producing a profiles network."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Grouping profiles\n",
+ "\n",
+ "* Profiles are grouped according to a binary pattern and people with the same profile are aggregated."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "43cb8c38-a53d-4c94-b618-7288e6287e15",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "6fda72d2-e206-406a-a2a0-3f5e036294db",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "05ca46f0-1cf1-469f-bb70-713db17f683b",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "DROP VIEW IF EXISTS DeviationProfiles;\n",
+ "\n",
+ "CREATE VIEW DeviationProfiles AS\n",
+ "SELECT CM.profile, COUNT(*) AS individuals\n",
+ "FROM CorrelationMatrix CM\n",
+ "GROUP BY CM.profile;\n",
+ "\n",
+ "SELECT SUM(individuals) FROM DeviationProfiles;\n",
+ "SELECT * FROM DeviationProfiles;\n",
+ "\n",
+ "CALL CSVWRITE('../data/nhanes2005-2006/profile-number-deviation.csv', 'SELECT DP.profile AS id, DP.individuals AS weight FROM DeviationProfiles DP');"
+ ]
+ },
+ {
+ 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "# Matrix with deviation intensity\n", + "\n", + "* This second matrix records the deviation of variables that overcomes the limits and how much the overcome." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generation of the starting matrix initialized with 0" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "DROP TABLE IF EXISTS SurveyD;\n", + "CREATE TABLE SurveyD (\n", + " SEQN VARCHAR(8),\n", + " RIAGENDR VARCHAR(1),\n", + " RIDAGEYR SMALLINT,\n", + " LBXIRN DECIMAL(7,1),\n", + " LBXIRN_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXTIB DECIMAL(7,1),\n", + " LBXTIB_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXSLDSI DECIMAL(7,1),\n", + " LBXSLDSI_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXWBCSI DECIMAL(7,1),\n", + " LBXWBCSI_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXLYPCT DECIMAL(7,1),\n", + " LBXLYPCT_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXMOPCT DECIMAL(7,1),\n", + " LBXMOPCT_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXNEPCT DECIMAL(7,1),\n", + " LBXNEPCT_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXEOPCT DECIMAL(7,1),\n", + " LBXEOPCT_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXBAPCT DECIMAL(7,1),\n", + " LBXBAPCT_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXRBCSI DECIMAL(7,1),\n", + " LBXRBCSI_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXHGB DECIMAL(7,1),\n", + " LBXHGB_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXHCT DECIMAL(7,1),\n", + " LBXHCT_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXMCVSI DECIMAL(7,1),\n", + " LBXMCVSI_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXMCHSI DECIMAL(7,1),\n", + " LBXMCHSI_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXMC DECIMAL(7,1),\n", + " LBXMC_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXRDW DECIMAL(7,1),\n", + " LBXRDW_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXPLTSI DECIMAL(7,1),\n", + " LBXPLTSI_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXMPSI DECIMAL(7,1),\n", + " LBXMPSI_d DECIMAL(7,1) DEFAULT 0,\n", + " PRIMARY KEY(SEQN)\n", + ") AS SELECT\n", + " 
SEQN,RIAGENDR,RIDAGEYR,LBXIRN,0,LBXTIB,0,LBXSLDSI,0,LBXWBCSI,0,LBXLYPCT,0,LBXMOPCT,0,LBXNEPCT,0,LBXEOPCT,0,LBXBAPCT,0,LBXRBCSI,0,LBXHGB,0,LBXHCT,0,LBXMCVSI,0,LBXMCHSI,0,LBXMC,0,LBXRDW,0,LBXPLTSI,0,LBXMPSI,0\n", + "FROM CSVREAD('../data/nhanes2005-2006/combined-selected-variables.csv');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Matrix building\n", + "\n", + "* Each variable is compared with the limits of the NHANES ranges, and the deviation _d columns receive the difference." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "-- Computing LBXIRN\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXIRN_d =\n", + "(SELECT RRa.min-SD.LBXIRN\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXIRN' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXIRN=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXIRN=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXIRN>RRa.max)\n", + "WHERE SD.LBXIRN_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXIRN' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXIRN>RRb.max);\n", + "\n", + "-- Computing LBXTIB\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXTIB_d =\n", + "(SELECT RRa.min-SD.LBXTIB\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXTIB' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXTIB=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXTIB=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXTIB>RRa.max)\n", + "WHERE SD.LBXTIB_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXTIB' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXTIB>RRb.max);\n", + "\n", + "-- Computing LBXSLDSI\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXSLDSI_d =\n", 
+ "(SELECT RRa.min-SD.LBXSLDSI\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXSLDSI' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXSLDSI=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXSLDSI=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXSLDSI>RRa.max)\n", + "WHERE SD.LBXSLDSI_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXSLDSI' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXSLDSI>RRb.max);\n", + "\n", + "-- Computing LBXWBCSI\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXWBCSI_d =\n", + "(SELECT RRa.min-SD.LBXWBCSI\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXWBCSI' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXWBCSI=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXWBCSI=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXWBCSI>RRa.max)\n", + "WHERE SD.LBXWBCSI_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXWBCSI' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXWBCSI>RRb.max);\n", + "\n", + "-- Computing LBXLYPCT\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXLYPCT_d =\n", + "(SELECT RRa.min-SD.LBXLYPCT\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXLYPCT' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXLYPCT=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXLYPCT=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXLYPCT>RRa.max)\n", + "WHERE SD.LBXLYPCT_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXLYPCT' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXLYPCT>RRb.max);\n", + "\n", + "-- Computing LBXMOPCT\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXMOPCT_d =\n", + 
"(SELECT RRa.min-SD.LBXMOPCT\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXMOPCT' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXMOPCT=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXMOPCT=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXMOPCT>RRa.max)\n", + "WHERE SD.LBXMOPCT_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXMOPCT' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXMOPCT>RRb.max);\n", + "\n", + "-- Computing LBXNEPCT\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXNEPCT_d =\n", + "(SELECT RRa.min-SD.LBXNEPCT\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXNEPCT' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXNEPCT=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXNEPCT=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXNEPCT>RRa.max)\n", + "WHERE SD.LBXNEPCT_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXNEPCT' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXNEPCT>RRb.max);\n", + "\n", + "-- Computing LBXEOPCT\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXEOPCT_d =\n", + "(SELECT RRa.min-SD.LBXEOPCT\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXEOPCT' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXEOPCT=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXEOPCT=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXEOPCT>RRa.max)\n", + "WHERE SD.LBXEOPCT_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXEOPCT' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXEOPCT>RRb.max);\n", + "\n", + "-- Computing LBXBAPCT\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXBAPCT_d =\n", + 
"(SELECT RRa.min-SD.LBXBAPCT\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXBAPCT' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXBAPCT=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXBAPCT=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXBAPCT>RRa.max)\n", + "WHERE SD.LBXBAPCT_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXBAPCT' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXBAPCT>RRb.max);\n", + "\n", + "-- Computing LBXRBCSI\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXRBCSI_d =\n", + "(SELECT RRa.min-SD.LBXRBCSI\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXRBCSI' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXRBCSI=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXRBCSI=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXRBCSI>RRa.max)\n", + "WHERE SD.LBXRBCSI_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXRBCSI' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXRBCSI>RRb.max);\n", + "\n", + "-- Computing LBXHGB\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXHGB_d =\n", + "(SELECT RRa.min-SD.LBXHGB\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXHGB' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXHGB=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXHGB=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXHGB>RRa.max)\n", + "WHERE SD.LBXHGB_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXHGB' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXHGB>RRb.max);\n", + "\n", + "-- Computing LBXHCT\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXHCT_d =\n", + "(SELECT 
RRa.min-SD.LBXHCT\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXHCT' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXHCT < RRa.min)\n", + "WHERE SD.LBXHCT_d = 0 AND\n", + "EXISTS (SELECT RRb.min\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXHCT' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXHCT < RRb.min);\n", + "\n", + "-- LBXHCT above the reference maximum (deviation = value - max)\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXHCT_d =\n", + "(SELECT SD.LBXHCT-RRa.max\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXHCT' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXHCT > RRa.max)\n", + "WHERE SD.LBXHCT_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXHCT' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXHCT > RRb.max);\n", + "\n", + "-- Computing LBXMCVSI\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXMCVSI_d =\n", + "(SELECT RRa.min-SD.LBXMCVSI\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXMCVSI' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXMCVSI < RRa.min)\n", + "WHERE SD.LBXMCVSI_d = 0 AND\n", + "EXISTS (SELECT RRb.min\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXMCVSI' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXMCVSI < RRb.min);\n", + "\n", + "-- LBXMCVSI above the reference maximum (deviation = value - max)\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXMCVSI_d =\n", + "(SELECT SD.LBXMCVSI-RRa.max\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXMCVSI' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXMCVSI > RRa.max)\n", + "WHERE SD.LBXMCVSI_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXMCVSI' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXMCVSI > RRb.max);\n", + "\n", + "-- Computing LBXMCHSI\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXMCHSI_d =\n", + "(SELECT RRa.min-SD.LBXMCHSI\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXMCHSI' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXMCHSI < RRa.min)\n", + "WHERE SD.LBXMCHSI_d = 0 AND\n", + "EXISTS (SELECT RRb.min\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXMCHSI' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXMCHSI < RRb.min);\n", + "\n", + "-- LBXMCHSI above the reference maximum (deviation = value - max)\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXMCHSI_d =\n", + "(SELECT SD.LBXMCHSI-RRa.max\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXMCHSI' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXMCHSI > RRa.max)\n", + "WHERE SD.LBXMCHSI_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXMCHSI' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXMCHSI > RRb.max);\n", + "\n", + "-- Computing LBXMC\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXMC_d =\n", + "(SELECT RRa.min-SD.LBXMC\n", +
" FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXMC' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXMC < RRa.min)\n", + "WHERE SD.LBXMC_d = 0 AND\n", + "EXISTS (SELECT RRb.min\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXMC' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXMC < RRb.min);\n", + "\n", + "-- LBXMC above the reference maximum (deviation = value - max)\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXMC_d =\n", + "(SELECT SD.LBXMC-RRa.max\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXMC' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXMC > RRa.max)\n", + "WHERE SD.LBXMC_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXMC' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXMC > RRb.max);\n", + "\n", + "-- Computing LBXRDW\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXRDW_d =\n", + "(SELECT RRa.min-SD.LBXRDW\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXRDW' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXRDW < RRa.min)\n", + "WHERE SD.LBXRDW_d = 0 AND\n", + "EXISTS (SELECT RRb.min\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXRDW' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXRDW < RRb.min);\n", + "\n", + "-- LBXRDW above the reference maximum (deviation = value - max)\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXRDW_d =\n", + "(SELECT SD.LBXRDW-RRa.max\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXRDW' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXRDW > RRa.max)\n", + "WHERE SD.LBXRDW_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXRDW' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXRDW > RRb.max);\n", + "\n", + "-- Computing LBXPLTSI\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXPLTSI_d =\n", + "(SELECT RRa.min-SD.LBXPLTSI\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXPLTSI' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXPLTSI < RRa.min)\n", + "WHERE SD.LBXPLTSI_d = 0 AND\n", + "EXISTS (SELECT RRb.min\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXPLTSI' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXPLTSI < RRb.min);\n", + "\n", + "-- LBXPLTSI above the reference maximum (deviation = value - max)\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXPLTSI_d =\n", + "(SELECT SD.LBXPLTSI-RRa.max\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXPLTSI' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXPLTSI > RRa.max)\n", + "WHERE SD.LBXPLTSI_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXPLTSI' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXPLTSI > RRb.max);\n", + "\n", + "-- Computing LBXMPSI\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXMPSI_d =\n", + "(SELECT RRa.min-SD.LBXMPSI\n", + " FROM ReferenceRanges RRa\n", + " WHERE
RRa.variable='LBXMPSI' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXMPSI < RRa.min)\n", + "WHERE SD.LBXMPSI_d = 0 AND\n", + "EXISTS (SELECT RRb.min\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXMPSI' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXMPSI < RRb.min);\n", + "\n", + "-- LBXMPSI above the reference maximum (deviation = value - max)\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXMPSI_d =\n", + "(SELECT SD.LBXMPSI-RRa.max\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXMPSI' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXMPSI > RRa.max)\n", + "WHERE SD.LBXMPSI_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXMPSI' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXMPSI > RRb.max);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Final Matrix\n", + "\n", + "* Building of the final matrix that has the identification of the person and a deviation _d matrix.\n", + "* Only persons with at least one abnormal value (a deviation greater than 0) are kept." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4544c15f-b121-4f5f-a2af-4ef4b4f5c680", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b6308290-90d1-4d2d-9d5c-8fc611d21ead", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ed4d8b24-c20a-4bec-953a-7bd4bb02e237", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP VIEW IF EXISTS CorrelationMatrixWeighted;\n", + "\n", + "-- EXISTS keeps one row per person (a plain join with ReferenceRanges would repeat the row for every matching range)\n", + "CREATE VIEW CorrelationMatrixWeighted AS\n", + "SELECT DISTINCT SD.SEQN, SD.LBXIRN_d, SD.LBXTIB_d, SD.LBXSLDSI_d, SD.LBXWBCSI_d, SD.LBXLYPCT_d, SD.LBXMOPCT_d, SD.LBXNEPCT_d, SD.LBXEOPCT_d, SD.LBXBAPCT_d, SD.LBXRBCSI_d, SD.LBXHGB_d, SD.LBXHCT_d, SD.LBXMCVSI_d, SD.LBXMCHSI_d, SD.LBXMC_d, SD.LBXRDW_d, SD.LBXPLTSI_d,
SD.LBXMPSI_d\n", + "FROM SurveyD SD\n", + "WHERE EXISTS (SELECT 1\n", + " FROM ReferenceRanges RR\n", + " WHERE SD.RIAGENDR=RR.gender AND SD.RIDAGEYR>=RR.ageStart AND SD.RIDAGEYR<=RR.ageEnd) AND\n", + "(LBXIRN_d>0 OR LBXTIB_d>0 OR LBXSLDSI_d>0 OR LBXWBCSI_d>0 OR LBXLYPCT_d>0 OR LBXMOPCT_d>0 OR LBXNEPCT_d>0 OR LBXEOPCT_d>0 OR LBXBAPCT_d>0 OR LBXRBCSI_d>0 OR LBXHGB_d>0 OR LBXHCT_d>0 OR LBXMCVSI_d>0 OR LBXMCHSI_d>0 OR LBXMC_d>0 OR LBXRDW_d>0 OR LBXPLTSI_d>0 OR LBXMPSI_d>0);\n", + "\n", + "SELECT COUNT(*) FROM CorrelationMatrixWeighted;\n", + "SELECT * FROM CorrelationMatrixWeighted;\n", + "\n", + "CALL CSVWRITE('../data/nhanes2005-2006/correlation-matrix-weighted.csv', 'SELECT * FROM CorrelationMatrixWeighted');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Variables Network\n", + "\n", + "* In this network each node is a variable and each edge indicates that two variables are correlated with a certain intensity." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## List of the variable pairs\n", + "\n", + "* This view prepares the list of correlation pairs initialized with 0." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "DROP VIEW IF EXISTS VariablesCorrelation;\n", + "DROP VIEW IF EXISTS Variables;\n", + "\n", + "CREATE VIEW Variables AS\n", + "SELECT DISTINCT variable AS var1 FROM ReferenceRanges;\n", + "\n", + "CREATE VIEW VariablesCorrelation AS\n", + "SELECT DISTINCT Variables.var1, ReferenceRanges.variable AS var2, 0 AS correlation\n", + "FROM Variables, ReferenceRanges\n", + "WHERE Variables.var1 < ReferenceRanges.variable;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Survey verticalization\n", + "\n", + "* Persons and variables that are originally presented as a matrix are transformed into a list: person, variable and value. This list will facilitate the subsequent analyses."
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "47970" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DROP VIEW IF EXISTS VerticalSurvey;\n", + "\n", + "-- One branch per variable. The join repeats a person once per reference-range row of the variable and UNION removes those duplicates\n", + "CREATE VIEW VerticalSurvey AS\n", + " SELECT SU.SEQN, RR.variable, SU.LBXIRN AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXIRN'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXTIB AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXTIB'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXSLDSI AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXSLDSI'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXWBCSI AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXWBCSI'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXLYPCT AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXLYPCT'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXMOPCT AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXMOPCT'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXNEPCT AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXNEPCT'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXEOPCT AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXEOPCT'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXBAPCT AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXBAPCT'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXRBCSI AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXRBCSI'\n", + "UNION\n", +
" SELECT SU.SEQN, RR.variable, SU.LBXHGB AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXHGB'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXHCT AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXHCT'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXMCVSI AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXMCVSI'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXMCHSI AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXMCHSI'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXMC AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXMC'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXRDW AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXRDW'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXPLTSI AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXPLTSI'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXMPSI AS value, 0 AS deviation\n", + " FROM Survey SU\n", + " INNER JOIN ReferenceRanges RR ON RR.variable = 'LBXMPSI';\n", + "\n", + "-- Materialize the view as a table so that the deviation column can be updated\n", + "DROP TABLE IF EXISTS VerticalSurveyD;\n", + "CREATE TABLE VerticalSurveyD (\n", + " SEQN VARCHAR(8),\n", + " variable VARCHAR(8),\n", + " value DECIMAL(7,1),\n", + " deviation DECIMAL(7,1),\n", + " PRIMARY KEY(SEQN, variable)\n", + ") AS SELECT SEQN, variable, value, deviation FROM VerticalSurvey;\n", + "\n", + "CALL CSVWRITE('../data/nhanes2005-2006/vertical-survey.csv', 'SELECT SEQN,variable,value FROM VerticalSurvey');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Computation of the deviation value for the variables that are out of the limits" + ] + }, + { + "cell_type": "code", + "execution_count":
14, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7efc0ee4-bd7f-41e8-b395-f56bb208de51", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "-- Values below the reference minimum (deviation = min - value)\n", + "UPDATE VerticalSurveyD VS\n", + "SET VS.deviation =\n", + "(SELECT RRa.min-VS.value\n", + " FROM Survey SUa, ReferenceRanges RRa\n", + " WHERE RRa.variable=VS.variable AND SUa.SEQN=VS.SEQN AND SUa.RIAGENDR=RRa.gender AND SUa.RIDAGEYR>=RRa.ageStart AND SUa.RIDAGEYR<=RRa.ageEnd AND VS.value < RRa.min)\n", + "WHERE EXISTS\n", + "(SELECT RRb.min\n", + " FROM Survey SUb, ReferenceRanges RRb\n", + " WHERE RRb.variable=VS.variable AND SUb.SEQN=VS.SEQN AND SUb.RIAGENDR=RRb.gender AND SUb.RIDAGEYR>=RRb.ageStart AND SUb.RIDAGEYR<=RRb.ageEnd AND VS.value < RRb.min);\n", + "\n", + "-- Values above the reference maximum (deviation = value - max)\n", + "UPDATE VerticalSurveyD VS\n", + "SET VS.deviation =\n", + "(SELECT VS.value-RRa.max\n", + " FROM Survey SUa, ReferenceRanges RRa\n", + " WHERE RRa.variable=VS.variable AND SUa.SEQN=VS.SEQN AND SUa.RIAGENDR=RRa.gender AND SUa.RIDAGEYR>=RRa.ageStart AND SUa.RIDAGEYR<=RRa.ageEnd AND VS.value > RRa.max)\n", + "WHERE EXISTS\n", + "(SELECT RRb.max\n", + " FROM Survey SUb, ReferenceRanges RRb\n", + " WHERE RRb.variable=VS.variable AND SUb.SEQN=VS.SEQN AND SUb.RIAGENDR=RRb.gender AND SUb.RIDAGEYR>=RRb.ageStart AND SUb.RIDAGEYR<=RRb.ageEnd AND VS.value > RRb.max);\n", + " \n", + "SELECT * FROM VerticalSurveyD WHERE deviation > 0;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Number of abnormalities by variable" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "23c31d6a-0a98-4665-9f30-501edc3d378e", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f4c81b00-3661-4e46-b0ad-9b0b9ebffab5", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "SELECT variable as id, COUNT(*) as weight FROM VerticalSurveyD VS WHERE deviation>0 GROUP BY variable;\n", + "\n", + "CALL CSVWRITE('../data/nhanes2005-2006/variable-number-deviation.csv', 'SELECT variable as id,
COUNT(*) as weight FROM VerticalSurveyD VS WHERE deviation>0 GROUP BY variable');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Variable correlation by person\n", + "\n", + "* Pairwise correlation of variables that co-occur in the same person." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d0fa10df-6e6c-4083-a85e-d720d5b3e0a8", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP VIEW IF EXISTS VariablePairCorrelation;\n", + "DROP VIEW IF EXISTS IndividualVariablesCorrelation;\n", + "\n", + "CREATE VIEW IndividualVariablesCorrelation AS\n", + "SELECT VS1.SEQN, CM.profile, VC.var1, VC.var2\n", + "FROM VariablesCorrelation VC, VerticalSurveyD VS1, VerticalSurveyD VS2, CorrelationMatrix CM\n", + "WHERE VS1.SEQN = VS2.SEQN AND VS1.variable = VC.var1 AND VS2.variable = VC.var2 AND \n", + " VS1.deviation > 0 AND VS2.deviation > 0 AND\n", + " VS1.SEQN = CM.SEQN;\n", + "\n", + "SELECT * FROM IndividualVariablesCorrelation\n", + "ORDER BY var1, var2;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Correlation of variable pairs\n", + "\n", + "* Aggregation of correlations of variable pairs.\n", + "* Preparation to build a network where variables are vertices and edges connect variables that surpassed the limits together for the same person."
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f935fa7b-f714-4ccd-9a41-cae28f83d45c", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b334ab70-5641-49ef-95a4-cc8a3d5f312b", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP VIEW IF EXISTS VariablePairCorrelation;\n", + "\n", + "-- Edge list of the variable network with one row per (var1, var2) pair and the number of matching rows as weight\n", + "CREATE VIEW VariablePairCorrelation AS\n", + "SELECT IVC.var1 AS source, IVC.var2 AS TARGET, COUNT(*) AS weight\n", + "FROM IndividualVariablesCorrelation IVC\n", + "GROUP BY IVC.var1, IVC.var2;\n", + "\n", + "SELECT * FROM VariablePairCorrelation;\n", + "\n", + "CALL CSVWRITE('../data/nhanes2005-2006/variable-pair-correlation.csv', 'SELECT * FROM VariablePairCorrelation');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Variable Network\n", + "\n", + "* Variable network produced in Gephi from the CSV created in the previous step.\n", + "\n", + "![variable network](variable-network-full.png \"Variable Network\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Profile Network\n", + "\n", + "* Returning to the profile network." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Correlation analysis of profile pairs\n", + "\n", + "* Each time that two persons share a variable out of the ranges, an edge is created between them.\n", + "* The edges are grouped by profile pairs. For each pair, the number of individuals/variables that co-occur is computed."
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "496235" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DROP VIEW IF EXISTS ProfileCorrelation;\n", + "\n", + "-- One row per pair of persons and shared variable where both persons deviate\n", + "CREATE VIEW ProfileCorrelation AS\n", + " SELECT CM1.SEQN AS SEQN1, CM1.profile AS profile1, CM2.SEQN AS SEQN2, CM2.profile AS profile2\n", + " FROM VerticalSurveyD VS1\n", + " INNER JOIN VerticalSurveyD VS2 ON VS2.variable = VS1.variable AND VS1.SEQN < VS2.SEQN\n", + " INNER JOIN CorrelationMatrix CM1 ON CM1.SEQN = VS1.SEQN\n", + " INNER JOIN CorrelationMatrix CM2 ON CM2.SEQN = VS2.SEQN\n", + " WHERE VS1.deviation > 0 AND VS2.deviation > 0;\n", + " \n", + "-- Writing the pairs of similar profiles for the network\n", + "CALL CSVWRITE('../data/nhanes2005-2006/profile-pair-correlation.csv', 'SELECT * FROM ProfileCorrelation');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Profile Network\n", + "\n", + "![profile network](profile-network-full.png \"Profile Network\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "27d0fc97-ec39-491f-a2d4-3437ab3ba39a", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "75da7429-7e7f-4406-a92f-5a3f268313bb", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "30a5bb4c-400a-4d92-84b3-c2dbf4db947b", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { +
"model_id": "27bc090b-3327-4cc0-8fe7-8097ecd47f43", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP VIEW IF EXISTS FBFullCorrelationWeight;\n", + "DROP VIEW IF EXISTS FishbonesCorrelationWeight;\n", + "DROP VIEW IF EXISTS FishbonesCorrelation;\n", + "\n", + "-- filtering WBC, HgB, HCT, PLT (LBXWBCSI, LBXHGB, LBXHCT, LBXPLTSI respectively)\n", + "\n", + "CREATE VIEW FishbonesCorrelation AS\n", + " SELECT CM1.SEQN AS SEQN1, CM1.profile AS profile1, CM2.SEQN AS SEQN2, CM2.profile AS profile2\n", + " FROM VerticalSurveyD VS1\n", + " INNER JOIN VerticalSurveyD VS2 ON VS2.variable = VS1.variable AND VS1.SEQN < VS2.SEQN\n", + " INNER JOIN CorrelationMatrix CM1 ON CM1.SEQN = VS1.SEQN\n", + " INNER JOIN CorrelationMatrix CM2 ON CM2.SEQN = VS2.SEQN\n", + " WHERE VS1.variable IN ('LBXWBCSI', 'LBXHGB', 'LBXHCT', 'LBXPLTSI') AND\n", + " VS1.deviation > 0 AND VS2.deviation > 0;\n", + "\n", + "-- Number of FishbonesCorrelation rows per (profile1, profile2) pair\n", + "CREATE VIEW FBFullCorrelationWeight AS\n", + " SELECT FC.profile1 AS source, FC.profile2 AS target, COUNT(*) AS weight\n", + " FROM FishbonesCorrelation FC\n", + " GROUP BY FC.profile1, FC.profile2;\n", + "\n", + "SELECT COUNT(*) FROM FBFullCorrelationWeight;\n", + "SELECT * FROM FBFullCorrelationWeight;\n", + "\n", + "CALL CSVWRITE('../data/nhanes2005-2006/fishbones-correlation.csv', 'SELECT * FROM FishbonesCorrelation');\n", + "CALL CSVWRITE('../data/nhanes2005-2006/fbfull-pair-correlation.csv', 'SELECT * FROM FBFullCorrelationWeight');" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "SQL", + "language": "SQL", + "name": "sql" + }, + "language_info": { + "codemirror_mode": "sql", + "file_extension": ".sql", + "mimetype": "", + "name": "SQL", + "nbconverter_exporter": "", + "version": "" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": false, + "skip_h1_title": false, + "title_cell": "Table of
Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": false, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}