From 08f962ff51ae50c2e55e82d7e2aecaa3f364a58f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Santanch=C3=A8?= Date: Tue, 7 May 2019 16:13:26 -0300 Subject: [PATCH] NHANES SQL-Network examples --- .../sql-network-01-nhanes-fishbones.ipynb | 1201 +++++++++++++++++ ...> sql-network-02-nhanes-complete-p1.ipynb} | 0 2 files changed, 1201 insertions(+) create mode 100644 sql-network/sql-network-01-nhanes-fishbones.ipynb rename sql-network/{sql-network-01-nhanes-p1.ipynb => sql-network-02-nhanes-complete-p1.ipynb} (100%) diff --git a/sql-network/sql-network-01-nhanes-fishbones.ipynb b/sql-network/sql-network-01-nhanes-fishbones.ipynb new file mode 100644 index 0000000..6e92805 --- /dev/null +++ b/sql-network/sql-network-01-nhanes-fishbones.ipynb @@ -0,0 +1,1201 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%defaultDatasource jdbc:h2:mem:db" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Reference values for NHANES for the 2005-2006 survey\n", + "\n", + "* Extracted from data of the NHANES Web site (https://wwwn.cdc.gov/nchs/nhanes/).\n", + "\n", + "## Importing normal ranges of values indicated in the NHANES documentation\n", + "\n", + "The following query imports the file `reference-ranges.csv` that contains reference ranges from NHANES.\n", + "\n", + "* For each variable, the following is indicated:\n", + " - applicable gender\n", + " - age range (ageStart until ageEnd)\n", + "\n", + "* The range is indicated in the form of minimum and maximum values considered normal." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "307d4d69-e9e4-4737-af18-01b198a055d5", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d65a379c-5107-429f-a518-ad7c68687e0e", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP TABLE IF EXISTS ReferenceRanges;\n", + "CREATE TABLE ReferenceRanges (\n", + " variable VARCHAR(8),\n", + " gender VARCHAR(1),\n", + " ageStart SMALLINT,\n", + " ageEnd SMALLINT,\n", + " min DECIMAL(7,1),\n", + " max DECIMAL(7,1),\n", + " PRIMARY KEY(variable,gender,ageStart,ageEnd)\n", + ") AS SELECT\n", + " variable,gender,ageStart,ageEnd,min,max\n", + "FROM CSVREAD('../data/nhanes2005-2006/reference-ranges.csv');\n", + "\n", + "SELECT DISTINCT variable FROM ReferenceRanges;\n", + "SELECT * FROM ReferenceRanges;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Survey NHANES 2005-2006\n", + "\n", + "* Extracted from data of the NHANES Web site (https://wwwn.cdc.gov/nchs/nhanes/).\n", + "\n", + "## Importing data from the survey NHANES 2005-2006\n", + "\n", + "* The following query imports the file `combined-selected-variables.csv`, which contains a tuple for each individual with a selected set of variables that are used to diagnose anemia, as mentioned in Figure 1. Only the individuals with values for all fields were kept.\n", + "\n", + "![evaluation of anemia](evaluation-of-anemia.gif \"Figure 1\")\n", + "*Figure 1*: Evaluation of anemia in the adult according to the mean corpuscular volume. 
CBC: complete blood count; MCV: mean corpuscular volume; RBCs: red blood cells; Fe: iron; TIBC: total iron-binding capacity (transferrin); LDH: lactate dehydrogenase [6].\n", + "\n", + "* We selected four commonly used blood test variables, as shown in the following figure (known as Fishbones).\n", + "\n", + "![btc fishbones](Hematology_Fishbone_Schematic.png \"BTC Fishbones\")\n", + "By Major Small - Own work, CC BY 3.0, Link" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fdc26656-3244-4b9c-b117-917327a4053f", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0d45768d-be82-4192-a45a-90a31d24d001", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP TABLE IF EXISTS Survey;\n", + "\n", + "CREATE TABLE Survey (\n", + " SEQN VARCHAR(8),\n", + " RIAGENDR VARCHAR(1),\n", + " RIDAGEYR SMALLINT,\n", + " LBXWBCSI DECIMAL(7,1),\n", + " LBXHGB DECIMAL(7,1),\n", + " LBXHCT DECIMAL(7,1),\n", + " LBXPLTSI DECIMAL(7,1),\n", + " PRIMARY KEY(SEQN)\n", + ") AS SELECT\n", + " SEQN,RIAGENDR,RIDAGEYR,LBXWBCSI,LBXHGB,LBXHCT,LBXPLTSI\n", + "FROM CSVREAD('../data/nhanes2005-2006/combined-selected-variables.csv');\n", + "\n", + "SELECT COUNT(*) FROM Survey;\n", + "SELECT * FROM Survey;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Codes and description of NHANES variables\n", + "\n", + "* The codes and descriptions of the variables are stored in a table." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5e73d2ce-09a8-4b5f-b024-73ae9baf3f66", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP TABLE IF EXISTS VariableDescription;\n", + "CREATE TABLE VariableDescription (\n", + " variable VARCHAR(8),\n", + " acronym VARCHAR(8),\n", + " name VARCHAR(50),\n", + " unit VARCHAR(30),\n", + " file VARCHAR(20),\n", + " ranges VARCHAR(100),\n", + " PRIMARY KEY(variable)\n", + ") AS SELECT\n", + " variable,acronym,name,unit,file,ranges\n", + "FROM CSVREAD('../data/nhanes2005-2006/reference-ranges-variables.csv');\n", + "\n", + "SELECT * FROM VariableDescription;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Binary evaluation of individuals out of the normal ranges\n", + "\n", + "* For each variable, this table defines an extra binary column _b, which is initialized with 0 and will receive 1 if the variable is out of the NHANES range." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generation of the starting matrix initialized with 0" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "DROP TABLE IF EXISTS SurveyB;\n", + "CREATE TABLE SurveyB (\n", + " SEQN VARCHAR(8),\n", + " RIAGENDR VARCHAR(1),\n", + " RIDAGEYR SMALLINT,\n", + " LBXWBCSI DECIMAL(7,1),\n", + " LBXWBCSI_b SMALLINT DEFAULT 0,\n", + " LBXHGB DECIMAL(7,1),\n", + " LBXHGB_b SMALLINT DEFAULT 0,\n", + " LBXHCT DECIMAL(7,1),\n", + " LBXHCT_b SMALLINT DEFAULT 0,\n", + " LBXPLTSI DECIMAL(7,1),\n", + " LBXPLTSI_b SMALLINT DEFAULT 0,\n", + " PRIMARY KEY(SEQN)\n", + ") AS SELECT\n", + " SEQN,RIAGENDR,RIDAGEYR,LBXWBCSI,0,LBXHGB,0,LBXHCT,0,LBXPLTSI,0\n", + "FROM CSVREAD('../data/nhanes2005-2006/combined-selected-variables.csv');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Matrix building\n", + "\n", + "* Each variable is compared with the limits of the NHANES ranges, and the binary _b columns are updated." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "-- Computing LBXWBCSI: flag 1 when the value is below min or above max of the matching gender/age range\n", + "UPDATE SurveyB SB\n", + "SET SB.LBXWBCSI_b = 1\n", + "WHERE EXISTS\n", + "(SELECT RRb.min\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXWBCSI' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND (SB.LBXWBCSI < RRb.min OR SB.LBXWBCSI > RRb.max));\n", + "\n", + "-- Computing LBXHGB: flag 1 when the value is below min or above max of the matching gender/age range\n", + "UPDATE SurveyB SB\n", + "SET SB.LBXHGB_b = 1\n", + "WHERE EXISTS\n", + "(SELECT RRb.min\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXHGB' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND (SB.LBXHGB < RRb.min OR SB.LBXHGB > RRb.max));\n", + "\n", + "-- Computing LBXHCT: flag 1 when the value is below min or above max of the matching gender/age range\n", + "UPDATE SurveyB SB\n", + "SET SB.LBXHCT_b = 1\n", + "WHERE EXISTS\n", + "(SELECT RRb.min\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXHCT' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND (SB.LBXHCT < RRb.min OR SB.LBXHCT > RRb.max));\n", + "\n", + "-- Computing LBXPLTSI: flag 1 when the value is below min or above max of the matching gender/age range\n", + "UPDATE SurveyB SB\n", + "SET SB.LBXPLTSI_b = 1\n", + "WHERE EXISTS\n", + "(SELECT RRb.min\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXPLTSI' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND (SB.LBXPLTSI < RRb.min OR SB.LBXPLTSI > RRb.max));" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Final Matrix\n", + "\n", + "* Building of the final matrix that has the identification of the person, a binary _b matrix, and a profile built by the concatenation of lines in the binary matrix.\n", + "* The profile represents, in binary form, what is out of the ranges for each person.\n", + "* Only persons with at least one value out of range are kept.\n", + "\n", + "* 
The resulting matrix produces a CSV file." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "42156788-a55d-4602-bca5-066ac9ae5e8d", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f09ed8c2-0388-45ee-9883-6ab3f5c5ee8a", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "338795e9-ba06-424a-8de3-ee417aa37dee", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP VIEW IF EXISTS DeviationProfiles;\n", + "DROP VIEW IF EXISTS CorrelationMatrix;\n", + "\n", + "CREATE VIEW CorrelationMatrix AS\n", + "SELECT DISTINCT SB.SEQN, \n", + " CONCAT(SB.LBXWBCSI_b, SB.LBXHGB_b, SB.LBXHCT_b, SB.LBXPLTSI_b) AS profile,\n", + " SB.LBXWBCSI_b, SB.LBXHGB_b, SB.LBXHCT_b, SB.LBXPLTSI_b\n", + "FROM SurveyB SB, ReferenceRanges RR\n", + "WHERE SB.RIAGENDR=RR.gender AND SB.RIDAGEYR>=RR.ageStart AND SB.RIDAGEYR<=RR.ageEnd AND\n", + "(LBXWBCSI_b>0 OR LBXHGB_b>0 OR LBXHCT_b>0 OR LBXPLTSI_b>0);\n", + "\n", + "SELECT COUNT(*) FROM CorrelationMatrix;\n", + "SELECT * FROM CorrelationMatrix;\n", + "\n", + "CALL CSVWRITE('../data/nhanes2005-2006/correlation-matrix-fb.csv', 'SELECT * FROM CorrelationMatrix');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Profiles network\n", + "\n", + "* Persons are here related from their binary profiles, producing a profiles network." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Grouping profiles\n", + "\n", + "* Profiles are grouped according to their binary pattern, and people with the same profile are aggregated." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ec6217b2-2fc6-49f0-9852-0525d86c16ff", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2ea7309e-436c-4d42-ae6f-578c7d8c9616", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ce02aac4-c199-4b95-9eeb-a7d749ee0a14", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP VIEW IF EXISTS DeviationProfiles;\n", + "\n", + "CREATE VIEW DeviationProfiles AS\n", + "SELECT CM.profile, COUNT(*) AS individuals\n", + "FROM CorrelationMatrix CM\n", + "GROUP BY CM.profile;\n", + "\n", + "SELECT SUM(individuals) FROM DeviationProfiles;\n", + "SELECT * FROM DeviationProfiles;\n", + "\n", + "CALL CSVWRITE('../data/nhanes2005-2006/profile-deviation-fb.csv', 'SELECT DP.profile AS id, DP.individuals AS weight FROM DeviationProfiles DP');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Matrix with deviation intensity\n", + "\n", + "* This second matrix records which variables exceed the limits and by how much they exceed them." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generation of a new starting matrix initialized with 0" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6de00da1-b1dc-4f41-b3ea-b75d47963e99", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP TABLE IF EXISTS SurveyD;\n", + "CREATE TABLE SurveyD (\n", + " SEQN VARCHAR(8),\n", + " RIAGENDR VARCHAR(1),\n", + " RIDAGEYR SMALLINT,\n", + " LBXWBCSI DECIMAL(7,1),\n", + " LBXWBCSI_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXHGB DECIMAL(7,1),\n", + " LBXHGB_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXHCT DECIMAL(7,1),\n", + " LBXHCT_d DECIMAL(7,1) DEFAULT 0,\n", + " LBXPLTSI DECIMAL(7,1),\n", + " LBXPLTSI_d DECIMAL(7,1) DEFAULT 0,\n", + " PRIMARY KEY(SEQN)\n", + ") AS SELECT\n", + " SEQN,RIAGENDR,RIDAGEYR,LBXWBCSI,0,LBXHGB,0,LBXHCT,0,LBXPLTSI,0\n", + "FROM CSVREAD('../data/nhanes2005-2006/combined-selected-variables.csv');\n", + "\n", + "SELECT * FROM SurveyD;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Matrix building\n", + "\n", + "* Each variable is compared with the limits of the NHANES ranges, and the deviation _d columns receive the difference." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "-- Computing LBXWBCSI\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXWBCSI_d =\n", + "(SELECT RRa.min-SD.LBXWBCSI\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXWBCSI' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXWBCSI=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXWBCSI=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXWBCSI>RRa.max)\n", + "WHERE SD.LBXWBCSI_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXWBCSI' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXWBCSI>RRb.max);\n", + "\n", + "-- Computing LBXHGB\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXHGB_d =\n", + "(SELECT RRa.min-SD.LBXHGB\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXHGB' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXHGB=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXHGB=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXHGB>RRa.max)\n", + "WHERE SD.LBXHGB_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXHGB' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXHGB>RRb.max);\n", + "\n", + "-- Computing LBXHCT\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXHCT_d =\n", + "(SELECT RRa.min-SD.LBXHCT\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXHCT' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXHCT=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXHCT=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXHCT>RRa.max)\n", + "WHERE SD.LBXHCT_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXHCT' AND SD.RIAGENDR=RRb.gender AND 
SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXHCT>RRb.max);\n", + "\n", + "-- Computing LBXPLTSI\n", + "UPDATE SurveyD SD\n", + "SET SD.LBXPLTSI_d =\n", + "(SELECT RRa.min-SD.LBXPLTSI\n", + " FROM ReferenceRanges RRa\n", + " WHERE RRa.variable='LBXPLTSI' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXPLTSI=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXPLTSI=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND SD.LBXPLTSI>RRa.max)\n", + "WHERE SD.LBXPLTSI_d = 0 AND\n", + "EXISTS (SELECT RRb.max\n", + " FROM ReferenceRanges RRb\n", + " WHERE RRb.variable='LBXPLTSI' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND SD.LBXPLTSI>RRb.max);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Final Matrix\n", + "\n", + "* Building of the final matrix that has the identification of the person and a deviation _d matrix.\n", + "* Only anormal persons are filtered." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8e5a3e74-b103-4410-a39f-601a4eed0d8c", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c399904e-c81d-4b26-83ab-188595410cb3", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a3b26122-5d8e-4dfd-aa5d-c3562dc333bb", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP VIEW IF EXISTS CorrelationMatrixWeighted;\n", + "\n", + "CREATE VIEW CorrelationMatrixWeighted AS\n", + "SELECT DISTINCT 
SD.SEQN, \n", + " SD.LBXWBCSI_d, SD.LBXHGB_d, SD.LBXHCT_d, SD.LBXPLTSI_d\n", + "FROM SurveyD SD, ReferenceRanges RR\n", + "WHERE SD.RIAGENDR=RR.gender AND SD.RIDAGEYR>=RR.ageStart AND SD.RIDAGEYR<=RR.ageEnd AND\n", + "(LBXWBCSI_d>0 OR LBXHGB_d>0 OR LBXHCT_d>0 OR LBXPLTSI_d>0);\n", + "\n", + "SELECT COUNT(*) FROM CorrelationMatrixWeighted;\n", + "SELECT * FROM CorrelationMatrixWeighted;\n", + "\n", + "CALL CSVWRITE('../data/nhanes2005-2006/correlation-matrix-weighted-fb.csv', 'SELECT * FROM CorrelationMatrixWeighted');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Variables Network\n", + "\n", + "* In this network each node is a variable and each edge indicates that two variables are correlated with a certain intensity." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## List of the variable pairs\n", + "\n", + "* This view prepares the list of correlation pairs initialized with 0." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "DROP VIEW IF EXISTS VariablesCorrelation;\n", + "DROP VIEW IF EXISTS Variables;\n", + "\n", + "CREATE VIEW Variables AS\n", + "SELECT DISTINCT variable AS var1 FROM ReferenceRanges;\n", + "\n", + "CREATE VIEW VariablesCorrelation AS\n", + "SELECT DISTINCT Variables.var1, ReferenceRanges.variable AS var2, 0 AS correlation\n", + "FROM Variables, ReferenceRanges\n", + "WHERE Variables.var1 < ReferenceRanges.variable;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Survey verticalization\n", + "\n", + "* Persons and variables that are originally presented as a matrix are transformed into a list: person, variable and value. This list will facilitate the subsequent analyses." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "10660" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DROP VIEW IF EXISTS VerticalSurvey;\n", + "\n", + "CREATE VIEW VerticalSurvey AS\n", + " SELECT SU.SEQN, RR.variable, SU.LBXWBCSI AS value, 0 AS deviation\n", + " FROM Survey SU, ReferenceRanges RR\n", + " WHERE RR.variable='LBXWBCSI'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXHGB AS value, 0 AS deviation\n", + " FROM Survey SU, ReferenceRanges RR\n", + " WHERE RR.variable='LBXHGB'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXHCT AS value, 0 AS deviation\n", + " FROM Survey SU, ReferenceRanges RR\n", + " WHERE RR.variable='LBXHCT'\n", + "UNION\n", + " SELECT SU.SEQN, RR.variable, SU.LBXPLTSI AS value, 0 AS deviation\n", + " FROM Survey SU, ReferenceRanges RR\n", + " WHERE RR.variable='LBXPLTSI'\n", + ";\n", + "\n", + "-- transformation of the view in a table to enable updates\n", + "DROP TABLE IF EXISTS VerticalSurveyD;\n", + "CREATE TABLE VerticalSurveyD (\n", + " SEQN VARCHAR(8),\n", + " variable VARCHAR(8),\n", + " value DECIMAL(7,1),\n", + " deviation DECIMAL(7,1),\n", + " PRIMARY KEY(SEQN, variable)\n", + ") AS SELECT * FROM VerticalSurvey;\n", + " \n", + "CALL CSVWRITE('../data/nhanes2005-2006/vertical-survey-fb.csv', 'SELECT SEQN,variable,value FROM VerticalSurvey');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Computation of the deviation value for the variables that are out of the limits" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f4263a7e-862c-4953-ada5-ef51d953fb3e", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + 
"UPDATE VerticalSurveyD VS\n", + "SET VS.deviation =\n", + "(SELECT RRa.min-VS.value\n", + " FROM Survey SUa, ReferenceRanges RRa\n", + " WHERE RRa.variable=VS.variable AND SUa.SEQN=VS.SEQN AND SUa.RIAGENDR=RRa.gender AND SUa.RIDAGEYR>=RRa.ageStart AND SUa.RIDAGEYR<=RRa.ageEnd AND VS.value=RRb.ageStart AND SUb.RIDAGEYR<=RRb.ageEnd AND VS.value=RRa.ageStart AND SUa.RIDAGEYR<=RRa.ageEnd AND VS.value>RRa.max)\n", + "WHERE EXISTS\n", + "(SELECT RRb.max\n", + " FROM Survey SUb, ReferenceRanges RRb\n", + " WHERE RRb.variable=VS.variable AND SUb.SEQN=VS.SEQN AND SUb.RIAGENDR=RRb.gender AND SUb.RIDAGEYR>=RRb.ageStart AND SUb.RIDAGEYR<=RRb.ageEnd AND VS.value>RRb.max);\n", + " \n", + "SELECT * FROM VerticalSurveyD WHERE deviation > 0;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Average of the variables" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8e81a2d2-5160-43e9-a507-e0be33419d23", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f242550e-4623-432f-8286-f477cbbf1bd0", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "SELECT variable as id, COUNT(*) as weight FROM VerticalSurveyD VS WHERE deviation>0 GROUP BY variable;\n", + "\n", + "CALL CSVWRITE('../data/nhanes2005-2006/variable-number-deviation-fb.csv', 'SELECT variable as id, COUNT(*) as weight FROM VerticalSurveyD VS WHERE deviation>0 GROUP BY variable');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Variable correlation by person\n", + "\n", + "* Pairwise correlation of variables that cooccur in the same person." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "88963eae-7110-4248-9bad-41879b3c84bb", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP VIEW IF EXISTS VariablePairCorrelation;\n", + "DROP VIEW IF EXISTS IndividualVariablesCorrelation;\n", + "\n", + "CREATE VIEW IndividualVariablesCorrelation AS\n", + "SELECT VS1.SEQN, CM.profile, VC.var1, VC.var2\n", + "FROM VariablesCorrelation VC, VerticalSurveyD VS1, VerticalSurveyD VS2, CorrelationMatrix CM\n", + "WHERE VS1.SEQN = VS2.SEQN AND VS1.variable = VC.var1 AND VS2.variable = VC.var2 AND \n", + " VS1.deviation > 0 AND VS2.deviation > 0 AND\n", + " VS1.SEQN = CM.SEQN;\n", + "\n", + "SELECT * FROM IndividualVariablesCorrelation\n", + "ORDER BY var1, var2;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Correlation of variable pairs\n", + "\n", + "* Aggregation of correlations of variable pairs.\n", + "* Preparation to build a network where variables are vertices and edges connect variables that surpassed the limits together for the same person." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "479abd75-7c61-4eb7-9eec-42d29d59162b", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "57dfb168-32d5-403b-bdf5-15f2bb7dc510", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP VIEW IF EXISTS VariablePairCorrelation;\n", + "CREATE VIEW VariablePairCorrelation AS\n", + "SELECT var1 AS source, var2 as TARGET, COUNT(*) AS weight\n", + "FROM IndividualVariablesCorrelation\n", + "GROUP BY var1, var2;\n", + "\n", + "SELECT * FROM VariablePairCorrelation;\n", + "\n", + "CALL CSVWRITE('../data/nhanes2005-2006/variable-pair-correlation-fb.csv', 'SELECT * FROM VariablePairCorrelation');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exercise\n", + "\n", + "Import the previously created file `/data/nhanes2005-2006/variable-pair-correlation-fb.csv` into Gephi. Which analyses can you perform?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Profile Network\n", + "\n", + "* Returning to the profile network." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Correlation analysis of profile pairs\n", + "\n", + "* Each time that two persons share a variable out of the ranges, an edge is created between them.\n", + "* The edges are grouped by profile pairs. For each pair, the number of individuals/variables that co-occur is computed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "67421" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DROP VIEW IF EXISTS ProfileCorrelation;\n", + "\n", + "CREATE VIEW ProfileCorrelation AS\n", + " SELECT CM1.SEQN AS SEQN1, CM1.profile AS profile1, CM2.SEQN AS SEQN2, CM2.profile AS profile2\n", + " FROM VerticalSurveyD VS1, VerticalSurveyD VS2, CorrelationMatrix CM1, CorrelationMatrix CM2\n", + " WHERE VS1.SEQN < VS2.SEQN AND VS1.variable = VS2.variable AND\n", + " VS1.deviation > 0 AND VS2.deviation > 0 AND\n", + " VS1.SEQN = CM1.SEQN AND VS2.SEQN = CM2.SEQN;\n", + " \n", + "-- Gravação de pares de perfis com similaridade para rede\n", + "CALL CSVWRITE('../data/nhanes2005-2006/profile-pair-correlation-fb.csv', 'SELECT * FROM ProfileCorrelation');" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d4c04120-71d3-4ffe-a62c-4f77c8b945f6", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b720c0fb-72b3-459f-ab0f-5ade875a6e81", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP VIEW IF EXISTS ProfileCorrelationNWeight;\n", + "DROP VIEW IF EXISTS ProfileCorrelationUnique;\n", + "\n", + "CREATE VIEW ProfileCorrelationUnique AS\n", + " SELECT DISTINCT * FROM ProfileCorrelation;\n", + "\n", + "CREATE VIEW ProfileCorrelationNWeight AS\n", + " SELECT PC.profile1 AS source, PC.profile2 as target, COUNT(*) as weight\n", + " FROM ProfileCorrelationUnique PC\n", + " GROUP BY PC.profile1, PC.profile2;\n", + " \n", + "SELECT COUNT(*), SUM(weight) 
FROM ProfileCorrelationNWeight;\n", + "SELECT * FROM ProfileCorrelationNWeight;" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a4f62d8e-7180-41cf-b0bb-5caa1e7ba600", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2e065a2e-6ac7-4ac5-9b37-011b0fb19aa7", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "CREATE OR REPLACE VIEW ProfileCorrNWeight AS\n", + "SELECT source, target, weight w FROM ProfileCorrelationNWeight WHERE source < target\n", + "UNION ALL\n", + "SELECT target, source, weight w FROM ProfileCorrelationNWeight WHERE source > target;\n", + "-- Sum the weights of both directions of each profile pair (UNION ALL above keeps both rows even when their weights are equal)\n", + "CREATE OR REPLACE VIEW ProfileCorrFinalNWeight AS\n", + "SELECT source, target, SUM(w) AS weight\n", + "FROM ProfileCorrNWeight\n", + "GROUP BY source, target;\n", + "\n", + "SELECT * FROM ProfileCorrFinalNWeight;\n", + "\n", + "-- Writing the profile pairs with similarity for the network\n", + "CALL CSVWRITE('../data/nhanes2005-2006/profile-pair-correlation-number-fb.csv', 'SELECT * FROM ProfileCorrFinalNWeight');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exercise\n", + "\n", + "Import the previously created file `/data/nhanes2005-2006/profile-pair-correlation-number-fb.csv` into Gephi. Which analyses can you perform?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c71d7419-3dc4-4326-b062-311bda6d78cd", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4aec4ad7-e8e9-434d-9eda-50b0bd74b7d0", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e2ae3ecd-7136-44f8-be4a-524ef1bb3213", + "version_major": 2, + "version_minor": 0 + }, + "method": "display_data" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "CREATE OR REPLACE VIEW ProfileCorrelationSWeight AS\n", + " SELECT PC.profile1 AS source, PC.profile2 as target, COUNT(*) as weight\n", + " FROM ProfileCorrelation PC\n", + " GROUP BY PC.profile1, PC.profile2;\n", + " \n", + "SELECT COUNT(*), SUM(weight) FROM ProfileCorrelationSWeight;\n", + "SELECT * FROM ProfileCorrelationSWeight;\n", + "\n", + "-- Writing the profile pairs with similarity for the network\n", + "CALL CSVWRITE('../data/nhanes2005-2006/profile-pair-correlation-similarity-fb.csv', 'SELECT * FROM ProfileCorrelationSWeight');" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "SQL", + "language": "SQL", + "name": "sql" + }, + "language_info": { + "codemirror_mode": "sql", + "file_extension": ".sql", + "mimetype": "", + "name": "SQL", + "nbconverter_exporter": "", + "version": "" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": false, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": false, + "toc_window_display": false + } + }, + 
"nbformat": 4, + "nbformat_minor": 2 +} diff --git a/sql-network/sql-network-01-nhanes-p1.ipynb b/sql-network/sql-network-02-nhanes-complete-p1.ipynb similarity index 100% rename from sql-network/sql-network-01-nhanes-p1.ipynb rename to sql-network/sql-network-02-nhanes-complete-p1.ipynb