diff --git a/sql-network/sql-network-01-nhanes-fishbones.ipynb b/sql-network/sql-network-01-nhanes-fishbones.ipynb
new file mode 100644
index 0000000..6e92805
--- /dev/null
+++ b/sql-network/sql-network-01-nhanes-fishbones.ipynb
@@ -0,0 +1,1201 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%defaultDatasource jdbc:h2:mem:db"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Reference values for NHANES for the 2005-2006 survey\n",
+ "\n",
+ "* Extracted from data of the NHANES Web site (https://wwwn.cdc.gov/nchs/nhanes/).\n",
+ "\n",
+ "## Importing normal ranges of values indicated in the NHANES documentation\n",
+ "\n",
+ "The following query imports the file `reference-ranges.csv` that contains reference ranges from NHANES.\n",
+ "\n",
+ "* For each variable it is indicated\n",
+ " - applicable gender\n",
+ " - age range (ageStart until ageEnd)\n",
+ "\n",
+ "* The range is indicated in the form of minimum and maximum values considered normal."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "307d4d69-e9e4-4737-af18-01b198a055d5",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d65a379c-5107-429f-a518-ad7c68687e0e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "-- Rebuild ReferenceRanges from the reference-ranges CSV file.\n",
+ "-- One row per (variable, gender, ageStart, ageEnd) holding the min/max bounds of the range.\n",
+ "DROP TABLE IF EXISTS ReferenceRanges;\n",
+ "CREATE TABLE ReferenceRanges (\n",
+ " variable VARCHAR(8),\n",
+ " gender VARCHAR(1),\n",
+ " ageStart SMALLINT,\n",
+ " ageEnd SMALLINT,\n",
+ " min DECIMAL(7,1),\n",
+ " max DECIMAL(7,1),\n",
+ " PRIMARY KEY(variable,gender,ageStart,ageEnd)\n",
+ ") AS SELECT\n",
+ " variable,gender,ageStart,ageEnd,min,max\n",
+ "FROM CSVREAD('../data/nhanes2005-2006/reference-ranges.csv');\n",
+ "\n",
+ "-- Inspect the load: the distinct variable codes, then every row.\n",
+ "SELECT DISTINCT variable FROM ReferenceRanges;\n",
+ "SELECT * FROM ReferenceRanges;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Survey NHANES 2005-2006\n",
+ "\n",
+ "* Extracted from data of the NHANES Web site (https://wwwn.cdc.gov/nchs/nhanes/).\n",
+ "\n",
+ "## Importing data from the survey NHANES 2005-2006\n",
+ "\n",
+ "* The following query imports the `combined-selected-variables.csv` file, which contains a tuple for each individual, with a selected set of variables that are used to diagnose anemia, as mentioned in Figure 1. Only the individuals with values for all fields were kept.\n",
+ "\n",
+ "![evaluation of anemia](evaluation-of-anemia.gif \"Figure 1\")\n",
+ "*Figure 1*: Evaluation of anemia in the adult according to the mean corpuscular volume. CBC: complete blood count; MCV: mean corpuscular volume; RBCs: red blood cells; Fe: iron; TIBC: total iron-binding capacity (transferrin); LDH: lactate dehydrogenase [6].\n",
+ "\n",
+ "* We selected four commonly used blood test variables, as shown in the following figure (known as Fishbones).\n",
+ "\n",
+ "![btc fishbones](Hematology_Fishbone_Schematic.png \"BTC Fishbones\")\n",
+ "By Major Small - Own work, CC BY 3.0, Link"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "fdc26656-3244-4b9c-b117-917327a4053f",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0d45768d-be82-4192-a45a-90a31d24d001",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "-- Rebuild Survey from the combined CSV: one row per person (SEQN) with gender,\n",
+ "-- age in years and the four selected blood-test variables.\n",
+ "DROP TABLE IF EXISTS Survey;\n",
+ "\n",
+ "CREATE TABLE Survey (\n",
+ " SEQN VARCHAR(8),\n",
+ " RIAGENDR VARCHAR(1),\n",
+ " RIDAGEYR SMALLINT,\n",
+ " LBXWBCSI DECIMAL(7,1),\n",
+ " LBXHGB DECIMAL(7,1),\n",
+ " LBXHCT DECIMAL(7,1),\n",
+ " LBXPLTSI DECIMAL(7,1),\n",
+ " PRIMARY KEY(SEQN)\n",
+ ") AS SELECT\n",
+ " SEQN,RIAGENDR,RIDAGEYR,LBXWBCSI,LBXHGB,LBXHCT,LBXPLTSI\n",
+ "FROM CSVREAD('../data/nhanes2005-2006/combined-selected-variables.csv');\n",
+ "\n",
+ "-- Inspect the load: number of persons, then every row.\n",
+ "SELECT COUNT(*) FROM Survey;\n",
+ "SELECT * FROM Survey;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Codes and description of NHANES variables\n",
+ "\n",
+ "* The codes and description of variables are stored in a table."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "5e73d2ce-09a8-4b5f-b024-73ae9baf3f66",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "-- Rebuild VariableDescription from its CSV file: one row per variable code.\n",
+ "DROP TABLE IF EXISTS VariableDescription;\n",
+ "CREATE TABLE VariableDescription (\n",
+ " variable VARCHAR(8),\n",
+ " acronym VARCHAR(8),\n",
+ " name VARCHAR(50),\n",
+ " unit VARCHAR(30),\n",
+ " file VARCHAR(20),\n",
+ " ranges VARCHAR(100),\n",
+ " PRIMARY KEY(variable)\n",
+ ") AS SELECT\n",
+ " variable,acronym,name,unit,file,ranges\n",
+ "FROM CSVREAD('../data/nhanes2005-2006/reference-ranges-variables.csv');\n",
+ "\n",
+ "-- Inspect the loaded descriptions.\n",
+ "SELECT * FROM VariableDescription;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Binary evaluation of individuals out of the normal ranges\n",
+ "\n",
+ "* For each variable, this table defines an extra binary column _b which is initialized with 0 and will receive 1 if the variable is out of the NHANES range."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Generation of the starting matrix initialized with 0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "-- Rebuild SurveyB: the survey columns plus one *_b flag column per variable.\n",
+ "-- The data is read again from the combined CSV file, not from the Survey table.\n",
+ "DROP TABLE IF EXISTS SurveyB;\n",
+ "CREATE TABLE SurveyB (\n",
+ " SEQN VARCHAR(8),\n",
+ " RIAGENDR VARCHAR(1),\n",
+ " RIDAGEYR SMALLINT,\n",
+ " LBXWBCSI DECIMAL(7,1),\n",
+ " LBXWBCSI_b SMALLINT DEFAULT 0,\n",
+ " LBXHGB DECIMAL(7,1),\n",
+ " LBXHGB_b SMALLINT DEFAULT 0,\n",
+ " LBXHCT DECIMAL(7,1),\n",
+ " LBXHCT_b SMALLINT DEFAULT 0,\n",
+ " LBXPLTSI DECIMAL(7,1),\n",
+ " LBXPLTSI_b SMALLINT DEFAULT 0,\n",
+ " PRIMARY KEY(SEQN)\n",
+ ") AS SELECT\n",
+ "-- the literal 0 after each variable fills the *_b flag column that follows it\n",
+ " SEQN,RIAGENDR,RIDAGEYR,LBXWBCSI,0,LBXHGB,0,LBXHCT,0,LBXPLTSI,0\n",
+ "FROM CSVREAD('../data/nhanes2005-2006/combined-selected-variables.csv');"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Matrix building\n",
+ "\n",
+ "* Each variable is compared with the limits of the NHANES ranges, and the binary _b columns are updated."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "-- Flag every measurement that falls outside its NHANES reference range.\n",
+ "-- A *_b flag is set to 1 when the value is below min or above max of the range\n",
+ "-- matching the person's gender and age; every other flag keeps its initial 0.\n",
+ "\n",
+ "-- Computing LBXWBCSI\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXWBCSI_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT RRb.min\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXWBCSI' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND\n",
+ " (SB.LBXWBCSI < RRb.min OR SB.LBXWBCSI > RRb.max));\n",
+ "\n",
+ "-- Computing LBXHGB\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXHGB_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT RRb.min\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXHGB' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND\n",
+ " (SB.LBXHGB < RRb.min OR SB.LBXHGB > RRb.max));\n",
+ "\n",
+ "-- Computing LBXHCT\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXHCT_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT RRb.min\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXHCT' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND\n",
+ " (SB.LBXHCT < RRb.min OR SB.LBXHCT > RRb.max));\n",
+ "\n",
+ "-- Computing LBXPLTSI\n",
+ "UPDATE SurveyB SB\n",
+ "SET SB.LBXPLTSI_b = 1\n",
+ "WHERE EXISTS\n",
+ "(SELECT RRb.min\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXPLTSI' AND SB.RIAGENDR=RRb.gender AND SB.RIDAGEYR>=RRb.ageStart AND SB.RIDAGEYR<=RRb.ageEnd AND\n",
+ " (SB.LBXPLTSI < RRb.min OR SB.LBXPLTSI > RRb.max));"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Final Matrix\n",
+ "\n",
+ "* Building of the final matrix that has the identification of the person, a binary _b matrix, and a profile built by the concatenation of lines in the binary matrix.\n",
+ "* The profile represents in binary form what is out of the ranges for each person.\n",
+ "* Only persons with at least one value out of the ranges are kept.\n",
+ "\n",
+ "* The resulting matrix produces a CSV file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "42156788-a55d-4602-bca5-066ac9ae5e8d",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "f09ed8c2-0388-45ee-9883-6ab3f5c5ee8a",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "338795e9-ba06-424a-8de3-ee417aa37dee",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "-- DeviationProfiles is built on CorrelationMatrix, so it is dropped first.\n",
+ "DROP VIEW IF EXISTS DeviationProfiles;\n",
+ "DROP VIEW IF EXISTS CorrelationMatrix;\n",
+ "\n",
+ "-- One row per person who has at least one flag set and at least one reference\n",
+ "-- range covering their gender and age. The profile is the four flags concatenated.\n",
+ "CREATE VIEW CorrelationMatrix AS\n",
+ "SELECT Flagged.SEQN,\n",
+ " CONCAT(Flagged.LBXWBCSI_b, Flagged.LBXHGB_b, Flagged.LBXHCT_b, Flagged.LBXPLTSI_b) AS profile,\n",
+ " Flagged.LBXWBCSI_b, Flagged.LBXHGB_b, Flagged.LBXHCT_b, Flagged.LBXPLTSI_b\n",
+ "FROM SurveyB Flagged\n",
+ "WHERE (Flagged.LBXWBCSI_b>0 OR Flagged.LBXHGB_b>0 OR Flagged.LBXHCT_b>0 OR Flagged.LBXPLTSI_b>0) AND\n",
+ " EXISTS (SELECT 1\n",
+ " FROM ReferenceRanges Bounds\n",
+ " WHERE Flagged.RIAGENDR=Bounds.gender AND Flagged.RIDAGEYR>=Bounds.ageStart AND Flagged.RIDAGEYR<=Bounds.ageEnd);\n",
+ "\n",
+ "-- Show the size and content of the matrix, then export it.\n",
+ "SELECT COUNT(*) FROM CorrelationMatrix;\n",
+ "SELECT * FROM CorrelationMatrix;\n",
+ "\n",
+ "CALL CSVWRITE('../data/nhanes2005-2006/correlation-matrix-fb.csv', 'SELECT * FROM CorrelationMatrix');"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Profiles network\n",
+ "\n",
+ "* Persons are here related from their binary profiles, producing a profiles network."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Grouping profiles\n",
+ "\n",
+ "* Profiles are grouped according to a binary pattern and people with the same profile are aggregated."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "ec6217b2-2fc6-49f0-9852-0525d86c16ff",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "2ea7309e-436c-4d42-ae6f-578c7d8c9616",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "ce02aac4-c199-4b95-9eeb-a7d749ee0a14",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "DROP VIEW IF EXISTS DeviationProfiles;\n",
+ "\n",
+ "-- Number of CorrelationMatrix rows (persons) sharing each binary profile.\n",
+ "CREATE VIEW DeviationProfiles AS\n",
+ "SELECT CM.profile, COUNT(*) AS individuals\n",
+ "FROM CorrelationMatrix CM\n",
+ "GROUP BY CM.profile;\n",
+ "\n",
+ "-- The sum over all profiles equals the number of rows in CorrelationMatrix.\n",
+ "SELECT SUM(individuals) FROM DeviationProfiles;\n",
+ "SELECT * FROM DeviationProfiles;\n",
+ "\n",
+ "-- Export with the columns renamed to id / weight.\n",
+ "CALL CSVWRITE('../data/nhanes2005-2006/profile-deviation-fb.csv', 'SELECT DP.profile AS id, DP.individuals AS weight FROM DeviationProfiles DP');"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Matrix with deviation intensity\n",
+ "\n",
+ "* This second matrix records which variables exceed the limits and by how much they exceed them."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Generation of a new base matrix initialized with 0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "6de00da1-b1dc-4f41-b3ea-b75d47963e99",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "-- Rebuild SurveyD: the survey columns plus one *_d deviation column per variable.\n",
+ "-- The *_d columns are DECIMAL(7,1), like the measurements and like\n",
+ "-- VerticalSurveyD.deviation, so a fractional deviation such as 0.4 is stored\n",
+ "-- as-is instead of being converted to a whole number.\n",
+ "DROP TABLE IF EXISTS SurveyD;\n",
+ "CREATE TABLE SurveyD (\n",
+ " SEQN VARCHAR(8),\n",
+ " RIAGENDR VARCHAR(1),\n",
+ " RIDAGEYR SMALLINT,\n",
+ " LBXWBCSI DECIMAL(7,1),\n",
+ " LBXWBCSI_d DECIMAL(7,1) DEFAULT 0,\n",
+ " LBXHGB DECIMAL(7,1),\n",
+ " LBXHGB_d DECIMAL(7,1) DEFAULT 0,\n",
+ " LBXHCT DECIMAL(7,1),\n",
+ " LBXHCT_d DECIMAL(7,1) DEFAULT 0,\n",
+ " LBXPLTSI DECIMAL(7,1),\n",
+ " LBXPLTSI_d DECIMAL(7,1) DEFAULT 0,\n",
+ " PRIMARY KEY(SEQN)\n",
+ ") AS SELECT\n",
+ "-- the literal 0 after each variable fills the *_d deviation column that follows it\n",
+ " SEQN,RIAGENDR,RIDAGEYR,LBXWBCSI,0,LBXHGB,0,LBXHCT,0,LBXPLTSI,0\n",
+ "FROM CSVREAD('../data/nhanes2005-2006/combined-selected-variables.csv');\n",
+ "\n",
+ "SELECT * FROM SurveyD;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Matrix building\n",
+ "\n",
+ "* Each variable is compared with the limits of the NHANES ranges, and the deviation _d columns receive the difference."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "-- Store in each *_d column how far the measurement lies outside its reference\n",
+ "-- range: (min - value) when it is below min, (value - max) when it is above max.\n",
+ "-- Both cases give a positive number; rows inside the range keep their initial 0.\n",
+ "-- The outer EXISTS restricts the update to out-of-range rows, so the scalar\n",
+ "-- subquery in SET always finds a matching range for the rows it is applied to.\n",
+ "\n",
+ "-- Computing LBXWBCSI\n",
+ "UPDATE SurveyD SD\n",
+ "SET SD.LBXWBCSI_d =\n",
+ "(SELECT CASE WHEN SD.LBXWBCSI < RRa.min THEN RRa.min-SD.LBXWBCSI ELSE SD.LBXWBCSI-RRa.max END\n",
+ " FROM ReferenceRanges RRa\n",
+ " WHERE RRa.variable='LBXWBCSI' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND\n",
+ " (SD.LBXWBCSI < RRa.min OR SD.LBXWBCSI > RRa.max))\n",
+ "WHERE SD.LBXWBCSI_d = 0 AND\n",
+ "EXISTS (SELECT RRb.max\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXWBCSI' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND\n",
+ " (SD.LBXWBCSI < RRb.min OR SD.LBXWBCSI > RRb.max));\n",
+ "\n",
+ "-- Computing LBXHGB\n",
+ "UPDATE SurveyD SD\n",
+ "SET SD.LBXHGB_d =\n",
+ "(SELECT CASE WHEN SD.LBXHGB < RRa.min THEN RRa.min-SD.LBXHGB ELSE SD.LBXHGB-RRa.max END\n",
+ " FROM ReferenceRanges RRa\n",
+ " WHERE RRa.variable='LBXHGB' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND\n",
+ " (SD.LBXHGB < RRa.min OR SD.LBXHGB > RRa.max))\n",
+ "WHERE SD.LBXHGB_d = 0 AND\n",
+ "EXISTS (SELECT RRb.max\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXHGB' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND\n",
+ " (SD.LBXHGB < RRb.min OR SD.LBXHGB > RRb.max));\n",
+ "\n",
+ "-- Computing LBXHCT\n",
+ "UPDATE SurveyD SD\n",
+ "SET SD.LBXHCT_d =\n",
+ "(SELECT CASE WHEN SD.LBXHCT < RRa.min THEN RRa.min-SD.LBXHCT ELSE SD.LBXHCT-RRa.max END\n",
+ " FROM ReferenceRanges RRa\n",
+ " WHERE RRa.variable='LBXHCT' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND\n",
+ " (SD.LBXHCT < RRa.min OR SD.LBXHCT > RRa.max))\n",
+ "WHERE SD.LBXHCT_d = 0 AND\n",
+ "EXISTS (SELECT RRb.max\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXHCT' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND\n",
+ " (SD.LBXHCT < RRb.min OR SD.LBXHCT > RRb.max));\n",
+ "\n",
+ "-- Computing LBXPLTSI\n",
+ "UPDATE SurveyD SD\n",
+ "SET SD.LBXPLTSI_d =\n",
+ "(SELECT CASE WHEN SD.LBXPLTSI < RRa.min THEN RRa.min-SD.LBXPLTSI ELSE SD.LBXPLTSI-RRa.max END\n",
+ " FROM ReferenceRanges RRa\n",
+ " WHERE RRa.variable='LBXPLTSI' AND SD.RIAGENDR=RRa.gender AND SD.RIDAGEYR>=RRa.ageStart AND SD.RIDAGEYR<=RRa.ageEnd AND\n",
+ " (SD.LBXPLTSI < RRa.min OR SD.LBXPLTSI > RRa.max))\n",
+ "WHERE SD.LBXPLTSI_d = 0 AND\n",
+ "EXISTS (SELECT RRb.max\n",
+ " FROM ReferenceRanges RRb\n",
+ " WHERE RRb.variable='LBXPLTSI' AND SD.RIAGENDR=RRb.gender AND SD.RIDAGEYR>=RRb.ageStart AND SD.RIDAGEYR<=RRb.ageEnd AND\n",
+ " (SD.LBXPLTSI < RRb.min OR SD.LBXPLTSI > RRb.max));"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Final Matrix\n",
+ "\n",
+ "* Building of the final matrix that has the identification of the person and a deviation _d matrix.\n",
+ "* Only persons with at least one value out of the ranges are kept."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8e5a3e74-b103-4410-a39f-601a4eed0d8c",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c399904e-c81d-4b26-83ab-188595410cb3",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a3b26122-5d8e-4dfd-aa5d-c3562dc333bb",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "DROP VIEW IF EXISTS CorrelationMatrixWeighted;\n",
+ "\n",
+ "-- One row per person who has at least one positive deviation and at least one\n",
+ "-- reference range covering their gender and age.\n",
+ "CREATE VIEW CorrelationMatrixWeighted AS\n",
+ "SELECT Deviated.SEQN,\n",
+ " Deviated.LBXWBCSI_d, Deviated.LBXHGB_d, Deviated.LBXHCT_d, Deviated.LBXPLTSI_d\n",
+ "FROM SurveyD Deviated\n",
+ "WHERE (Deviated.LBXWBCSI_d>0 OR Deviated.LBXHGB_d>0 OR Deviated.LBXHCT_d>0 OR Deviated.LBXPLTSI_d>0) AND\n",
+ " EXISTS (SELECT 1\n",
+ " FROM ReferenceRanges Bounds\n",
+ " WHERE Deviated.RIAGENDR=Bounds.gender AND Deviated.RIDAGEYR>=Bounds.ageStart AND Deviated.RIDAGEYR<=Bounds.ageEnd);\n",
+ "\n",
+ "-- Show the size and content of the matrix, then export it.\n",
+ "SELECT COUNT(*) FROM CorrelationMatrixWeighted;\n",
+ "SELECT * FROM CorrelationMatrixWeighted;\n",
+ "\n",
+ "CALL CSVWRITE('../data/nhanes2005-2006/correlation-matrix-weighted-fb.csv', 'SELECT * FROM CorrelationMatrixWeighted');"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Variables Network\n",
+ "\n",
+ "* In this network each node is a variable and each edge indicates that two variables are correlated in a certain intensity."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## List of the variable pairs\n",
+ "\n",
+ "* This view prepares the list of correlation pairs initialized with 0."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "-- VariablesCorrelation reads from Variables, so it is dropped first.\n",
+ "DROP VIEW IF EXISTS VariablesCorrelation;\n",
+ "DROP VIEW IF EXISTS Variables;\n",
+ "\n",
+ "-- Distinct variable codes present in ReferenceRanges.\n",
+ "CREATE VIEW Variables AS\n",
+ "SELECT DISTINCT RR.variable AS var1\n",
+ "FROM ReferenceRanges RR;\n",
+ "\n",
+ "-- Every pair of different variables, listed once (var1 < var2), correlation preset to 0.\n",
+ "CREATE VIEW VariablesCorrelation AS\n",
+ "SELECT DISTINCT V.var1, RR.variable AS var2, 0 AS correlation\n",
+ "FROM Variables V\n",
+ "INNER JOIN ReferenceRanges RR ON V.var1 < RR.variable;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Survey verticalization\n",
+ "\n",
+ "* Persons and variables that are originally presented as a matrix are transformed into a list: person, variable and value. This list will facilitate the subsequent analyses."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "10660"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "DROP VIEW IF EXISTS VerticalSurvey;\n",
+ "\n",
+ "-- One (SEQN, variable, value) row per person and variable, deviation preset to 0.\n",
+ "-- Each branch pairs Survey with every ReferenceRanges row of one variable;\n",
+ "-- UNION removes the duplicate rows this produces.\n",
+ "CREATE VIEW VerticalSurvey AS\n",
+ " SELECT SU.SEQN, RR.variable, SU.LBXWBCSI AS value, 0 AS deviation\n",
+ " FROM Survey SU, ReferenceRanges RR\n",
+ " WHERE RR.variable='LBXWBCSI'\n",
+ "UNION\n",
+ " SELECT SU.SEQN, RR.variable, SU.LBXHGB AS value, 0 AS deviation\n",
+ " FROM Survey SU, ReferenceRanges RR\n",
+ " WHERE RR.variable='LBXHGB'\n",
+ "UNION\n",
+ " SELECT SU.SEQN, RR.variable, SU.LBXHCT AS value, 0 AS deviation\n",
+ " FROM Survey SU, ReferenceRanges RR\n",
+ " WHERE RR.variable='LBXHCT'\n",
+ "UNION\n",
+ " SELECT SU.SEQN, RR.variable, SU.LBXPLTSI AS value, 0 AS deviation\n",
+ " FROM Survey SU, ReferenceRanges RR\n",
+ " WHERE RR.variable='LBXPLTSI'\n",
+ ";\n",
+ "\n",
+ "-- transformation of the view in a table to enable updates\n",
+ "DROP TABLE IF EXISTS VerticalSurveyD;\n",
+ "CREATE TABLE VerticalSurveyD (\n",
+ " SEQN VARCHAR(8),\n",
+ " variable VARCHAR(8),\n",
+ " value DECIMAL(7,1),\n",
+ " deviation DECIMAL(7,1),\n",
+ " PRIMARY KEY(SEQN, variable)\n",
+ ") AS SELECT * FROM VerticalSurvey;\n",
+ " \n",
+ "-- Export the vertical list without the deviation column.\n",
+ "CALL CSVWRITE('../data/nhanes2005-2006/vertical-survey-fb.csv', 'SELECT SEQN,variable,value FROM VerticalSurvey');"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Computation of the deviation value for the variables that are out of the limits"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "f4263a7e-862c-4953-ada5-ef51d953fb3e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "-- Store how far each value lies outside the reference range of its variable for\n",
+ "-- the person's gender and age: (min - value) below the range, (value - max) above it.\n",
+ "-- Both cases give a positive number; rows inside the range keep deviation 0.\n",
+ "-- The outer EXISTS restricts the update to out-of-range rows, so the scalar\n",
+ "-- subquery in SET always finds a matching range for the rows it is applied to.\n",
+ "UPDATE VerticalSurveyD VS\n",
+ "SET VS.deviation =\n",
+ "(SELECT CASE WHEN VS.value < RRa.min THEN RRa.min-VS.value ELSE VS.value-RRa.max END\n",
+ " FROM Survey SUa, ReferenceRanges RRa\n",
+ " WHERE RRa.variable=VS.variable AND SUa.SEQN=VS.SEQN AND SUa.RIAGENDR=RRa.gender AND SUa.RIDAGEYR>=RRa.ageStart AND SUa.RIDAGEYR<=RRa.ageEnd AND\n",
+ " (VS.value < RRa.min OR VS.value > RRa.max))\n",
+ "WHERE EXISTS\n",
+ "(SELECT RRb.max\n",
+ " FROM Survey SUb, ReferenceRanges RRb\n",
+ " WHERE RRb.variable=VS.variable AND SUb.SEQN=VS.SEQN AND SUb.RIAGENDR=RRb.gender AND SUb.RIDAGEYR>=RRb.ageStart AND SUb.RIDAGEYR<=RRb.ageEnd AND\n",
+ " (VS.value < RRb.min OR VS.value > RRb.max));\n",
+ " \n",
+ "SELECT * FROM VerticalSurveyD WHERE deviation > 0;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Number of deviations per variable"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8e81a2d2-5160-43e9-a507-e0be33419d23",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "f242550e-4623-432f-8286-f477cbbf1bd0",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "-- Per variable, the number of VerticalSurveyD rows with a positive deviation.\n",
+ "SELECT variable as id, COUNT(*) as weight FROM VerticalSurveyD VS WHERE deviation>0 GROUP BY variable;\n",
+ "\n",
+ "-- Export the same query as a CSV file with id / weight columns.\n",
+ "CALL CSVWRITE('../data/nhanes2005-2006/variable-number-deviation-fb.csv', 'SELECT variable as id, COUNT(*) as weight FROM VerticalSurveyD VS WHERE deviation>0 GROUP BY variable');"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Variable correlation by person\n",
+ "\n",
+ "* Pairwise correlation of variables that cooccur in the same person."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "88963eae-7110-4248-9bad-41879b3c84bb",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "DROP VIEW IF EXISTS VariablePairCorrelation;\n",
+ "DROP VIEW IF EXISTS IndividualVariablesCorrelation;\n",
+ "\n",
+ "-- One row per person and variable pair (var1, var2) in which both variables\n",
+ "-- have a positive deviation for that person; profile comes from CorrelationMatrix.\n",
+ "CREATE VIEW IndividualVariablesCorrelation AS\n",
+ "SELECT VS1.SEQN, CM.profile, VC.var1, VC.var2\n",
+ "FROM VariablesCorrelation VC, VerticalSurveyD VS1, VerticalSurveyD VS2, CorrelationMatrix CM\n",
+ "WHERE VS1.SEQN = VS2.SEQN AND VS1.variable = VC.var1 AND VS2.variable = VC.var2 AND \n",
+ " VS1.deviation > 0 AND VS2.deviation > 0 AND\n",
+ " VS1.SEQN = CM.SEQN;\n",
+ "\n",
+ "SELECT * FROM IndividualVariablesCorrelation\n",
+ "ORDER BY var1, var2;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Correlation of variable pairs\n",
+ "\n",
+ "* Aggregation of correlations of variable pairs.\n",
+ "* Preparation to build a network where variables are vertices and edges connect variables that surpassed the limits together for the same person."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "479abd75-7c61-4eb7-9eec-42d29d59162b",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "57dfb168-32d5-403b-bdf5-15f2bb7dc510",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "DROP VIEW IF EXISTS VariablePairCorrelation;\n",
+ "-- Edge list: per variable pair, the number of IndividualVariablesCorrelation rows.\n",
+ "CREATE VIEW VariablePairCorrelation AS\n",
+ "SELECT var1 AS source, var2 as TARGET, COUNT(*) AS weight\n",
+ "FROM IndividualVariablesCorrelation\n",
+ "GROUP BY var1, var2;\n",
+ "\n",
+ "SELECT * FROM VariablePairCorrelation;\n",
+ "\n",
+ "-- Export the edge list (source, target, weight).\n",
+ "CALL CSVWRITE('../data/nhanes2005-2006/variable-pair-correlation-fb.csv', 'SELECT * FROM VariablePairCorrelation');"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Exercise\n",
+ "\n",
+ "Import the previously created file `/data/nhanes2005-2006/variable-pair-correlation-fb.csv` into Gephi. Which analyses can you do?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Profile Network\n",
+ "\n",
+ "* Returning to the profile network."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Correlation analysis of profile pairs\n",
+ "\n",
+ "* Each time that two persons share a variable out of the ranges, an edge is created between them.\n",
+ "* The edges are grouped by profile pairs. For each pair is computed the number of individuals/variables that cooccur."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "67421"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "DROP VIEW IF EXISTS ProfileCorrelation;\n",
+ "\n",
+ "-- One row per pair of persons (SEQN1 < SEQN2) and per variable on which both\n",
+ "-- have a positive deviation; a pair sharing several variables appears several times.\n",
+ "CREATE VIEW ProfileCorrelation AS\n",
+ " SELECT CM1.SEQN AS SEQN1, CM1.profile AS profile1, CM2.SEQN AS SEQN2, CM2.profile AS profile2\n",
+ " FROM VerticalSurveyD VS1, VerticalSurveyD VS2, CorrelationMatrix CM1, CorrelationMatrix CM2\n",
+ " WHERE VS1.SEQN < VS2.SEQN AND VS1.variable = VS2.variable AND\n",
+ " VS1.deviation > 0 AND VS2.deviation > 0 AND\n",
+ " VS1.SEQN = CM1.SEQN AND VS2.SEQN = CM2.SEQN;\n",
+ " \n",
+ "-- Write the similar profile pairs to a CSV file for the network\n",
+ "CALL CSVWRITE('../data/nhanes2005-2006/profile-pair-correlation-fb.csv', 'SELECT * FROM ProfileCorrelation');"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d4c04120-71d3-4ffe-a62c-4f77c8b945f6",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b720c0fb-72b3-459f-ab0f-5ade875a6e81",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "-- ProfileCorrelationNWeight reads from ProfileCorrelationUnique, so it is dropped first.\n",
+ "DROP VIEW IF EXISTS ProfileCorrelationNWeight;\n",
+ "DROP VIEW IF EXISTS ProfileCorrelationUnique;\n",
+ "\n",
+ "-- DISTINCT keeps each (SEQN1, profile1, SEQN2, profile2) row of ProfileCorrelation once.\n",
+ "CREATE VIEW ProfileCorrelationUnique AS\n",
+ " SELECT DISTINCT * FROM ProfileCorrelation;\n",
+ "\n",
+ "-- Per (profile1, profile2), the number of distinct rows, i.e. of person pairs.\n",
+ "CREATE VIEW ProfileCorrelationNWeight AS\n",
+ " SELECT PC.profile1 AS source, PC.profile2 as target, COUNT(*) as weight\n",
+ " FROM ProfileCorrelationUnique PC\n",
+ " GROUP BY PC.profile1, PC.profile2;\n",
+ " \n",
+ "SELECT COUNT(*), SUM(weight) FROM ProfileCorrelationNWeight;\n",
+ "SELECT * FROM ProfileCorrelationNWeight;"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a4f62d8e-7180-41cf-b0bb-5caa1e7ba600",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "2e065a2e-6ac7-4ac5-9b37-011b0fb19aa7",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "-- ProfileCorrFinalNWeight reads from ProfileCorrNWeight, so it is dropped first.\n",
+ "-- Dropping both views makes the cell re-runnable.\n",
+ "DROP VIEW IF EXISTS ProfileCorrFinalNWeight;\n",
+ "DROP VIEW IF EXISTS ProfileCorrNWeight;\n",
+ "\n",
+ "-- Orient every profile pair so that source < target. UNION ALL is required:\n",
+ "-- when (A,B) and (B,A) carry the same weight, a plain UNION would merge the two\n",
+ "-- identical rows and the SUM below would count that weight only once.\n",
+ "-- NOTE(review): pairs with source = target match neither branch and are left out - confirm this is intended.\n",
+ "CREATE VIEW ProfileCorrNWeight AS\n",
+ "SELECT source, target, weight w FROM ProfileCorrelationNWeight WHERE source < target\n",
+ "UNION ALL\n",
+ "SELECT target, source, weight w FROM ProfileCorrelationNWeight WHERE source > target;\n",
+ "\n",
+ "-- Total weight of each unordered profile pair.\n",
+ "CREATE VIEW ProfileCorrFinalNWeight AS\n",
+ "SELECT source, target, SUM(w) AS weight\n",
+ "FROM ProfileCorrNWeight\n",
+ "GROUP BY source, target;\n",
+ "\n",
+ "SELECT * FROM ProfileCorrFinalNWeight;\n",
+ "\n",
+ "-- Write the similar profile pairs to a CSV file for the network\n",
+ "CALL CSVWRITE('../data/nhanes2005-2006/profile-pair-correlation-number-fb.csv', 'SELECT * FROM ProfileCorrFinalNWeight');"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Exercise\n",
+ "\n",
+ "Import the previously created file `/data/nhanes2005-2006/profile-pair-correlation-number-fb.csv` into Gephi. Which analyses can you do?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c71d7419-3dc4-4326-b062-311bda6d78cd",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "4aec4ad7-e8e9-434d-9eda-50b0bd74b7d0",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "e2ae3ecd-7136-44f8-be4a-524ef1bb3213",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "method": "display_data"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "-- Drop the view first so the cell can be re-run.\n",
+ "DROP VIEW IF EXISTS ProfileCorrelationSWeight;\n",
+ "\n",
+ "-- Per (profile1, profile2), the number of ProfileCorrelation rows. Rows are not\n",
+ "-- de-duplicated here, so a person pair counts once per shared deviating variable.\n",
+ "CREATE VIEW ProfileCorrelationSWeight AS\n",
+ " SELECT PC.profile1 AS source, PC.profile2 as target, COUNT(*) as weight\n",
+ " FROM ProfileCorrelation PC\n",
+ " GROUP BY PC.profile1, PC.profile2;\n",
+ " \n",
+ "SELECT COUNT(*), SUM(weight) FROM ProfileCorrelationSWeight;\n",
+ "SELECT * FROM ProfileCorrelationSWeight;\n",
+ "\n",
+ "-- Write the similar profile pairs to a CSV file for the network\n",
+ "CALL CSVWRITE('../data/nhanes2005-2006/profile-pair-correlation-similarity-fb.csv', 'SELECT * FROM ProfileCorrelationSWeight');"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "SQL",
+ "language": "SQL",
+ "name": "sql"
+ },
+ "language_info": {
+ "codemirror_mode": "sql",
+ "file_extension": ".sql",
+ "mimetype": "",
+ "name": "SQL",
+ "nbconverter_exporter": "",
+ "version": ""
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": false,
+ "sideBar": false,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": false,
+ "toc_window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/sql-network/sql-network-01-nhanes-p1.ipynb b/sql-network/sql-network-02-nhanes-complete-p1.ipynb
similarity index 100%
rename from sql-network/sql-network-01-nhanes-p1.ipynb
rename to sql-network/sql-network-02-nhanes-complete-p1.ipynb