#!/usr/bin/env python
# coding: utf-8
# # Import software libraries and load the dataset #
# In[1]:
import sys # Read system parameters.
import numpy as np # Work with multi-dimensional arrays and matrices.
import pandas as pd # Manipulate and analyze data.
import matplotlib as mpl # Create 2D charts.
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sb # Perform data visualization.
import sklearn # Perform data mining and analysis.
from sklearn import datasets
# Summarize software libraries used.
print('Libraries used in this project:')
print('- Python {}'.format(sys.version))
print('- NumPy {}'.format(np.__version__))
print('- pandas {}'.format(pd.__version__))
print('- Matplotlib {}'.format(mpl.__version__))
print('- Seaborn {}'.format(sb.__version__))
print('- scikit-learn {}\n'.format(sklearn.__version__))
# Load the dataset.
diabetes = datasets.load_diabetes()
print('Loaded {} records.'.format(len(diabetes.data)))
# # Get acquainted with the dataset
# In[2]:
import pandas as pd
# Convert array to pandas DataFrame.
df_diabetes = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)
# View data types and check for missing entries.
print(df_diabetes.dtypes)
print(df_diabetes.isnull().sum())
# View first 10 records.
print(df_diabetes.head(10))
# # Examine the distribution of various features
# In[3]:
# Use Matplotlib to plot distribution histograms for all features.
import matplotlib.pyplot as plt
# Define a color palette for histograms
colors = ['skyblue', 'lightgreen', 'lightcoral', 'orange', 'yellow', 'purple', 'pink', 'lightblue', 'lime', 'salmon', 'lightgray']
# Plot distribution histograms for all features
plt.figure(figsize=(15, 10))
for i, feature in enumerate(df_diabetes.columns):
    plt.subplot(3, 4, i + 1)
    plt.hist(df_diabetes[feature], bins=20, color=colors[i], edgecolor='black')
    plt.title(feature)
plt.tight_layout()
plt.show()
# # Examine a general summary of statistics
# In[4]:
# View summary statistics (mean, standard deviation, min, max, etc.) for each feature.
summary_statistics = df_diabetes.describe()
print(summary_statistics)
# # Look for columns that correlate with `target` (disease progression)
# In[5]:
import numpy as np
import pandas as pd
from sklearn import datasets
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
# Create a DataFrame from the dataset
data = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)
# Add the target column 'disease_progression' to the DataFrame
data['disease_progression'] = diabetes.target
# Assign the DataFrame to the variable df_diabetes
df_diabetes = data
# View the first few records of the DataFrame to ensure it's loaded correctly
print(df_diabetes.head())
# In[6]:
# View the correlation values for each feature compared to the label "disease_progression" (target column)
print('Correlations with Target (disease_progression)')
print(df_diabetes.corr()['disease_progression'].sort_values(ascending=False))
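# Optional sketch (assumption: not part of the original notebook): the full correlation
# matrix can also be shown as a heatmap for a quick view of pairwise relationships,
# using the Seaborn import (sb) and pyplot import (plt) from the first cell.
plt.figure(figsize=(10, 8))
sb.heatmap(df_diabetes.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation matrix of features and disease_progression')
plt.tight_layout()
plt.show()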
# In[7]:
# Check the column names in the DataFrame
print(df_diabetes.columns)
# # Split the label from the dataset
# In[8]:
import pandas as pd
from sklearn.model_selection import train_test_split
# 'disease_progression' is the dependent variable (value to be predicted), so it will be
# removed from the training data and put into a separate DataFrame for labels.
label_column = 'disease_progression'
training_columns = ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
# Split the training and test datasets and their labels.
X_train, X_test, y_train, y_test = train_test_split(df_diabetes[training_columns],
                                                    df_diabetes[label_column],
                                                    random_state=543)
# Compare the number of rows and columns in the original data to the training and test sets.
print(f'Original set: {df_diabetes.shape}')
print('------------------------------')
print(f'Training features: {X_train.shape}')
print(f'Test features: {X_test.shape}')
print(f'Training labels: {y_train.shape}')
print(f'Test labels: {y_test.shape}')
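# Note (assumption, not stated in the original): train_test_split reserves 25% of the
# rows for the test set by default; passing test_size explicitly (e.g. test_size=0.25)
# would make the split ratio visible in the code.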
# # Create a linear regression model
# In[9]:
# Construct a basic linear regression class object.
# Fit the training data to the regression object.
from sklearn.linear_model import LinearRegression
# Create a linear regression object. The target values are not mean-centered,
# so the intercept should be fitted (the default behavior).
linear_regression = LinearRegression()
# Fit the training data to the regression object
linear_regression.fit(X_train, y_train)
# Print the coefficients of the linear regression model
print("Coefficients:", linear_regression.coef_)
# # Compare the first ten predictions to actual values
# In[10]:
# Make predictions on the test set
y_pred = linear_regression.predict(X_test)
results_comparison = X_test.copy()
results_comparison['PredictedDiseaseProgression'] = np.round(y_pred, 2)
results_comparison['ActualDiseaseProgression'] = y_test.copy()
# View examples of the predictions compared to actual disease progression.
print(results_comparison.head(10))
# # Calculate the error between predicted and actual values
# In[11]:
# Print the mean squared error (MSE) for the model's predictions on the test set.
# Import mean squared error function
from sklearn.metrics import mean_squared_error as mse
# Calculate the mean squared error (MSE) for the model's predictions on the test set
cost = mse(y_test, y_pred)
print('Cost (mean squared error): {:.2f}'.format(cost))
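# Optional sketch (assumption: not part of the original notebook): report RMSE (in the
# same units as the target) and R^2 alongside the MSE for additional context.
from sklearn.metrics import r2_score
print('Root mean squared error: {:.2f}'.format(np.sqrt(cost)))
print('R^2 score: {:.3f}'.format(r2_score(y_test, y_pred)))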
# # Plot lines of best fit for four features
# In[12]:
# Use Seaborn to create subplots for the four features that have the strongest correlation with the label.
# Also plot a line of best fit for each feature.
import seaborn as sns
import matplotlib.pyplot as plt
# Select the four features most strongly correlated with the 'disease_progression' label.
strongest_corr_features = ['bmi', 's5', 'bp', 's4']
# Create subplots for the four features
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
# Define a color palette for scatter points
colors = ['blue', 'green', 'orange', 'purple']
# Plot lines of best fit for each feature
for i, feature in enumerate(strongest_corr_features):
    row = i // 2
    col = i % 2
    sns.regplot(x=X_test[feature], y=np.ravel(y_pred),
                scatter_kws={'color': colors[i]}, line_kws={'color': colors[i]},
                ax=axes[row, col])
    axes[row, col].set_xlabel(feature)
    axes[row, col].set_ylabel('Disease Progression')
plt.tight_layout()
plt.show()
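# Optional sketch (assumption: not part of the original notebook): a predicted-vs-actual
# scatter plot; points close to the dashed diagonal indicate accurate predictions.
plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_pred, alpha=0.7)
lims = [min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())]
plt.plot(lims, lims, color='red', linestyle='--')
plt.xlabel('Actual disease progression')
plt.ylabel('Predicted disease progression')
plt.tight_layout()
plt.show()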
# In[ ]: