-
Notifications
You must be signed in to change notification settings - Fork 1
/
3. Intro to Plotting.py
395 lines (297 loc) · 10.8 KB
/
3. Intro to Plotting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
# Introduction to Plotting
import seaborn as sns
import matplotlib.pyplot as plt
# Load the "anscombe" example dataset through seaborn and print it in full.
anscombe = sns.load_dataset("anscombe")
print(anscombe)

# 3.2 Matplotlib
# Split the table into its four member datasets using the 'dataset' label.
dataset_1, dataset_2, dataset_3, dataset_4 = (
    anscombe[anscombe['dataset'] == label]
    for label in ('I', 'II', 'III', 'IV')
)

# With no format string, the points are joined by lines.
plt.plot(dataset_1['x'], dataset_1['y'])
plt.show()

# The 'o' format string draws the points as circle markers instead.
plt.plot(dataset_1['x'], dataset_1['y'], 'o')
plt.show()

# One figure holding a 2 x 2 grid of subplots; add_subplot takes
# (rows, columns, plot location) and the locations are numbered 1 to 4.
fig = plt.figure()
axes1, axes2, axes3, axes4 = (
    fig.add_subplot(2, 2, location) for location in range(1, 5)
)

# Draw each dataset on its own axes and give that subplot a small title.
panels = (
    (axes1, dataset_1, "dataset_1"),
    (axes2, dataset_2, "dataset_2"),
    (axes3, dataset_3, "dataset_3"),
    (axes4, dataset_4, "dataset_4"),
)
for panel_axes, panel_data, panel_title in panels:
    panel_axes.plot(panel_data['x'], panel_data['y'], 'o')
    panel_axes.set_title(panel_title)

# Title for the entire figure, then tighten the layout before showing it.
fig.suptitle("Anscombe Data")
fig.tight_layout()
plt.show()
# 3.3 Statistical Graphs Using Matplotlib
tips = sns.load_dataset("tips")
print(tips.head())
# info is a method: it has to be called to print the frame summary
tips.info()  # 244 rows, 7 columns

# 3.3.1 Univariate
# 3.3.1.1 Histograms
fig = plt.figure()
axes1 = fig.add_subplot(1, 1, 1)
axes1.hist(tips['total_bill'], bins=100)
axes1.set_title('Histogram of Total Bill')
# the binned variable runs along x; the number of rows per bin is on y
axes1.set_xlabel('Total Bill')
axes1.set_ylabel('Frequency')
# Figure.show() does not run a GUI event loop, so in a plain script the
# window may never appear; plt.show() blocks until the window is closed.
plt.show()
# 3.3.2 Bivariate
# 3.3.2.1 Scatterplot
# Total bill on x against tip on y, one point per row of tips.
scatter_plot = plt.figure()
axes1 = scatter_plot.add_subplot(1, 1, 1)
axes1.scatter(tips['total_bill'], tips['tip'])
axes1.set_title('Scatterplot of Total Bill vs Tip')
axes1.set_xlabel('Total Bill')
axes1.set_ylabel('Tip')
# Figure.show() does not run a GUI event loop, so in a plain script the
# window may never appear; plt.show() blocks until the window is closed.
plt.show()
# 3.3.2.2 Boxplot
# One box per sex: the first argument is a list holding one series of tips
# for each box, and labels names the boxes in the same order.
boxplot = plt.figure()
axes1 = boxplot.add_subplot(1, 1, 1)
axes1.boxplot(
    [tips[tips['sex'] == 'Female']['tip'],
     tips[tips['sex'] == 'Male']['tip']],
    labels=['Female', 'Male'])
axes1.set_xlabel('Sex')
axes1.set_ylabel('Tip')
axes1.set_title('Boxplot of Tips by Sex')
# Figure.show() does not run a GUI event loop, so in a plain script the
# window may never appear; plt.show() blocks until the window is closed.
plt.show()
# 3.3.3 Multivariate Data
# create a color based on sex
def recode_sex(sex: str) -> int:
    """Map a sex label to a numeric code usable as a scatter colour.

    Args:
        sex: The label to recode.

    Returns:
        0 when ``sex`` equals ``'Female'``, 1 for every other value.
    """
    return 0 if sex == 'Female' else 1
# Add a numeric colour column: recode_sex is applied to every 'sex' value.
tips['sex_color'] = tips['sex'].apply(recode_sex)

scatter_plot = plt.figure()
axes1 = scatter_plot.add_subplot(1, 1, 1)
axes1.scatter(
    x=tips['total_bill'],
    y=tips['tip'],
    # set the size of the dots based on party size;
    # multiply the value by 10 to make the points bigger and easier to see
    s=tips['size'] * 10,
    # set the color for the sex
    c=tips['sex_color'],
    # set the alpha value so points are more transparent
    alpha=0.5)
axes1.set_title('Total bill vs Tip Colored by Sex and Sized by Size')
axes1.set_xlabel('Total Bill')
axes1.set_ylabel('Tip')
# Figure.show() does not run a GUI event loop, so in a plain script the
# window may never appear; plt.show() blocks until the window is closed.
plt.show()
# 3.4 Seaborn
# 3.4.1 Univariate
# 3.4.1.1 Histograms
# Histograms are created using sns.distplot.
# NOTE(review): distplot is deprecated in seaborn >= 0.11 in favour of
# histplot/displot/kdeplot; it still runs with a warning — TODO migrate.
hist, ax = plt.subplots()
# use the distplot function from seaborn to create our plot
ax = sns.distplot(tips['total_bill'])
ax.set_title('Total Bill Histogram with Density Plot')
plt.show()  # shows both a histogram and a density plot

# Show a histogram only
hist, ax = plt.subplots()
ax = sns.distplot(tips['total_bill'], kde=False)
ax.set_title('Total Bill Histogram')
ax.set_xlabel('Total Bill')
# with the KDE switched off the bar heights are counts, not a density
ax.set_ylabel('Frequency')
plt.show()

# 3.4.1.2 Density Plot (Kernel Density Estimation)
den, ax = plt.subplots()
ax = sns.distplot(tips['total_bill'], hist=False)
ax.set_title('Total Bill Density')
ax.set_xlabel('Total Bill')
ax.set_ylabel('Unit Probability')
plt.show()

# 3.4.1.3 Rug Plot = A One-Dimensional Representation of a Variable's Distribution
hist_den_run, ax = plt.subplots()
ax = sns.distplot(tips['total_bill'], rug=True)
ax.set_title('Total Bill Histogram with Density and Rug Plot')
ax.set_xlabel('Total Bill')
plt.show()
# 3.4.1.4 Count Plot (Bar Plot)
# Counts how many rows of tips fall on each day.
count, ax = plt.subplots()
# name the column with x=: on seaborn >= 0.12 the first positional
# parameter is data, so a positional 'day' clashes with data=tips
ax = sns.countplot(x='day', data=tips)
ax.set_title('Count of days')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Frequency')
plt.show()
# 3.4.2 Bivariate Data
# Every plot in this section draws total bill against tip from tips.
bill_vs_tip = dict(x='total_bill', y='tip', data=tips)

# 3.4.2.1 Scatterplot
# regplot draws onto an axes, so create the axes first and hand it over.
scatter, ax = plt.subplots()
ax = sns.regplot(ax=ax, **bill_vs_tip)
ax.set(
    title='Scatterplot of Total Bill and Tip',
    xlabel='Total Bill',
    ylabel='Tip',
)
plt.show()

# lmplot creates its own figure, whereas regplot works on axes.
fig = sns.lmplot(**bill_vs_tip)
plt.show()

# jointplot: a scatterplot with a univariate plot on each axis.
join = sns.jointplot(**bill_vs_tip)
join.set_axis_labels('Total Bill', 'Tip')
join.fig.suptitle(
    'Joint Plot of Total Bill and Tip',
    fontsize=10,
    y=1.03,
)
plt.show()

# 3.4.2.2 Hexbin Plot
# Same joint layout, with kind="hex" selecting hexagonal bins.
hexabin = sns.jointplot(kind="hex", **bill_vs_tip)
hexabin.set_axis_labels('Total Bill', 'Tip')
hexabin.fig.suptitle(
    'Hexbin Joint Plot of Total Bill and Tip',
    fontsize=10,
    y=1.03,
)
plt.show()
# 3.4.2.3 2D Density Plot
kde, ax = plt.subplots()
# seaborn >= 0.12 rejects data2= (replaced by x=/y=) and deprecates shade=
# (replaced by fill=), so pass the frame plus column names instead
ax = sns.kdeplot(data=tips, x='total_bill', y='tip', fill=True, ax=ax)
ax.set_title('Kernel Density Plot of Total Bill and Tip')
ax.set_xlabel('Total Bill')
ax.set_ylabel('Tip')
plt.show()

# provides density plots on both axes
kde_joint = sns.jointplot(x='total_bill', y='tip', data=tips, kind='kde')
plt.show()
# 3.4.2.4 Bar plot
# The title and y label below describe the bars as the average total bill
# for each time of day.
bar, ax = plt.subplots()
ax = sns.barplot(x='time', y='total_bill', data=tips, ax=ax)
ax.set(
    title='Bar plot of average total bill for time of day',
    xlabel='Time of day',
    ylabel='Average total bill',
)
plt.show()

# 3.4.2.5 Boxplot
# A boxplot (Figure 3.24) packs several statistics into one glyph: minimum,
# first quartile, median, third quartile, maximum and, where applicable,
# outliers based on the interquartile range.
box, ax = plt.subplots()
ax = sns.boxplot(x='time', y='total_bill', data=tips, ax=ax)
ax.set(
    title='Boxplot of total bill by time of day',
    xlabel='Time of day',
    ylabel='Total Bill',
)
plt.show()

# 3.4.2.6 Violin Plot
# A violin plot (Figure 3.25) shows the same values as a boxplot but draws
# the "boxes" as a kernel density estimation.
violin, ax = plt.subplots()
ax = sns.violinplot(x='time', y='total_bill', data=tips, ax=ax)
ax.set(
    title='Violin plot of total bill by time of day',
    xlabel='Time of day',
    ylabel='Total Bill',
)
plt.show()
# 3.4.2.7 Pairwise relationships
# pairplot draws the whole grid of pairwise plots in a single call.
fig = sns.pairplot(tips)

# PairGrid lets each region of the grid use its own plotting function.
# Each map_* call hands the grid back, so the three calls can be chained.
# (plt.scatter could be used in place of sns.regplot.)
pair_grid = (
    sns.PairGrid(tips)
    .map_upper(sns.regplot)
    .map_lower(sns.kdeplot)
    .map_diag(sns.distplot, rug=True)
)
plt.show()
# 3.4.3 Multivariate Data
# As noted in Section 3.3.3, there is no de facto template for plotting
# multivariate data. Color, size and shape are the usual ways to fit more
# variables into one plot.

# 3.4.3.1 Colors
# hue='sex' colors by sex; split=True is passed alongside it.
violin, ax = plt.subplots()
ax = sns.violinplot(
    x='time',
    y='total_bill',
    hue='sex',
    split=True,
    data=tips,
    ax=ax,
)
plt.show()

# hue can be passed to various other plotting functions as well.
# Note that lmplot is used here rather than regplot.
scatter = sns.lmplot(
    x='total_bill',
    y='tip',
    hue='sex',
    fit_reg=False,
    data=tips,
)
plt.show()
# 3.4.3.2 Size and Shape
# Passing scatter_kws={'s': tips['size']*10} to lmplot together with hue
# raises an error (presumably the full-length size array no longer matches
# the per-hue subsets that get plotted), which stops the script here.
# relplot maps a column to marker size directly through size=.
scatter = sns.relplot(x='total_bill', y='tip',
                      data=tips,
                      hue='sex',
                      size='size')
plt.show()
# 3.4.3.3 Facets
# One panel per value of 'dataset' (col), wrapped after two columns.
anscombe_plot = sns.lmplot(
    x='x',
    y='y',
    data=anscombe,
    fit_reg=False,
    col='dataset',
    col_wrap=2,
)
plt.show()

# Build a FacetGrid by hand: one column per value of 'time', then map a
# histogram of the bill (with rug) onto every panel.
facet = sns.FacetGrid(tips, col='time')
facet.map(sns.distplot, 'total_bill', rug=True)
plt.show()

# Facets need not be univariate plots (Figure 3.35): here each day gets a
# scatterplot colored by sex, with a legend added to the grid.
facet = (
    sns.FacetGrid(tips, col='day', hue='sex')
    .map(plt.scatter, 'total_bill', 'tip')
    .add_legend()
)
plt.show()

# Two faceting variables at once: 'time' across the columns and 'smoker'
# down the rows, selected with the row parameter.
facet = sns.FacetGrid(tips, col='time', row='smoker', hue='sex')
facet.map(plt.scatter, 'total_bill', 'tip')
plt.show()

# If the hue elements should not overlap (fine for scatterplots, not for
# violin plots), use sns.catplot and pick the plot type with kind.
facet = sns.catplot(
    x='day',
    y='total_bill',
    hue='sex',
    data=tips,
    row='smoker',
    col='time',
    kind='violin',
)
plt.show()
# 3.5 Pandas Objects
# Every plot here comes from the .plot accessor on a Series or DataFrame
# and is drawn onto an axes created just before it.

# 3.5.1 Histograms from within Pandas
fig, ax = plt.subplots()
bill_series = tips['total_bill']
ax = bill_series.plot.hist(ax=ax)
plt.show()

# Two columns in one histogram; alpha adds transparency to the bars.
fig, ax = plt.subplots()
bill_and_tip = tips[['total_bill', 'tip']]
ax = bill_and_tip.plot.hist(ax=ax, alpha=0.5, bins=20)
plt.show()

# 3.5.2 Density Plot
fig, ax = plt.subplots()
tip_series = tips['tip']
ax = tip_series.plot.kde(ax=ax)
plt.show()

# 3.5.3 Scatterplot
fig, ax = plt.subplots()
ax = tips.plot.scatter(ax=ax, x='total_bill', y='tip')
plt.show()

# 3.5.4 Hexbin Plot
fig, ax = plt.subplots()
ax = tips.plot.hexbin(ax=ax, x='total_bill', y='tip')
plt.show()

# 3.5.5 Boxplot
fig, ax = plt.subplots()
ax = tips.plot.box(ax=ax)
plt.show()
# 3.6 Seaborn Themes and Styles
# The seaborn plots so far have all used the default plot style.
# sns.set_style changes it; the call is typically made once at the top of
# the code so that every later plot uses the same style.

# Arguments shared by the "before" and "after" violin plots.
split_violin = dict(x='time', y='total_bill', hue='sex', data=tips, split=True)

# Initial plot for comparison, drawn before the style is changed.
fig, ax = plt.subplots()
ax = sns.violinplot(ax=ax, **split_violin)
plt.show()

# Set the style, then draw the same plot again.
sns.set_style('whitegrid')
fig, ax = plt.subplots()
ax = sns.violinplot(ax=ax, **split_violin)
plt.show()
# the following code shows what all the styles look like:
fig = plt.figure()
seaborn_styles = ['darkgrid', 'whitegrid', 'dark', 'white', 'ticks']
# plot locations passed to add_subplot are 1-based, so count from 1
for plot_position, style in enumerate(seaborn_styles, start=1):
    # apply the style only while this panel is being created
    with sns.axes_style(style):
        ax = fig.add_subplot(2, 3, plot_position)
        violin = sns.violinplot(x='time', y='total_bill',
                                data=tips, ax=ax)
        # title each panel with its style name so the panels can be told apart
        violin.set_title(style)
fig.tight_layout()
plt.show()