-
Notifications
You must be signed in to change notification settings - Fork 0
/
demo.py
90 lines (61 loc) · 2.07 KB
/
demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Load the dataset: the first line of the file is the header row and the
# fields are comma-separated.
# NOTE(review): 'unicode_escape' is an unusual codec for a CSV file; it is
# kept unchanged here -- confirm it matches the file's real encoding.
df = pd.read_csv("dulieuxettuyendaihoc.csv", header=0, delimiter=",", encoding='unicode_escape')

# --- quick inspection ------------------------------------------------------
# These bare expressions only show output in an interactive session; when the
# file runs as a script their results are computed and thrown away.
df.head(10)
df.tail(10)
df.columns
# datatypes
df.dtypes
# data size
df.shape
# RangeIndex(start=0, stop=100, step=1)
df.index
# includes RangeIndex, dtype and non-null count of columns, memory usage
# df.info()

# --- cleaning --------------------------------------------------------------
# dropna / drop_duplicates / fillna return a new DataFrame and leave `df`
# untouched, so each result must be assigned back to take effect.

# remove rows in which every cell is missing
df = df.dropna(how='all')
# remove duplicated rows, then renumber the index so that row labels and row
# positions agree again after rows were dropped
df = df.drop_duplicates().reset_index(drop=True)
# fill null cells with 'Unknown' in non-numeric columns only: writing a string
# into a numeric column would turn it into an object column and break
# arithmetic on it, so numeric columns keep their NaN values
df = df.fillna(
    {column: 'Unknown' for column in df.select_dtypes(exclude='number').columns}
)
# create new col: TBDH is the arithmetic mean of DH1, DH2 and DH3 (the result
# is NaN for a row where any of the three values is missing)
df['TBDH'] = (df['DH1'] + df['DH2'] + df['DH3']) / 3

# heatmap to visualize missing data: the frame of isna() flags is transposed,
# so each heatmap row is one column of `df` and each heatmap column is one
# record
fig = plt.figure(figsize=(10, 6))
sns.heatmap(df.isna().transpose(), cmap="YlGnBu", cbar_kws={'label': 'missing data'})
# savefig does not create missing directories, so make sure the output folder
# exists before writing the image
heatmap_path = Path('./generated/missingdata.png')
heatmap_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(heatmap_path, dpi=100)
# the figure is only needed for the file on disk; close it to free its memory
plt.close(fig)
# Selection and filtering examples.
# NOTE: apart from the `df_dh` assignment, every statement in this section is
# a bare expression -- the result is neither stored nor printed, so it has no
# visible effect when the file runs as a script.

# get specified cols by passing a list of column names, return a data frame
df_dh = df[['DH1', 'DH2', 'DH3', 'TBDH', 'KV', 'KT']]
# get specified rows by slicing with plain [], return a data frame;
# the end of the range is excluded (rows at positions 2, 3 and 4)
df[2:5]
# get the single row whose index label is 2
df.loc[2]
# get specified rows by label range, return a data frame;
# with .loc the end label is included (labels 2, 3 and 4)
df.loc[2:4]
# get specified rows (by label range) and columns (by name), return a data frame
df.loc[2:4, ['DH1', 'DH2', 'DH3', 'KT']]
# get the single row at integer position 2
df.iloc[2]
# get specified rows by position range, return a data frame;
# with .iloc the end position is excluded
df.iloc[2:4]
# rows from the start up to, but not including, position 5
df.iloc[:5]
# rows from position 95 to the end (the last five rows if the frame has the
# 100 rows suggested by the RangeIndex note earlier in the file)
df.iloc[95:]
# get specified rows and columns by position range, return a dataframe;
# both end positions are excluded (rows 2-4, columns 1-4)
df.iloc[2:5,1:5]
# sort data: DH1 ascending, ties broken by DH2 descending; sort_values is
# called without inplace, so `df` itself keeps its current order
df.sort_values(by = ['DH1', 'DH2'], ascending = [True, False])
# get data by conditions: a boolean mask keeps only the rows where it is True
df[df['KT'] == 'C']
# combine masks with & and wrap each comparison in parentheses
df[(df['DH1'] > 5) & (df['DH2'] > 5)]
# Label every row from its TBDH value: below 5 becomes 'FAIL', 5 and above
# becomes 'PASS'. The first assignment creates the KETQUA column; a row whose
# TBDH is NaN matches neither mask and is left without a label.
df.loc[df['TBDH'].lt(5), 'KETQUA'] = 'FAIL'
df.loc[df['TBDH'].ge(5), 'KETQUA'] = 'PASS'

# Per-column summaries: each key is a column, each list the statistics to
# compute for it (the result is not stored).
df_dh.agg({'DH1': ['sum', 'mean'], 'DH2': ['min', 'max'], 'DH3': ['mean', 'max']})

# Summaries per KV group: group size, then DH1 statistics (results are not
# stored).
df_dh.groupby('KV')['KV'].aggregate(['count'])
df_dh.groupby('KV')['DH1'].aggregate(['min', 'mean', 'std', 'max'])

# Pivot DH1-DH3 against the KETQUA label with min / mean / max, and print the
# resulting table.
a = df.pivot_table(values=['DH1', 'DH2', 'DH3'], columns='KETQUA', aggfunc=['min', 'mean', 'max'])
print(a)