-
Notifications
You must be signed in to change notification settings - Fork 0
/
random_forest2.py
115 lines (110 loc) · 5.04 KB
/
random_forest2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import pandas as pd
from sklearn.ensemble import RandomForestRegressor # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from feature_selector import FeatureSelector
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.offline as py
import plotly.graph_objs as go
from plotly.graph_objs import *
import os
def _ensure_parent_dir(path):
    """Create the directory that will hold *path* if it does not exist yet."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def _save_categorical_countplots(dataset, cat_var, dataset_file_name):
    """Save one count plot per low-cardinality categorical column.

    Only columns with at most 10 distinct values are plotted. Returns the
    list of saved file paths with spaces replaced by ``%20``.
    """
    cat_plots = []
    for col in cat_var:
        if len(set(dataset[col])) > 10:
            continue
        ax = sn.countplot(x=col, data=dataset)
        plt.title('Values of Attribute "' + col + '"', loc='center')
        plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
        plt.tight_layout()
        filename = ('static/Plots/Random Forest/' + dataset_file_name
                    + '/Categorical Variables/Attribute ' + col + '.png')
        _ensure_parent_dir(filename)
        cat_plots.append(filename.replace(" ", "%20"))
        plt.savefig(filename, bbox_inches='tight', dpi=600)
        # Reset matplotlib state so the next plot starts from a clean figure.
        plt.cla()
        plt.close()
    return cat_plots


def _save_confusion_matrix(dataset, target, y_test, y_pred, dataset_file_name):
    """Save a confusion-matrix heat map for a low-cardinality target.

    The model is a regressor, so its predictions are continuous. Each
    prediction is snapped to the nearest target value observed in *dataset*
    and both vectors are expressed as class indices, because
    ``confusion_matrix`` rejects continuous input.

    Returns the saved file path with spaces replaced by ``%20``.
    """
    classes = np.unique(dataset[target].to_numpy())
    predicted = np.asarray(y_pred, dtype=float)
    # Index of the closest observed class for every prediction.
    pred_idx = np.abs(predicted[:, None] - classes[None, :].astype(float)).argmin(axis=1)
    # y_test values are drawn from dataset[target], so the lookup is exact.
    true_idx = np.searchsorted(classes, np.asarray(y_test))
    matrix = metrics.confusion_matrix(true_idx, pred_idx,
                                      labels=np.arange(len(classes)))
    ax = sn.heatmap(matrix, annot=True, xticklabels=classes, yticklabels=classes)
    ax.set_title('Confusion Matrix of Trained Random Forest Classifier', loc='center')
    # scikit-learn puts true labels on rows (y axis) and predictions on
    # columns (x axis).
    ax.set_xlabel('Predicted Value')
    ax.set_ylabel('Actual target value')
    filename = ('static/Plots/Random Forest/' + dataset_file_name
                + '/Metrics/confusion_matrix.png')
    _ensure_parent_dir(filename)
    plt.savefig(filename, dpi=1000)
    plt.cla()
    plt.close()
    return filename.replace(" ", "%20")


def _save_numeric_boxplot(dataset, cat_var, target, filename):
    """Write an HTML box plot of every non-categorical, non-target column."""
    categorical = set(cat_var)
    numeric_cols = [c for c in dataset.columns
                    if c not in categorical and c != target]
    traces = [go.Box(y=dataset[c].values, name=c) for c in numeric_cols]
    layout = go.Layout(
        title=go.layout.Title(
            text='Boxplot of numerical variables',
        ),
        xaxis=go.layout.XAxis(
            title=go.layout.xaxis.Title(
                text='Attribute',
            )
        ),
        yaxis=go.layout.YAxis(
            title=go.layout.yaxis.Title(
                text='Values',
            )
        ))
    fig = go.Figure(data=traces, layout=layout)
    _ensure_parent_dir(filename)
    py.plot(fig, filename=filename, auto_open=False)


def random_forest2(dataset, target, dataset_file_name):
    """Train a random-forest regressor on *dataset* and save diagnostic plots.

    Steps:
      1. Fill missing numeric values with the column mean.
      2. Save count plots for categorical columns with at most 10 values.
      3. Label-encode every object-dtype column.
      4. Drop features flagged by ``FeatureSelector`` as missing (> 30 %),
         collinear (> 0.98) or single-valued.
      5. Fit a forest on a 70/30 split, keep features whose rounded
         importance exceeds 0.05, and refit on those features only.
      6. Compute metrics, save a confusion matrix when the target has at
         most 10 distinct values, and save a box plot of numeric columns.

    Args:
        dataset: pandas DataFrame holding the features and the target column.
        target: name of the column to predict.
        dataset_file_name: folder name used under
            ``static/Plots/Random Forest/`` for the generated plots.

    Returns:
        Tuple ``(mae, var_score, r2_score, num_plots, cat_plots, conf_mat)``:
        the mean absolute error rounded to 2 decimals, the explained
        variance score, the R² score, and three lists of plot paths with
        spaces replaced by ``%20``.
    """
    # numeric_only=True: DataFrame.mean() raises on object columns in
    # pandas >= 2.0, and only numeric columns can be mean-filled anyway.
    dataset = dataset.fillna(dataset.mean(numeric_only=True))

    cat_var = dataset.select_dtypes(include=['O']).columns.values
    cat_plots = _save_categorical_countplots(dataset, cat_var, dataset_file_name)

    from sklearn.preprocessing import LabelEncoder
    labelencoder = LabelEncoder()
    for col in cat_var:
        values = list(dataset[col])
        labelencoder.fit(values)
        dataset[col] = labelencoder.transform(values)

    # NOTE(review): FeatureSelector is handed the full frame (target
    # included) and the feature-name list as `labels`; confirm this matches
    # the library's intended API.
    feature_cols = [c for c in dataset.columns.values if c != target]
    fs = FeatureSelector(data=dataset, labels=feature_cols)
    fs.identify_missing(missing_threshold=0.3)
    missing_features = fs.ops['missing']
    feature_cols = [c for c in feature_cols if c not in missing_features]

    fs = FeatureSelector(data=dataset, labels=feature_cols)
    fs.identify_collinear(correlation_threshold=0.98)
    collinear_features = fs.ops['collinear']
    feature_cols = [c for c in feature_cols if c not in collinear_features]

    fs = FeatureSelector(data=dataset, labels=feature_cols)
    fs.identify_single_unique()
    single_unique = fs.ops['single_unique']
    feature_cols = [c for c in feature_cols if c not in single_unique]

    X = dataset[feature_cols]
    y = dataset[target]
    # 70% training and 30% test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)

    rf = RandomForestRegressor(n_estimators=1000, random_state=42)
    rf.fit(X_train, y_train)

    # (feature, rounded importance) pairs, most important first.
    feature_importances = sorted(
        ((name, round(score, 2))
         for name, score in zip(feature_cols, rf.feature_importances_)),
        key=lambda pair: pair[1],
        reverse=True,
    )
    fi = [name for name, score in feature_importances if score > 0.05]
    if not fi:
        # With many weak features none may clear the threshold; fall back
        # to all of them rather than fitting on an empty frame.
        fi = [name for name, _ in feature_importances]

    rf_most_important = RandomForestRegressor(n_estimators=1000, random_state=42)
    X_train = X_train[fi]
    X_test = X_test[fi]
    rf_most_important.fit(X_train, y_train)
    y_pred = rf_most_important.predict(X_test)

    errors = abs(y_pred - y_test)
    mae = round(np.mean(errors), 2)
    var_score = metrics.explained_variance_score(y_test, y_pred)
    r2 = metrics.r2_score(y_test, y_pred)

    conf_mat = []
    if len(set(dataset[target])) <= 10:
        conf_mat.append(
            _save_confusion_matrix(dataset, target, y_test, y_pred,
                                   dataset_file_name))

    boxplot_file = ('static/Plots/Random Forest/' + dataset_file_name
                    + '/Numerical Variables/Boxplot of numerical variables.html')
    _save_numeric_boxplot(dataset, cat_var, target, boxplot_file)
    num_plots = [boxplot_file.replace(" ", "%20")]

    return mae, var_score, r2, num_plots, cat_plots, conf_mat