-
Notifications
You must be signed in to change notification settings - Fork 1
/
Naive Bayes.py
119 lines (118 loc) · 4.45 KB
/
Naive Bayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import numpy as np
import pandas as pd
import math
def predictForTest(stats, test):
    """Score every row of *test* and save the predictions to a CSV file.

    Parameters
    ----------
    stats : list of list
        One flat list per attribute, laid out as
        ``[column_name, value_1, pos_1, neg_1, value_2, pos_2, neg_2, ...]``
        where ``pos_k`` / ``neg_k`` are the factors multiplied into the
        positive / negative score when the attribute equals ``value_k``.
        ``stats[0]``, ``stats[1]`` and ``stats[2]`` are used for the
        ``heart_issue``, ``insurance`` and ``stress`` columns respectively.
    test : pandas.DataFrame
        Rows to classify. Modified in place: the columns ``prediction``
        (the strings ``'True'`` / ``'False'``), ``pos`` and ``neg`` are
        added or overwritten.

    Side effects
    ------------
    Prints the first 15 rows of the result and writes the whole frame to
    ``ResultForTestFile<row count>Rows.csv`` in the current directory.
    Two inputs with the same number of rows therefore share a file name.
    """
    attributeColumns = ('heart_issue', 'insurance', 'stress')
    predictions = []
    posScores = []
    negScores = []
    for _, row in test.iterrows():
        posfactor, negfactor = 1.0, 1.0
        for position, column in enumerate(attributeColumns):
            attrStats = stats[position]
            observed = row[column]
            # Only indices 1, 4, 7, ... hold attribute values; looking at the
            # other slots could match the column name or a probability.
            for slot in range(1, len(attrStats) - 2, 3):
                if attrStats[slot] == observed:
                    posfactor *= attrStats[slot + 1]
                    negfactor *= attrStats[slot + 2]
                    break
            # A value missing from stats leaves both factors unchanged.
        posScores.append(posfactor)
        negScores.append(negfactor)
        predictions.append('True' if posfactor > negfactor else 'False')
    # The row handed out by iterrows() is a copy, so results are collected in
    # lists and assigned as whole columns to actually reach the DataFrame.
    test['prediction'] = predictions
    test['pos'] = posScores
    test['neg'] = negScores
    print(test.head(15))
    x = len(test.index)
    name = 'ResultForTestFile' + str(x) + 'Rows.csv'
    test.to_csv(name, sep=',', index=False)
def findCount(train, test):
    """Estimate per-attribute conditional probabilities and classify *test*.

    Every column of *train* except the last is treated as an attribute; the
    ``attack`` column supplies the class label (compared as the strings
    ``'True'`` / ``'False'``). For each attribute a flat list

        [column_name, value_1, pos_1, neg_1, value_2, pos_2, neg_2, ...]

    is built, where ``pos_k`` is the share of ``'True'`` rows in which the
    attribute equals ``value_k`` and ``neg_k`` the share of ``'False'`` rows.
    Values appear in the order they are first seen in *train*.

    The resulting table is printed and passed, together with *test*, to
    ``predictForTest``.

    Parameters
    ----------
    train : pandas.DataFrame
        Labelled rows; must contain an ``attack`` column.
    test : pandas.DataFrame
        Rows to classify; forwarded unchanged to ``predictForTest``.
    """
    allColumns = list(train)
    labels = [str(label) for label in train['attack'].tolist()]
    # Totals are looked up by label rather than by position in a
    # first-appearance list, so row order in the training data cannot swap
    # the positive and negative denominators.
    posTotal = labels.count('True')
    negTotal = labels.count('False')

    stats = []
    for column in allColumns[0:-1]:
        # value -> [rows labelled 'True', rows labelled 'False'];
        # a dict keeps first-seen order and never confuses a value with a
        # counter or with the column name.
        tallies = {}
        for value, label in zip(train[column].tolist(), labels):
            tally = tallies.setdefault(value, [0, 0])
            if label == 'True':
                tally[0] += 1
            elif label == 'False':
                tally[1] += 1
        rowVals = [column]
        for value, (posCount, negCount) in tallies.items():
            rowVals.append(value)
            # A class with no training rows has all-zero counts; report 0.0
            # instead of dividing by zero.
            rowVals.append(posCount / posTotal if posTotal else 0.0)
            rowVals.append(negCount / negTotal if negTotal else 0.0)
        stats.append(rowVals)

    for rowVals in stats:
        print(rowVals[0], ':', rowVals[1:])  # printing the values of the stats array
    predictForTest(stats, test)
# Load the labelled training rows and show a preview with the frame's shape.
train = pd.read_csv('train.csv')
print('---------------\nTraining data:\n---------------')
print(train.head())
print('\nShape of training dataset is:', train.shape)

# First testing dataset: preview it, then build the statistics from the
# training rows and classify it.
test1 = pd.read_csv('test.csv')
print('---------------\nTesting data 1:\n---------------')
print(test1.head())
print('\nShape of first testing dataset is:', test1.shape)
findCount(train, test1)

# Second testing dataset: same steps, statistics are recomputed from train.
test2 = pd.read_csv('test2.csv')
print('---------------\nTesting data 2:\n---------------')
print(test2.head())
print('\nShape of second testing dataset is:', test2.shape)
findCount(train, test2)