-
Notifications
You must be signed in to change notification settings - Fork 0
/
0_Entropy.R
94 lines (75 loc) · 3.23 KB
/
0_Entropy.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# NOTE: the workspace is no longer wiped with rm(list = ls()). That call
# deletes the caller's objects but does not unload packages or reset options,
# so it never gave a truly clean session. Restart R for a clean run instead.
##################################################################################
# Most direct way to extract UCI data
# Every column is a single-letter code, so all columns are read as factors.
urldata <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
mushroom <- read.table(url(urldata),
  header = FALSE,
  sep = ",", colClasses = "factor"
)
# A more complicated way to extract names
urlnames <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names"
# scan() only uses the *type* of `what`; the names file is text, so ask for
# character data explicitly. Elements 70 to 102 of the tab-split file are kept
# (presumably the attribute list -- TODO confirm against the .names file).
mess <- scan(urlnames, what = character(), sep = "\t")[70:102]
library(qdapRegex)
# Pull out the text found between ". " and ":" in each kept element.
mess.names <- unlist(rm_between(mess, ". ", ":", extract = TRUE))
# Elements without such a match come back as NA; drop them.
mess.names <- mess.names[!is.na(mess.names)]
##################################################################################
# Old-fashioned way: Download data & Upload it
# The columns must be factors, exactly as in the URL import above: the entropy
# code further down relies on as.numeric() / nlevels() of factor columns, and
# since R 4.0.0 read.table() keeps strings as character unless told otherwise.
local.data <- "G:/My Drive/Teaching/Big_Data_Analytics/Data/agaricus-lepiota.data"
if (file.exists(local.data)) {
  mushroom <- read.table(local.data,
    header = FALSE, sep = ",", colClasses = "factor"
  )
} else {
  # The path is machine specific; keep the copy already read from the URL.
  message(
    "Local file not found, keeping the data read from the URL: ",
    local.data
  )
}
# Simple way to assign column names
colnames(mushroom) <- c(
  "edible",
  "capshape", "capsurface", "capcolor",
  "bruises", "odor",
  "gillattachment", "gillspacing", "gillsize", "gillcolor",
  "stalkshape", "stalkroot",
  "stalksurfaceabovering", "stalksurfacebelowring",
  "stalkcolorabovering", "stalkcolorbelowring",
  "veiltype", "veilcolor",
  "ringnumber", "ringtype",
  "sporeprintcolor", "population", "habitat"
)
####################################################################################
# What is the sample size? (n and k describe the data BEFORE filtering)
n <- nrow(mushroom)
k <- ncol(mushroom) - 1 # one of the variables is target edible
# Drop the rows whose stalk root is coded "?"; two equivalent ways of subsetting.
df1 <- mushroom[mushroom$stalkroot != "?", ]
mushroom <- subset(mushroom, stalkroot != "?") # new sample size: nrow(mushroom)
# Columns are accessed as mushroom$<name> instead of attach(mushroom): attached
# columns are masked by any global of the same name, and every re-run of the
# script would stack one more copy of the data on the search path.

# Entropy of parent (Figure 3.6)
class(mushroom$edible)
# as.factor() keeps an existing factor untouched and also copes with a
# character column, so the numeric class codes are never NA.
y0 <- as.numeric(as.factor(mushroom$edible))
n0 <- length(y0) # sample size after filtering
y0.freq <- table(y0)
ype <- y0.freq[1] / n0 # share of the first class of `edible`
# ypp = 1-ype
entropy0 <- -ype * log2(ype) - (1 - ype) * log2(1 - ype)

# Child (Figure 3.7): entropy of `edible` within each gill-colour group
gc.freq <- table(gillcolor = mushroom$gillcolor, by = mushroom$edible)
# Take the number of groups from the table itself, so it always matches the
# rows being processed (nlevels() is 0 for a character column).
ngc <- nrow(gc.freq)
gc.n <- rowSums(gc.freq) # rows per gill colour
gcpe <- gc.freq[, 1] / gc.n # share of the first class within each group
gcpp <- 1 - gcpe
# Vectorised over all groups. Empty or pure groups give NaN here (0/0 or
# 0 * log2(0)); those entries are replaced by 0 when the plot data is prepared.
gc.entr <- matrix(-gcpe * log2(gcpe) - gcpp * log2(gcpp),
  nrow = ngc, ncol = 1
) # Entropy container
# Prepare the data for plot
# Label the entropy matrix: one row per gill-colour group, a single column.
dimnames(gc.entr) <- list(rownames(gc.freq), "Entropy")
# Logical indexing on the left-hand side of an assignment overwrites just the
# selected cells (here: every NaN becomes 0); the same index used on its own,
# gc.entr[is.nan(gc.entr)], would merely extract those cells.
gc.entr[is.nan(gc.entr)] <- 0
# Share of all rows that falls into each group; the shares sum to 1.
gc.prop <- setNames(as.data.frame(rowSums(gc.freq) / n0), "Proportion")
gc.data <- cbind(gc.entr, gc.prop)
# Sort the groups from lowest to highest entropy.
gc.ord <- gc.data[order(gc.data$Entropy), ]
# Name the variables
group <- rownames(gc.ord)
entropy <- gc.ord$Entropy
proportion <- gc.ord$Proportion
# Running total of the proportions (cp) = right edge of each group's rectangle.
gc.ord$cp <- cumsum(proportion)
# Entropy Chart 3.7
# One rectangle per gill-colour group: width = proportion of rows in the group,
# height = entropy of the group, groups laid side by side in gc.ord's row order.
# aes = aesthetics
library(ggplot2)
# Collect everything the plot needs in one data frame, so the aesthetics are
# bound to its columns and not to free-floating globals that could get out of
# step with gc.ord.
plot.df <- data.frame(
  group = rownames(gc.ord),
  xmin = gc.ord$cp - gc.ord$Proportion,
  xmax = gc.ord$cp,
  ymax = gc.ord$Entropy
)
entropy.plot <- ggplot(
  plot.df,
  aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = ymax)
) +
  geom_rect(aes(fill = group)) +
  xlab("Proportion") +
  ylab("Entropy")
# Print explicitly: a bare top-level ggplot object is not drawn when the
# script is run through source().
print(entropy.plot)