.Rhistory

install.packages("ggplot2")
install.packages("gtools")
install.packages("data.table")
library('data.table')
library('ggplot2')
library('gtools')
library('ggplot2')
install.packages("ggplot2")
yes
install.packages("ggplot2")
library('data.table')
library('ggplot2')
library('gtools')
## (a) Plot the distribution
## Support of X and the probability attached to each value.
X <- c(2, 3, 4, 5, 6)
f_x <- c(0.1, 0.3, 0.3, 0.2, 0.1)
d <- data.table(X, f_x)

## `d` is in "wide format"; melt() stacks the probability column into
## "long format" (columns X, variable, p) so it can be faceted by `variable`.
dd <- melt(d, id.vars = 1, measure.vars = 2, value.name = "p")
dd

## Plot the distribution as points with stems down to zero
ggplot(dd, aes(x = X, y = p)) +
  geom_point() +
  geom_segment(aes(xend = X, yend = 0)) +
  facet_wrap(~variable) +
  theme(aspect.ratio = 1)

## (b) Calculate E(X) = sum over x of x * p(x)
dd[, sum(X * p), by = .(variable)]

## (c) Find P(X>=4) and P(2<X<=4)
dd[X >= 4, sum(p)]
dd[X > 2 & X <= 4, sum(p)]

## Transformed variable Y = (2X - 8)^2
dd[, Y := (2 * X - 8)^2]
## Each Y value carries the probability of the X value it came from
dd[, py := p]
## Expected value of Y
dd[, sum(Y * py), by = .(variable)]
## Find:
## Support and PMF of X, reshaped to long format with the probability in `p`.
X <- c(0, 1, 2, 3)
f_x <- c(0.3, 0.4, 0.2, 0.1)
d <- data.table(X, f_x)
dd <- melt(d, id.vars = 1, measure.vars = 2, value.name = "p")

## (a) P(X>=2)
dd[X >= 2, sum(p)]

## (b) P(0<X<=2)
dd[X > 0 & X <= 2, sum(p)]

## (c) E(X), stored on every row as `pop_mean`
dd[, pop_mean := sum(X * p), by = .(variable)]

## (d) Var(X); sd(X)
## Var(X) = sum(x^2 * p_x) - mu^2, where mu is the expected value E(X)
dd[, pop_var := sum((X^2) * p) - pop_mean^2, by = .(variable)]
dd[, pop_sd := sqrt(pop_var)]
## Moments of a linear transformation Y = a*X + b.
## Reading the formulas against the numbers: E(X) = 120 and Var(X) = 100.

## Case 1: a = 1/10, b = -120/10
## E(Y) = a*E(X) + E(b)
e_y <- (1 / 10) * 120 - 120 / 10
## Var(Y) = (a^2)*var(X)
var_y <- ((1 / 10)^2) * 100

## Case 2: a = 1/20, b = -100/20 (overwrites the case 1 results)
## E(Y) = a*E(X) + E(b)
e_y <- (1 / 20) * 120 - 100 / 20
## Var(Y) = (a^2)*var(X)
var_y <- ((1 / 20)^2) * 100
## (a) Compute the following probabilities:
## pgeom() counts failures before the first success, so a statement about the
## trial number X of the first success is shifted by one: X <= 3 trials means
## at most 2 failures.
p_success <- 1 / 3

## p(X > 3)
pgeom(q = 2, prob = p_success, lower.tail = FALSE)

## p(X >= 3)
pgeom(q = 1, prob = p_success, lower.tail = FALSE)

## p(X <= 3)
pgeom(q = 2, prob = p_success, lower.tail = TRUE)

## p(X <= 3) + p(X > 3)
pgeom(q = 2, prob = p_success, lower.tail = TRUE) +
  pgeom(q = 2, prob = p_success, lower.tail = FALSE)
## (b) Plot geometric probability distribution for x = 0:10
x <- 0:10
y <- dgeom(x, prob = 1 / 3)
d <- data.table(x, y)

## Points rather than a line: the distribution is discrete
ggplot(d, aes(x = x, y = y)) +
  geom_point() +
  xlab("X = number trials until first success") +
  ylab("Probability") +
  theme(aspect.ratio = 1)
## (c) Draw a sample of size n= 1000 from the above geometric distribution and
## plot it with a histogram
n <- 1000
## NOTE: `sample` shadows the name of base::sample() as a variable; the name is
## kept because it is also the column name the plot refers to.
sample <- rgeom(n, 1 / 3)
d <- data.table(sample)

## Unit-width bins centred on the integers 0, 1, 2, ..., max(sample).
## The previous breaks started at 0.1, which silently dropped every draw equal
## to 0 (the most likely outcome) and every draw above 9.
b <- seq(-0.5, max(sample) + 0.5, by = 1)

ggplot(d, aes(sample)) +
  geom_histogram(aes(y = after_stat(density)), breaks = b) +
  theme(aspect.ratio = 1)
## (d) Plot the geometric distribution for the following values of P(Success):
## p(Success) = 0.2
## p(Success) = 0.5
## p(Success) = 0.8
## Use facet_wrap to plot on a single figure.
p1 <- 0.2
p2 <- 0.5
p3 <- 0.8

x <- 0:10
y1 <- dgeom(x, prob = p1)
y2 <- dgeom(x, prob = p2)
y3 <- dgeom(x, prob = p3)

## One success probability per melted row: melt() stacks y1, then y2, then y3,
## each of length(x) rows, which is the order rep(..., each = ) produces.
p <- rep(c(p1, p2, p3), each = length(x))

d <- data.table(x, y1, y2, y3)
dd <- melt(d, measure.vars = c("y1", "y2", "y3"))
dd[, p := p]

ggplot(dd, aes(x = x, y = value)) +
  geom_point() +
  geom_segment(aes(xend = x, yend = 0)) +
  xlab("X = number trials until first success") +
  ylab("Probability") +
  theme(aspect.ratio = 1) +
  facet_wrap(~p)
## (e) Compute the expected value of the geometric distributions that you
## plotted in (d). For the geometric distribution, x can assume any value from
## zero to infinity, but for now, assume x only ranges from 0 to 10. Add
## vertical lines to indicate the population mean to each panel of the figure
## you made in part (d).

## Truncated expectation sum(x * P(x)) over the tabulated x values, computed
## separately for each success probability p.
dd[, pop_mean := sum(x * value), by = .(p)]

ggplot(dd, aes(x = x, y = value)) +
  geom_point() +
  geom_segment(aes(xend = x, yend = 0)) +
  xlab("X = number trials until first success") +
  ylab("Probability") +
  theme(aspect.ratio = 1) +
  facet_wrap(~p) +
  ## `colour` is a fixed setting, so it belongs outside aes(); inside aes() the
  ## string "red" is treated as data, drawn in the default palette colour and
  ## given a legend.
  geom_vline(aes(xintercept = pop_mean), colour = "red")
## define the probability of success
p1 <- 0.2
p2 <- 0.5
p3 <- 0.8

## how many samples do we want to draw?
n <- 10000

## Sample from a geometric distribution
s1 <- rgeom(n, p1)
s2 <- rgeom(n, p2)
s3 <- rgeom(n, p3)

## sample means of these samples
mean(s1)
mean(s2)
mean(s3)

## now data.table: tabulate how often each value occurred in each sample
d1 <- data.table(s1)
d2 <- data.table(s2)
d3 <- data.table(s3)
dd1 <- data.table(table(d1))
dd2 <- data.table(table(d2))
dd3 <- data.table(table(d3))
dd1[, p := p1]
dd2[, p := p2]
dd3[, p := p3]

## The three count tables have the same layout but differently named value
## columns, so bind them by position.
dd <- rbindlist(list(dd1, dd2, dd3), use.names = FALSE)

## Rename the value column by position: its name comes from table() and is not
## reliably 'd1' (it can be the tabulated column's own name, e.g. 's1').
setnames(dd, 1L, "x")

## Relative frequency of each value = estimated PMF
dd[, p_est := N / n]
## table() reports the observed values as strings; turn them back into counts
dd[, x := as.integer(x)]

ggplot(dd, aes(x = x, y = p_est)) +
  geom_bar(stat = "identity") +
  theme(aspect.ratio = 1) +
  facet_wrap(~p, scales = "free")
## (h) For each panel in (f), draw a vertical line indicating the sample mean
## The sample mean has to be a column of `dd` before the plot can map it, so
## the count tables are rebuilt with it first (plotting before this step fails
## because `sample_mean` does not exist yet).
dd1 <- data.table(table(d1), sample_mean = mean(s1))
dd2 <- data.table(table(d2), sample_mean = mean(s2))
dd3 <- data.table(table(d3), sample_mean = mean(s3))
dd1[, p := p1]
dd2[, p := p2]
dd3[, p := p3]

## Same layout, differently named value columns: bind by position.
dd <- rbindlist(list(dd1, dd2, dd3), use.names = FALSE)

## Rename the value column by position: its name comes from table() and is not
## reliably 'd1' (it can be the tabulated column's own name, e.g. 's1').
setnames(dd, 1L, "x")

dd[, p_est := N / n]
dd[, x := as.integer(x)]

ggplot(dd, aes(x = x, y = p_est)) +
  geom_bar(stat = "identity") +
  theme(aspect.ratio = 1) +
  facet_wrap(~p, scales = "free") +
  ## `colour` is a fixed setting, so it goes outside aes(); inside aes() the
  ## string "red" is treated as data and is not drawn red.
  geom_vline(aes(xintercept = sample_mean), colour = "red")
## (i) Repeat (g) and (h) a few times to see the variation in sample mean, and
## the absence of variance in the population mean. Make a figure that shows the
## results of each of these experiments in separate panel of one figure.
num_exps <- 3
dd_rec <- vector(mode = "list", length = num_exps)

for (i in seq_len(num_exps)) {

  ## define the probability of success
  p1 <- 0.2
  p2 <- 0.5
  p3 <- 0.8

  ## how many samples do we want to draw?
  ## TODO: This is an interesting parameter to play with
  n <- 100

  ## Sample from a geometric distribution
  s1 <- rgeom(n, p1)
  s2 <- rgeom(n, p2)
  s3 <- rgeom(n, p3)

  ## start assembling a data.table that we can use to plot an estimated PMF
  d1 <- data.table(s1)
  d2 <- data.table(s2)
  d3 <- data.table(s3)

  ## This time, when creating data.table from samples, include sample mean
  dd1 <- data.table(table(d1), sample_mean = mean(s1))
  dd2 <- data.table(table(d2), sample_mean = mean(s2))
  dd3 <- data.table(table(d3), sample_mean = mean(s3))
  dd1[, p := p1]
  dd2[, p := p2]
  dd3[, p := p3]

  ## Same layout, differently named value columns: bind by position.
  dd <- rbindlist(list(dd1, dd2, dd3), use.names = FALSE)

  ## Rename the value column by position: its name comes from table() and is
  ## not reliably 'd1' (it can be the tabulated column's own name, e.g. 's1').
  setnames(dd, 1L, "x")

  dd[, p_est := N / n]
  dd[, x := as.integer(x)]

  ## add pop_mean
  ## rgeom() returns the number of failures before the first success, whose
  ## mean is (1 - p) / p. (1 / p is the mean number of trials, which is one
  ## more than the quantity sampled here.)
  dd[, pop_mean := (1 - p) / p]

  ## add indicator column to index experiment number
  dd[, exp := i]

  dd_rec[[i]] <- dd
}

dd <- rbindlist(dd_rec)

## TODO: this plot can be cleaned up... as can the entire programming structure
## of the section...
## Red line = sample mean, blue line = population mean. The colours are fixed
## settings outside aes(); inside aes() the strings are treated as data and the
## default palette draws "blue" reddish and "red" bluish.
ggplot(dd, aes(x = x, y = p_est)) +
  geom_bar(stat = "identity") +
  theme(aspect.ratio = 1) +
  facet_wrap(~p*exp, scales = "free") +
  geom_vline(aes(xintercept = sample_mean), colour = "red") +
  geom_vline(aes(xintercept = pop_mean), colour = "blue")
## step 1: state the hypotheses and the observed value
## H0: p = 0.5
## H1: p > 0.5
q <- 9
p_H0 <- 0.5

## step 2: probability, under H0, of more than q failures before the first
## success
## NOTE(review): the tail direction and the use of P(X > q) rather than
## P(X >= q) should be checked against the wording of the exercise.
p_val <- pgeom(q, prob = p_H0, lower.tail = FALSE)

## step 3: significance threshold
alpha <- 0.05

## step 4: reject or no
print(if (p_val < alpha) 'Reject H0:' else 'Failed to reject H0:')
## Number of trials and number of observed successes
n <- 200
exp_results <- 12
p_estimate <- exp_results / n

## define p assuming H0 is true
p_H0 <- 1 / 36

## (a) P(at least exp_results successes in n trials | H0), i.e. the one-sided
## p-value; pbinom(k - 1, lower.tail = FALSE) is P(X > k - 1) = P(X >= k).
p_data_given_H0 <- pbinom(exp_results - 1, n, p_H0, lower.tail = FALSE)
p_data_given_H0

## (b) use binom.test() to answer this question.
## `conf.level` is the coverage of the reported confidence interval (0.95 for a
## test at the 5% level), not the significance level; 0.05 asked for a 5%
## interval. The p-value itself does not depend on it.
binom_result <- binom.test(exp_results, n = n, p = p_H0,
                           alternative = "greater", conf.level = 0.95)
binom_result
$?
?
$
ls
getwd()

## Work from the project root: every path below is relative to it. (Changing
## into notebooks/ first made "notebooks/reading_set.csv" unresolvable.)
setwd("/Users/isanette.carreon/Desktop/demowatch/demowatch/")

## Base read.csv() is used throughout so nothing here depends on readr being
## attached.
reading_set <- read.csv("notebooks/reading_set.csv")

## One CSV of hand labels per city; the city name is the file name with the
## ".csv" extension removed and "+" replaced by a space.
city_data_dir <- "data/city/"
file_names <- list.files("data/city")
stopifnot(length(file_names) >= 1)
city_names <- gsub("\\+", " ", sub("\\.csv$", "", file_names))

## Start the combined table from the first city ...
hand_labels <- cbind(city = city_names[1],
                     read.csv(paste0(city_data_dir, file_names[1])))

## ... then append the remaining cities. Each file's values are shifted by the
## current maximum of column 2 (presumably the ID column - confirm) so they
## continue after the values already collected.
## seq_along(...)[-1] is empty when there is a single file, unlike 2:length().
for (obj in seq_along(file_names)[-1]) {
  new_city <- cbind(city = city_names[obj],
                    read.csv(paste0(city_data_dir, file_names[obj])) +
                      max(hand_labels[, 2]))
  hand_labels <- rbind(hand_labels, new_city)
}
## dplyr supplies %>%, mutate(), filter() and select(); readr supplies
## write_csv(). Attach them before first use.
library(dplyr)
library(readr)

## Work from the project root: every path below is relative to it.
setwd("/Users/isanette.carreon/Desktop/demowatch/demowatch/")
reading_set <- read.csv("notebooks/reading_set.csv")

## One CSV of hand labels per city; the city name is the file name with the
## ".csv" extension removed and "+" replaced by a space.
city_data_dir <- "data/city/"
file_names <- list.files("data/city")
stopifnot(length(file_names) >= 1)
city_names <- gsub("\\+", " ", sub("\\.csv$", "", file_names))

## Combine the per-city label files. Each later file's values are shifted by
## the current maximum of column 2 (presumably the ID column - confirm) so they
## continue after the values already collected.
hand_labels <- cbind(city = city_names[1],
                     read.csv(paste0(city_data_dir, file_names[1])))
for (obj in seq_along(file_names)[-1]) {
  new_city <- cbind(city = city_names[obj],
                    read.csv(paste0(city_data_dir, file_names[obj])) +
                      max(hand_labels[, 2]))
  hand_labels <- rbind(hand_labels, new_city)
}
hand_labels <- hand_labels %>% mutate(city = as.character(city))

## Cities that appear in the reading set but have no label file get a single
## new ID, one past the current maximum.
unique_cities <- unique(reading_set$city)
for (obj in seq_along(unique_cities)) {
  current_city <- unique_cities[obj]
  if (!(current_city %in% city_names)) {
    hand_labels <- rbind(hand_labels,
                         list(current_city, max(hand_labels$ID) + 1))
  }
}

## Attach the IDs to the reading-set rows city by city. The table is seeded
## with whichever city comes first in unique_cities (instead of a hard-coded
## name), so the loop over the remaining cities neither skips nor repeats one.
labelled_tbl <- cbind(reading_set %>% filter(city == unique_cities[1]),
                      hand_labels %>%
                        filter(city == unique_cities[1]) %>%
                        select(ID))
for (obj in seq_along(unique_cities)[-1]) {
  city_tbl <- reading_set %>% filter(city == unique_cities[obj])
  city_labels <- hand_labels %>% filter(city == unique_cities[obj])
  labelled_tbl <- rbind(labelled_tbl, cbind(city_tbl, ID = city_labels$ID))
}

labelled_tbl <- labelled_tbl %>% select(city, date, text, ID)
write_csv(labelled_tbl, "validation_set.csv")
library(dplyr)
library(readr)
install.packages(c("dplyr", "readr"))
library(dplyr)
library(readr)
## Work from the project root: every path below is relative to it.
setwd("/Users/isanette.carreon/Desktop/demowatch/demowatch/")
reading_set <- read.csv("notebooks/reading_set.csv")

## One CSV of hand labels per city; the city name is the file name with the
## ".csv" extension removed and "+" replaced by a space.
city_data_dir <- "data/city/"
file_names <- list.files("data/city")
stopifnot(length(file_names) >= 1)
city_names <- gsub("\\+", " ", sub("\\.csv$", "", file_names))

## Combine the per-city label files. Each later file's values are shifted by
## the current maximum of column 2 (presumably the ID column - confirm) so they
## continue after the values already collected.
hand_labels <- cbind(city = city_names[1],
                     read.csv(paste0(city_data_dir, file_names[1])))
for (obj in seq_along(file_names)[-1]) {
  new_city <- cbind(city = city_names[obj],
                    read.csv(paste0(city_data_dir, file_names[obj])) +
                      max(hand_labels[, 2]))
  hand_labels <- rbind(hand_labels, new_city)
}
hand_labels <- hand_labels %>% mutate(city = as.character(city))

## Cities that appear in the reading set but have no label file get a single
## new ID, one past the current maximum.
unique_cities <- unique(reading_set$city)
for (obj in seq_along(unique_cities)) {
  current_city <- unique_cities[obj]
  if (!(current_city %in% city_names)) {
    hand_labels <- rbind(hand_labels,
                         list(current_city, max(hand_labels$ID) + 1))
  }
}

## Attach the IDs to the reading-set rows city by city. The table is seeded
## with whichever city comes first in unique_cities (instead of a hard-coded
## name), so the loop over the remaining cities neither skips nor repeats one.
labelled_tbl <- cbind(reading_set %>% filter(city == unique_cities[1]),
                      hand_labels %>%
                        filter(city == unique_cities[1]) %>%
                        select(ID))
for (obj in seq_along(unique_cities)[-1]) {
  city_tbl <- reading_set %>% filter(city == unique_cities[obj])
  city_labels <- hand_labels %>% filter(city == unique_cities[obj])
  labelled_tbl <- rbind(labelled_tbl, cbind(city_tbl, ID = city_labels$ID))
}
labelled_tbl <- labelled_tbl %>% select(city, date, text, ID)

## Inspect the results
labelled_tbl
file_names
city_names
hand_labels
library(dplyr)
library(readr)
## Step-by-step run of the labelling script, printing each intermediate object.
## Work from the project root: every path below is relative to it.
setwd("/Users/isanette.carreon/Desktop/demowatch/demowatch/")
reading_set <- read.csv("notebooks/reading_set.csv")

## One CSV of hand labels per city; the city name is the file name with the
## ".csv" extension removed and "+" replaced by a space.
city_data_dir <- "data/city/"
file_names <- list.files("data/city")
stopifnot(length(file_names) >= 1)
city_names <- gsub("\\+", " ", sub("\\.csv$", "", file_names))
file_names
city_names

#assigns ID to city
## Start from the first city's label file ...
hand_labels <- cbind(city = city_names[1],
                     read.csv(paste0(city_data_dir, file_names[1])))
hand_labels

## ... then append the remaining cities. Each file's values are shifted by the
## current maximum of column 2 (presumably the ID column - confirm) so they
## continue after the values already collected.
## seq_along(...)[-1] is empty when there is a single file, unlike 2:length().
for (obj in seq_along(file_names)[-1]) {
  new_city <- cbind(city = city_names[obj],
                    read.csv(paste0(city_data_dir, file_names[obj])) +
                      max(hand_labels[, 2]))
  hand_labels <- rbind(hand_labels, new_city)
}
## `new_city` holds the last file appended (it only exists when there is more
## than one label file)
new_city
hand_labels

hand_labels <- hand_labels %>% mutate(city = as.character(city))
hand_labels

## Cities that appear in the reading set but have no label file get a single
## new ID, one past the current maximum.
unique_cities <- unique(reading_set$city)
unique_cities
for (obj in seq_along(unique_cities)) {
  current_city <- unique_cities[obj]
  if (!(current_city %in% city_names)) {
    hand_labels <- rbind(hand_labels,
                         list(current_city, max(hand_labels$ID) + 1))
  }
}
hand_labels

## Attach the IDs to the reading-set rows city by city. The table is seeded
## with whichever city comes first in unique_cities (instead of a hard-coded
## name), so the loop over the remaining cities neither skips nor repeats one.
labelled_tbl <- cbind(reading_set %>% filter(city == unique_cities[1]),
                      hand_labels %>%
                        filter(city == unique_cities[1]) %>%
                        select(ID))
labelled_tbl
for (obj in seq_along(unique_cities)[-1]) {
  city_tbl <- reading_set %>% filter(city == unique_cities[obj])
  city_labels <- hand_labels %>% filter(city == unique_cities[obj])
  labelled_tbl <- rbind(labelled_tbl, cbind(city_tbl, ID = city_labels$ID))
}
## Pieces from the last city processed, then the accumulated table
city_tbl
city_labels
labelled_tbl

labelled_tbl <- labelled_tbl %>% select(city, date, text, ID)
labelled_tbl
#' Map each answer column of a labelled table to its question weight.
#'
#' The second column of `scheme_tbl` is recoded to a numeric weight
#' ("Additive" = 0.1, "Ambiguous" = 0.5, "Contradictory" = 1), question names
#' are normalised to four characters, and the result is joined onto the column
#' names of `labelled_qs_ans` using the first four characters of each name.
#'
#' @param scheme_tbl Data frame with a `Number` column (question identifier)
#'   and a `Type` column; the recoding addresses `Type` as column 2, so it is
#'   assumed to be the second column - TODO confirm.
#' @param labelled_qs_ans Data frame whose columns, apart from columns 1, 2 and
#'   297, are question/answer indicators named with a four-character question
#'   prefix. The fixed position 297 is taken from the original code - TODO
#'   confirm it still matches the data.
#' @return Data frame with one row per retained column of `labelled_qs_ans`:
#'   `q_name`, `weights` (0 where the question has no entry in `scheme_tbl`)
#'   and `col`.
question_weighter <- function(scheme_tbl, labelled_qs_ans) {
  library(dplyr)

  ### calculate question weights
  weights <- scheme_tbl
  weights[, 2][weights[, 2] == "Additive"] <- 0.1
  weights[, 2][weights[, 2] == "Ambiguous"] <- 0.5
  weights[, 2][weights[, 2] == "Contradictory"] <- 1

  # Select by name rather than by position so extra columns in scheme_tbl
  # cannot shift the selection.
  weights <- weights %>%
    mutate(weights = as.numeric(Type), q_name = as.character(Number)) %>%
    select(q_name, weights)

  # Names that are not four characters long get a trailing "0" appended (e.g.
  # a number whose final zero was lost). Vectorised, so an empty scheme is a
  # no-op, and which() skips NA names instead of failing on them.
  needs_pad <- which(nchar(weights$q_name) != 4)
  weights$q_name[needs_pad] <- paste0(weights$q_name[needs_pad], "0")

  # One row per answer column, keyed by the question prefix of its name.
  q_ans_combos <- data.frame(col = colnames(labelled_qs_ans[, -c(1, 2, 297)]))
  q_ans_combos$q_name <- substr(q_ans_combos$col, 1, 4)

  # Keep every answer column; questions absent from the scheme get weight 0.
  weights <- weights %>% right_join(q_ans_combos, by = "q_name")
  weights$weights[is.na(weights$weights)] <- 0
  weights
}
# NOTE(review): in this history `weights` and `q_ans_combos` are assigned only
# inside question_weighter(); these top-level lookups print same-named objects
# from the session, if any exist.
weights
q_ans_combos
#' Map each answer column of a labelled table to its question weight.
#'
#' The second column of `scheme_tbl` is recoded to a numeric weight
#' ("Additive" = 0.1, "Ambiguous" = 0.5, "Contradictory" = 1), question names
#' are normalised to four characters, and the result is joined onto the column
#' names of `labelled_qs_ans` using the first four characters of each name.
#'
#' @param scheme_tbl Data frame with a `Number` column (question identifier)
#'   and a `Type` column; the recoding addresses `Type` as column 2, so it is
#'   assumed to be the second column - TODO confirm.
#' @param labelled_qs_ans Data frame whose columns, apart from columns 1, 2 and
#'   297, are question/answer indicators named with a four-character question
#'   prefix. The fixed position 297 is taken from the original code - TODO
#'   confirm it still matches the data.
#' @return Data frame with one row per retained column of `labelled_qs_ans`:
#'   `q_name`, `weights` (0 where the question has no entry in `scheme_tbl`)
#'   and `col`.
question_weighter <- function(scheme_tbl, labelled_qs_ans) {
  library(dplyr)

  ### calculate question weights
  weights <- scheme_tbl
  weights[, 2][weights[, 2] == "Additive"] <- 0.1
  weights[, 2][weights[, 2] == "Ambiguous"] <- 0.5
  weights[, 2][weights[, 2] == "Contradictory"] <- 1

  # Select by name rather than by position so extra columns in scheme_tbl
  # cannot shift the selection.
  weights <- weights %>%
    mutate(weights = as.numeric(Type), q_name = as.character(Number)) %>%
    select(q_name, weights)

  # Names that are not four characters long get a trailing "0" appended (e.g.
  # a number whose final zero was lost). Vectorised, so an empty scheme is a
  # no-op, and which() skips NA names instead of failing on them.
  needs_pad <- which(nchar(weights$q_name) != 4)
  weights$q_name[needs_pad] <- paste0(weights$q_name[needs_pad], "0")

  # One row per answer column, keyed by the question prefix of its name.
  q_ans_combos <- data.frame(col = colnames(labelled_qs_ans[, -c(1, 2, 297)]))
  q_ans_combos$q_name <- substr(q_ans_combos$col, 1, 4)

  # Keep every answer column; questions absent from the scheme get weight 0.
  weights <- weights %>% right_join(q_ans_combos, by = "q_name")
  weights$weights[is.na(weights$weights)] <- 0
  weights
}
# NOTE(review): in this history `q_ans_combos` is assigned only inside
# question_weighter(); this top-level lookup prints a same-named object from
# the session, if one exists.
q_ans_combos
#' Re-cluster each labelled group and renumber the clusters globally.
#'
#' For every distinct value of `labelled_qs_ans$ids`, the group is passed to
#' `check_cluster()` and the result to `cluster_compacter()` (both defined
#' outside this view). The `cluster` column of each compacted table is turned
#' into a new `ids` value: the first group keeps its cluster numbers and every
#' later group is offset by the largest `ids` assigned so far, so the numbering
#' does not collide across groups.
#'
#' @param labelled_qs_ans Data frame with an `ids` column identifying the
#'   groups to process.
#' @param pg Value forwarded unchanged to `check_cluster()`; default 0.8.
#' @return The row-bound compacted tables, with `ids` renumbered and the
#'   `cluster` column removed.
canonicalizer <- function(labelled_qs_ans, pg = 0.8) {
  # NULL marks "nothing accumulated yet". The original NA sentinel was tested
  # with is.na(), which yields a whole matrix once the accumulator is a data
  # frame, which is not a valid scalar condition for if().
  canonicalized_features <- NULL

  for (id in unique(labelled_qs_ans$ids)) {
    cluster_tbl <- check_cluster(labelled_qs_ans, id, pg)
    print(id)  # progress output

    if (is.null(canonicalized_features)) {
      # First group: cluster numbers become the ids as they are.
      compact_cluster <- cluster_compacter(cluster_tbl, labelled_qs_ans, id) %>%
        mutate(ids = cluster)
      canonicalized_features <- compact_cluster
    } else {
      # Later groups: continue numbering after the largest id used so far.
      compact_cluster <- cluster_compacter(cluster_tbl, labelled_qs_ans, id) %>%
        mutate(ids = max(canonicalized_features$ids) + cluster)
      canonicalized_features <- rbind(canonicalized_features,
                                      compact_cluster)
    }
  }

  if (is.null(canonicalized_features)) {
    stop("labelled_qs_ans has no ids to canonicalize", call. = FALSE)
  }

  canonicalized_features %>% select(-cluster)
}
# NOTE(review): in this history `canonicalized_features` is assigned only
# inside canonicalizer(); this top-level lookup prints a same-named object from
# the session, if one exists.
canonicalized_features