.Rhistory

library(openair)
library(tidyverse)
library(ggplot2)
library (ggpubr)
# files that must be loaded:
# had to reset workign directory to retreve file...
getwd()
setwd("2.Clean.Data")
clean.patent.db<- read.csv( "clean.patent.db.csv")
ufo.freq.table.date.range <- read.csv("UFO.freq.date.range.csv")
# reset working directory back to original
setwd("~/GitHub/UFOS-and-Patents")
getwd()
ufo <- read.csv ("consolidated_weather_V03.csv")
#================================ Variables ====================================
# Number of UFO sightings required to make it a date of interest.
ufo.crit.val <- 5   # this was determined in the cleaning.ufo.data script, it is
# based off the mean frequency of UFO sightings during the
# data time period.
# Delay period before sampling of patent frequencies.
delay.period <- 14
# Length of sample period after delay (in days)
sample.period <- 30
#============================ WORK FLOW SET UP =================================
work.d <- getwd()
out.put.folders <- c("1.Raw.Data","2.Clean.Data", "3.Analysis", "4.Graphs",
"4.Graphs/Table")
# check if folders exist and make them if they don't.
# This loop checks goes through the given out.put.folders list and checks to see
# if they exisit in the working directory. If they don't they print "does not
# exisit" and creates them, if it does exist it prints "does exist"
for(i in 1:length(out.put.folders)){
if(!file.exists(out.put.folders[i])){
print(paste(i, "does not exist"))
dir.create(out.put.folders[i])
}else{
print(paste(i,"does exist"))
}
}
#---- Pathways---------
#path to 1.RawData folder
path.rd <- paste(work.d,"/",out.put.folders[1], "/", sep="")
#test the pathway...
# x.t <- c(5)
# write.csv(x.t,paste(path.rd, "test.x.csv"), row.names=FALSE)
# to remove row names from saved files       ^^^
#Path to 2.Clean.Data
path.cd <- paste(work.d,"/",out.put.folders[2], "/", sep="")
# Path to 3.Analysis
path.a <- paste(work.d,"/",out.put.folders[3], "/", sep="")
# Path to 4.Graphs
path.g <- paste(work.d,"/",out.put.folders[4], "/", sep="")
# Path to Table folder in 4.Graphs folder
path.t <- paste(work.d,"/",out.put.folders[5], "/", sep="")
# write.csv(x.t,paste(path.t, "test.x.csv"), row.names=FALSE)
clean.patent.db<- read.csv( "clean.patent.db.csv")
setwd("2.Clean.Data")
clean.patent.db<- read.csv( "clean.patent.db.csv")
ufo.freq.table.date.range <- read.csv("UFO.freq.date.range.csv")
setwd("2.Clean.Data")
clean.patent.db<- read.csv( "clean.patent.db.csv",)
ufo.freq.table.date.range <- read.csv(paste(path.cd, "UFO.freq.date.range.TV.csv"))
# reset working directory back to original
setwd("~/GitHub/UFOS-and-Patents")
getwd()
# use table function to make frequency table
table.pf <- table(clean.patent.db$actiondate)
# make into a data frame
patent.freq.df <- data.frame(table.pf)
# rename column
colnames(patent.freq.df) <- c("action_date","Frequency")
# make the actiondate into a date instead of a factor
patent.freq.df$action_date <- as.Date(patent.freq.df$action_date)
class(patent.freq.df$action_date)
# make vector with all dates within the range
all.dates.p <- seq(as.Date("2006-01-01"), as.Date("2015-12-31"), "day")
# make into a dataframe
missing.dates.p.df <- as.data.frame(c(all.dates.p))
# add another column for frequency
missing.dates.p.df$frequency <- rep(NA, length(all.dates.p))
# make vector with all dates within the range
all.dates.p <- seq(as.Date("2006-01-01"), as.Date("2015-12-31"), "day")
# make into a dataframe
missing.dates.p.df <- as.data.frame(c(all.dates.p))
# add another column for frequency
missing.dates.p.df$frequency <- rep(NA, length(all.dates.p))
# Rename the columns
colnames(missing.dates.p.df) <- c("action_date", "Frequency")
# Merge missing dates with patent frequency table.
patent.freq.p.df <- merge(patent.freq.df, missing.dates.p.df,
by.x= "action_date", by.y="action_date", all=TRUE)
# check to confirm merge worked by looking for NA in the Frequency Data from the
# Patent dataframe
any(is.na(patent.freq.p.df$Frequency.x))
# cut the Frequency.y column
final.patent.freq <- patent.freq.p.df[, c("action_date", "Frequency.x")]
# Rename Frequency.x to Frequency
colnames(final.patent.freq) <- c("action_date", "Frequency")
# replace all NA values with 0
final.patent.freq$Frequency[is.na(final.patent.freq$Frequency)] <- 0
# check to see if it worked
any(is.na(final.patent.freq$Frequency))
# save as csv
write.csv(final.patent.freq, paste(path.cd, "final.patent.freq.csv"))
# change the Date variable to be recognized as class "Date"
class(ufo.freq.critical$Date)
# pull UFO data for days with more than critical value of sightings
head(ufo.freq.table.date.range)
ufo.freq.critical <- ufo.freq.table.date.range[
ufo.freq.table.date.range$Frequency>ufo.crit.val,]
# save the number of critical days to variable for later use
num.crit.days <- nrow(ufo.freq.critical)
# change the Date variable to be recognized as class "Date"
class(ufo.freq.critical$Date)
ufo.freq.critical$Date<- as.Date(ufo.freq.critical$Date)
class(ufo.freq.critical$Date)
ufo.freq.critical$Delayed
ufo.freq.critical$Delayed <- ufo.freq.critical$Date + delay.period
head(ufo.freq.critical)
# check to ensure it ran correctly
head(ufo.freq.critical)
d.dates.in <- data.frame(ufo.freq.critical$Delayed)  # make new dataframe with
# the Delayed date
d.dates.in$indicator <- rep(1, length(d.dates.in))  #made indicator column with
# value = 1
# rename the columns to: date_intrest and indicator
colnames(d.dates.in) <-  c("date_interest", "indicator")
patent.dates.intrest <- merge(final.patent.freq, d.dates.in,
by.x= "action_date", by.y="date_interest", all=TRUE)
ob.sum <- NULL
for(i in 1:nrow(patent.dates.intrest)){
if(is.na(patent.dates.intrest[i,3])==FALSE){   # saying if indicator not NA...
t.sum <- 0   # them create a variable that = 0
for(j in 1:30){     # then count from 1-30 starting at date of interest
patent.row.no <- i + j   # assign row number
t.sum <- t.sum + patent.dates.intrest$Frequency[patent.row.no]
}   # add up the patent frequencies for each of the 30 rows following a date
# of interest
row.no <- nrow(ob.samples) + 1 # assign new row number
ob.sum <- rbind(ob.sum, data.frame(t.sum))  #put sum in dataframe
}
}
tail(ob.sum)
# create empty vector
ob.sum <- NULL
for(i in 1:nrow(patent.dates.intrest)){
if(is.na(patent.dates.intrest[i,3])==FALSE){   # saying if indicator not NA...
t.sum <- 0   # them create a variable that = 0
for(j in 1:30){     # then count from 1-30 starting at date of interest
patent.row.no <- i + j   # assign row number
t.sum <- t.sum + patent.dates.intrest$Frequency[patent.row.no]
}   # add up the patent frequencies for each of the 30 rows following a date
# of interest
row.no <- nrow(ob.samples) + 1 # assign new row number
ob.sum <- rbind(ob.sum, data.frame(t.sum))  #put sum in dataframe
}
}
for(i in 1:nrow(patent.dates.intrest)){
if(is.na(patent.dates.intrest[i,3])==FALSE){   # saying if indicator not NA...
t.sum <- 0   # them create a variable that = 0
for(j in 1:30){     # then count from 1-30 starting at date of interest
patent.row.no <- i + j   # assign row number
t.sum <- t.sum + patent.dates.intrest$Frequency[patent.row.no]
}   # add up the patent frequencies for each of the 30 rows following a date
# of interest
row.no <- nrow(ob.sum) + 1 # assign new row number
ob.sum <- rbind(ob.sum, data.frame(t.sum))  #put sum in dataframe
}
}
tail(ob.sum) # the last 31 samples are NA values as the data set does not
# contain enough data to look for the whole 30 days following this.
# excludes all the NA values
ob.sums <- na.exclude(ob.sum)
# change the Date variable to be recognized as class "Date"
class(ufo.freq.critical$Date)
ufo.freq.critical$Date<- as.Date(ufo.freq.critical$Date)
class(ufo.freq.critical$Date)
ufo.freq.critical$Delayed
ufo.freq.critical$Delayed <- ufo.freq.critical$Date + delay.period
head(ufo.freq.critical)
# check to ensure it ran correctly
head(ufo.freq.critical)
#---- Create New Data Frame ----------------------------------------------------
# create data frame with Delayed dates and Indicator column.
d.dates.in <- data.frame(ufo.freq.critical$Delayed)  # make new dataframe with
# the Delayed date
d.dates.in$indicator <- rep(1, length(d.dates.in))  #made indicator column with
# rename the columns to: date_intrest and indicator
colnames(d.dates.in) <-  c("date_interest", "indicator")
# merging
patent.dates.intrest <- merge(final.patent.freq, d.dates.in,
by.x= "action_date", by.y="date_interest", all=TRUE)
# create empty vector
ob.sum <- NULL
for(i in 1:nrow(patent.dates.intrest)){
if(is.na(patent.dates.intrest[i,3])==FALSE){   # saying if indicator not NA...
t.sum <- 0   # them create a variable that = 0
for(j in 1:30){     # then count from 1-30 starting at date of interest
patent.row.no <- i + j   # assign row number
t.sum <- t.sum + patent.dates.intrest$Frequency[patent.row.no]
}   # add up the patent frequencies for each of the 30 rows following a date
# of interest
row.no <- nrow(ob.sum) + 1 # assign new row number
ob.sum <- rbind(ob.sum, data.frame(t.sum))  #put sum in dataframe
}
}
tail(ob.sum) # the last 31 samples are NA values as the data set does not
# contain enough data to look for the whole 30 days following this.
# excludes all the NA values
ob.sums <- na.exclude(ob.sum)
#----- Find The Observed Mean --------------------------------------------------
# find the mean of the patents sums in the 30 days following a day of interest.
ob.mean <- mean(ob.sums[,1])
ob.mean
# create empty vector in which to store the sum of each sample.period.
ob.sum <- NULL
# create a for loop which will go through our data frame contianing patent
# frequency, date of interest and indicator varibles. It will identify rows
# containing dates of interest via the indicator coulmn, then selection the
# patent frequency for that date and the following 29 days, take the sum of
# these frequencies, store them in a data frame and move on to the next date of
# interest.
for(i in 1:nrow(patent.dates.intrest)){
if(is.na(patent.dates.intrest[i,3])==FALSE){   # saying if indicator not NA...
t.sum <- 0   # them create a variable that = 0
for(j in 1:30){     # then count from 1-30 starting at date of interest
patent.row.no <- i + j   # assign row number
t.sum <- t.sum + patent.dates.intrest$Frequency[patent.row.no]
}   # add up the patent frequencies for each of the 30 rows following a date
# of interest
row.no <- nrow(ob.sum) + 1 # assign new row number
ob.sum <- rbind(ob.sum, data.frame(t.sum))  #put sum in dataframe
}
}
tail(ob.sum) # the last 31 samples are NA values as the data set does not
number.samples <- sum(is.na(ob.sum))
number.samples
# we will count the number of NAs within the sample
number.samples <- sum(is.na(ob.sum))
number.samples <- num.crit.days-(sum(is.na(ob.sum)))
number.samples <- num.crit.days-(sum(is.na(ob.sum)))
number.samples
# number of critical dates minus the number of NA values returned.
number.sample.p <- num.crit.days-(sum(is.na(ob.sum)))
#----- Find The Observed Mean --------------------------------------------------
# find the mean of the patents sums in the 30 days following a day of interest.
ob.mean <- mean(ob.sums[,1])
# Number of samples the null distribution runs.
null.run.times <- 10000
# this for loop will collect "number.sample.p" samples of "sample.period" days.
# It will then sum the patent frequencies for each "sample.period" and then find
# the mean number of patent sums within "number.sample.p" samples.
# It will then run this loop "null.run.times" times to create a distribution of
# means.
for(i in 1:null.run.times){
pull.samples <- sample(final.patent.freq$Frequency, size = number.sample.p*sample.period,
replace = TRUE) #pull a sample of , 2516 times
pull.sample <- matrix(pull.samples, number.sample.p) # putting those into a matrix
sum.sample <- apply(pull.sample,1,sum)  # sum values within a sample.period
# day sample.
sample.sums <- as.data.frame(sum.sample) # storeing sums into a data frame
null.dis.means[i,1] <- mean(sample.sums[,1])  #taking the mean of the sums and
# storing it into a new data frame.
}
# create an empty data frame to store all of the means
null.dis.means <- as.data.frame(NULL)
# this for loop will collect "number.sample.p" samples of "sample.period" days.
# It will then sum the patent frequencies for each "sample.period" and then find
# the mean number of patent sums within "number.sample.p" samples.
# It will then run this loop "null.run.times" times to create a distribution of
# means.
for(i in 1:null.run.times){
pull.samples <- sample(final.patent.freq$Frequency, size = number.sample.p*sample.period,
replace = TRUE) #pull a sample of , 2516 times
pull.sample <- matrix(pull.samples, number.sample.p) # putting those into a matrix
sum.sample <- apply(pull.sample,1,sum)  # sum values within a sample.period
# day sample.
sample.sums <- as.data.frame(sum.sample) # storeing sums into a data frame
null.dis.means[i,1] <- mean(sample.sums[,1])  #taking the mean of the sums and
# storing it into a new data frame.
}
head(null.dis.means)
colnames(null.dis.means) <- c("Mean")
head(null.dis.means)
# save as a csv
write.csv(null.dis.means, paste(path.cd, "null.dis.means.csv"), row.names = FALSE)
# we want the means displayed in frequencies in order
null.dis.mean.sum <- ggplot(data=null.dis.means, aes(null.dis.means$Mean)) +
geom_histogram(col ="blue", bins = 100, fill="light blue") +
labs(x= "Sample Mean", y= "Frequency") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"))
null.dis.mean.sum
null.dis.mean.sum <- ggplot(data=null.dis.means, aes(null.dis.means$Mean)) +
geom_histogram(col ="blue", bins = 100, fill="light blue") +
labs(x= "Sample Mean", y= "Frequency") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black")) +
geom_vline(xintercept = ob.mean, color= "maroon")
null.dis.mean.sum
pdf(paste(path.g, "Null.Dis.Patents.pdf"))
plot(null.dis.mean.sum)
dev.off()
# count the number of values in our null distribution higher than our observed
# mean
sum(null.dis.means[null.dis.means$Mean>ob.mean,1])
# count the number of values in our null distribution higher than or equal to
# our observed mean.
sum(null.dis.means[null.dis.means$Mean>=ob.mean,1])
null.dis.means$Mean>=ob.mean,1
null.dis.means[null.dis.means$Mean>=ob.mean,1]
# count the number of values in our null distribution higher than or equal to
# our observed mean.
sum(null.dis.means[null.dis.means$Mean>=ob.mean,1])
# count the number of values in our null distribution higher than or equal to
# our observed mean.
num.grater.equal <- sum(null.dis.means[null.dis.means$Mean>=ob.mean,1])
# we will then multiply this by two, add 1 and divide by the "number.sample.p".
# This will give us the two-tailed p-value associated with our Observed Mean
# given our null distribution.
p.value <- ((num.grater.equal*2)+1)/(number.sample.p)
p.value
# we will then multiply this by two, add 1 and divide by the "number.sample.p".
# This will give us the two-tailed p-value associated with our Observed Mean
# given our null distribution.
p.value <- ((num.grater.equal*2)+1)/(null.run.times)
p.value