-
Notifications
You must be signed in to change notification settings - Fork 1
/
.Rhistory
331 lines (331 loc) · 14.9 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
library(openair)
library(tidyverse)
library(ggplot2)
library (ggpubr)
# files that must be loaded:
# had to reset workign directory to retreve file...
getwd()
setwd("2.Clean.Data")
clean.patent.db<- read.csv( "clean.patent.db.csv")
ufo.freq.table.date.range <- read.csv("UFO.freq.date.range.csv")
# reset working directory back to original
setwd("~/GitHub/UFOS-and-Patents")
getwd()
ufo <- read.csv ("consolidated_weather_V03.csv")
#================================ Variables ====================================
# Number of UFO sightings required to make it a date of interest.
ufo.crit.val <- 5 # this was determined in the cleaning.ufo.data script, it is
# based off the mean frequency of UFO sightings during the
# data time period.
# Delay period before sampling of patent frequencies.
delay.period <- 14
# Length of sample period after delay (in days)
sample.period <- 30
#============================ WORK FLOW SET UP =================================
work.d <- getwd()
out.put.folders <- c("1.Raw.Data","2.Clean.Data", "3.Analysis", "4.Graphs",
"4.Graphs/Table")
# check if folders exist and make them if they don't.
# This loop checks goes through the given out.put.folders list and checks to see
# if they exisit in the working directory. If they don't they print "does not
# exisit" and creates them, if it does exist it prints "does exist"
for(i in 1:length(out.put.folders)){
if(!file.exists(out.put.folders[i])){
print(paste(i, "does not exist"))
dir.create(out.put.folders[i])
}else{
print(paste(i,"does exist"))
}
}
#---- Pathways---------
#path to 1.RawData folder
path.rd <- paste(work.d,"/",out.put.folders[1], "/", sep="")
#test the pathway...
# x.t <- c(5)
# write.csv(x.t,paste(path.rd, "test.x.csv"), row.names=FALSE)
# to remove row names from saved files ^^^
#Path to 2.Clean.Data
path.cd <- paste(work.d,"/",out.put.folders[2], "/", sep="")
# Path to 3.Analysis
path.a <- paste(work.d,"/",out.put.folders[3], "/", sep="")
# Path to 4.Graphs
path.g <- paste(work.d,"/",out.put.folders[4], "/", sep="")
# Path to Table folder in 4.Graphs folder
path.t <- paste(work.d,"/",out.put.folders[5], "/", sep="")
# write.csv(x.t,paste(path.t, "test.x.csv"), row.names=FALSE)
clean.patent.db<- read.csv( "clean.patent.db.csv")
setwd("2.Clean.Data")
clean.patent.db<- read.csv( "clean.patent.db.csv")
ufo.freq.table.date.range <- read.csv("UFO.freq.date.range.csv")
setwd("2.Clean.Data")
clean.patent.db<- read.csv( "clean.patent.db.csv",)
ufo.freq.table.date.range <- read.csv(paste(path.cd, "UFO.freq.date.range.TV.csv"))
# reset working directory back to original
setwd("~/GitHub/UFOS-and-Patents")
getwd()
# use table function to make frequency table
table.pf <- table(clean.patent.db$actiondate)
# make into a data frame
patent.freq.df <- data.frame(table.pf)
# rename column
colnames(patent.freq.df) <- c("action_date","Frequency")
# make the actiondate into a date instead of a factor
patent.freq.df$action_date <- as.Date(patent.freq.df$action_date)
class(patent.freq.df$action_date)
# make vector with all dates within the range
all.dates.p <- seq(as.Date("2006-01-01"), as.Date("2015-12-31"), "day")
# make into a dataframe
missing.dates.p.df <- as.data.frame(c(all.dates.p))
# add another column for frequency
missing.dates.p.df$frequency <- rep(NA, length(all.dates.p))
# make vector with all dates within the range
all.dates.p <- seq(as.Date("2006-01-01"), as.Date("2015-12-31"), "day")
# make into a dataframe
missing.dates.p.df <- as.data.frame(c(all.dates.p))
# add another column for frequency
missing.dates.p.df$frequency <- rep(NA, length(all.dates.p))
# Rename the columns
colnames(missing.dates.p.df) <- c("action_date", "Frequency")
# Merge missing dates with patent frequency table.
patent.freq.p.df <- merge(patent.freq.df, missing.dates.p.df,
by.x= "action_date", by.y="action_date", all=TRUE)
# check to confirm merge worked by looking for NA in the Frequency Data from the
# Patent dataframe
any(is.na(patent.freq.p.df$Frequency.x))
# cut the Frequency.y column
final.patent.freq <- patent.freq.p.df[, c("action_date", "Frequency.x")]
# Rename Frequency.x to Frequency
colnames(final.patent.freq) <- c("action_date", "Frequency")
# replace all NA values with 0
final.patent.freq$Frequency[is.na(final.patent.freq$Frequency)] <- 0
# check to see if it worked
any(is.na(final.patent.freq$Frequency))
# save as csv
write.csv(final.patent.freq, paste(path.cd, "final.patent.freq.csv"))
# change the Date variable to be recognized as class "Date"
class(ufo.freq.critical$Date)
# pull UFO data for days with more than critical value of sightings
head(ufo.freq.table.date.range)
ufo.freq.critical <- ufo.freq.table.date.range[
ufo.freq.table.date.range$Frequency>ufo.crit.val,]
# save the number of critical days to variable for later use
num.crit.days <- nrow(ufo.freq.critical)
# change the Date variable to be recognized as class "Date"
class(ufo.freq.critical$Date)
ufo.freq.critical$Date<- as.Date(ufo.freq.critical$Date)
class(ufo.freq.critical$Date)
ufo.freq.critical$Delayed
ufo.freq.critical$Delayed <- ufo.freq.critical$Date + delay.period
head(ufo.freq.critical)
# check to ensure it ran correctly
head(ufo.freq.critical)
d.dates.in <- data.frame(ufo.freq.critical$Delayed) # make new dataframe with
# the Delayed date
d.dates.in$indicator <- rep(1, length(d.dates.in)) #made indicator column with
# value = 1
# rename the columns to: date_intrest and indicator
colnames(d.dates.in) <- c("date_interest", "indicator")
patent.dates.intrest <- merge(final.patent.freq, d.dates.in,
by.x= "action_date", by.y="date_interest", all=TRUE)
ob.sum <- NULL
for(i in 1:nrow(patent.dates.intrest)){
if(is.na(patent.dates.intrest[i,3])==FALSE){ # saying if indicator not NA...
t.sum <- 0 # them create a variable that = 0
for(j in 1:30){ # then count from 1-30 starting at date of interest
patent.row.no <- i + j # assign row number
t.sum <- t.sum + patent.dates.intrest$Frequency[patent.row.no]
} # add up the patent frequencies for each of the 30 rows following a date
# of interest
row.no <- nrow(ob.samples) + 1 # assign new row number
ob.sum <- rbind(ob.sum, data.frame(t.sum)) #put sum in dataframe
}
}
tail(ob.sum)
# create empty vector
ob.sum <- NULL
for(i in 1:nrow(patent.dates.intrest)){
if(is.na(patent.dates.intrest[i,3])==FALSE){ # saying if indicator not NA...
t.sum <- 0 # them create a variable that = 0
for(j in 1:30){ # then count from 1-30 starting at date of interest
patent.row.no <- i + j # assign row number
t.sum <- t.sum + patent.dates.intrest$Frequency[patent.row.no]
} # add up the patent frequencies for each of the 30 rows following a date
# of interest
row.no <- nrow(ob.samples) + 1 # assign new row number
ob.sum <- rbind(ob.sum, data.frame(t.sum)) #put sum in dataframe
}
}
for(i in 1:nrow(patent.dates.intrest)){
if(is.na(patent.dates.intrest[i,3])==FALSE){ # saying if indicator not NA...
t.sum <- 0 # them create a variable that = 0
for(j in 1:30){ # then count from 1-30 starting at date of interest
patent.row.no <- i + j # assign row number
t.sum <- t.sum + patent.dates.intrest$Frequency[patent.row.no]
} # add up the patent frequencies for each of the 30 rows following a date
# of interest
row.no <- nrow(ob.sum) + 1 # assign new row number
ob.sum <- rbind(ob.sum, data.frame(t.sum)) #put sum in dataframe
}
}
tail(ob.sum) # the last 31 samples are NA values as the data set does not
# contain enough data to look for the whole 30 days following this.
# excludes all the NA values
ob.sums <- na.exclude(ob.sum)
# change the Date variable to be recognized as class "Date"
class(ufo.freq.critical$Date)
ufo.freq.critical$Date<- as.Date(ufo.freq.critical$Date)
class(ufo.freq.critical$Date)
ufo.freq.critical$Delayed
ufo.freq.critical$Delayed <- ufo.freq.critical$Date + delay.period
head(ufo.freq.critical)
# check to ensure it ran correctly
head(ufo.freq.critical)
#---- Create New Data Frame ----------------------------------------------------
# create data frame with Delayed dates and Indicator column.
d.dates.in <- data.frame(ufo.freq.critical$Delayed) # make new dataframe with
# the Delayed date
d.dates.in$indicator <- rep(1, length(d.dates.in)) #made indicator column with
# rename the columns to: date_intrest and indicator
colnames(d.dates.in) <- c("date_interest", "indicator")
# merging
patent.dates.intrest <- merge(final.patent.freq, d.dates.in,
by.x= "action_date", by.y="date_interest", all=TRUE)
# create empty vector
ob.sum <- NULL
for(i in 1:nrow(patent.dates.intrest)){
if(is.na(patent.dates.intrest[i,3])==FALSE){ # saying if indicator not NA...
t.sum <- 0 # them create a variable that = 0
for(j in 1:30){ # then count from 1-30 starting at date of interest
patent.row.no <- i + j # assign row number
t.sum <- t.sum + patent.dates.intrest$Frequency[patent.row.no]
} # add up the patent frequencies for each of the 30 rows following a date
# of interest
row.no <- nrow(ob.sum) + 1 # assign new row number
ob.sum <- rbind(ob.sum, data.frame(t.sum)) #put sum in dataframe
}
}
tail(ob.sum) # the last 31 samples are NA values as the data set does not
# contain enough data to look for the whole 30 days following this.
# excludes all the NA values
ob.sums <- na.exclude(ob.sum)
#----- Find The Observed Mean --------------------------------------------------
# find the mean of the patents sums in the 30 days following a day of interest.
ob.mean <- mean(ob.sums[,1])
ob.mean
# create empty vector in which to store the sum of each sample.period.
ob.sum <- NULL
# create a for loop which will go through our data frame contianing patent
# frequency, date of interest and indicator varibles. It will identify rows
# containing dates of interest via the indicator coulmn, then selection the
# patent frequency for that date and the following 29 days, take the sum of
# these frequencies, store them in a data frame and move on to the next date of
# interest.
for(i in 1:nrow(patent.dates.intrest)){
if(is.na(patent.dates.intrest[i,3])==FALSE){ # saying if indicator not NA...
t.sum <- 0 # them create a variable that = 0
for(j in 1:30){ # then count from 1-30 starting at date of interest
patent.row.no <- i + j # assign row number
t.sum <- t.sum + patent.dates.intrest$Frequency[patent.row.no]
} # add up the patent frequencies for each of the 30 rows following a date
# of interest
row.no <- nrow(ob.sum) + 1 # assign new row number
ob.sum <- rbind(ob.sum, data.frame(t.sum)) #put sum in dataframe
}
}
tail(ob.sum) # the last 31 samples are NA values as the data set does not
number.samples <- sum(is.na(ob.sum))
number.samples
# we will count the number of NAs within the sample
number.samples <- sum(is.na(ob.sum))
number.samples <- num.crit.days-(sum(is.na(ob.sum)))
number.samples <- num.crit.days-(sum(is.na(ob.sum)))
number.samples
# number of critical dates minus the number of NA values returned.
number.sample.p <- num.crit.days-(sum(is.na(ob.sum)))
#----- Find The Observed Mean --------------------------------------------------
# find the mean of the patents sums in the 30 days following a day of interest.
ob.mean <- mean(ob.sums[,1])
# Number of samples the null distribution runs.
null.run.times <- 10000
# this for loop will collect "number.sample.p" samples of "sample.period" days.
# It will then sum the patent frequencies for each "sample.period" and then find
# the mean number of patent sums within "number.sample.p" samples.
# It will then run this loop "null.run.times" times to create a distribution of
# means.
for(i in 1:null.run.times){
pull.samples <- sample(final.patent.freq$Frequency, size = number.sample.p*sample.period,
replace = TRUE) #pull a sample of , 2516 times
pull.sample <- matrix(pull.samples, number.sample.p) # putting those into a matrix
sum.sample <- apply(pull.sample,1,sum) # sum values within a sample.period
# day sample.
sample.sums <- as.data.frame(sum.sample) # storeing sums into a data frame
null.dis.means[i,1] <- mean(sample.sums[,1]) #taking the mean of the sums and
# storing it into a new data frame.
}
# create an empty data frame to store all of the means
null.dis.means <- as.data.frame(NULL)
# this for loop will collect "number.sample.p" samples of "sample.period" days.
# It will then sum the patent frequencies for each "sample.period" and then find
# the mean number of patent sums within "number.sample.p" samples.
# It will then run this loop "null.run.times" times to create a distribution of
# means.
for(i in 1:null.run.times){
pull.samples <- sample(final.patent.freq$Frequency, size = number.sample.p*sample.period,
replace = TRUE) #pull a sample of , 2516 times
pull.sample <- matrix(pull.samples, number.sample.p) # putting those into a matrix
sum.sample <- apply(pull.sample,1,sum) # sum values within a sample.period
# day sample.
sample.sums <- as.data.frame(sum.sample) # storeing sums into a data frame
null.dis.means[i,1] <- mean(sample.sums[,1]) #taking the mean of the sums and
# storing it into a new data frame.
}
head(null.dis.means)
colnames(null.dis.means) <- c("Mean")
head(null.dis.means)
# save as a csv
write.csv(null.dis.means, paste(path.cd, "null.dis.means.csv"), row.names = FALSE)
# we want the means displayed in frequencies in order
null.dis.mean.sum <- ggplot(data=null.dis.means, aes(null.dis.means$Mean)) +
geom_histogram(col ="blue", bins = 100, fill="light blue") +
labs(x= "Sample Mean", y= "Frequency") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"))
null.dis.mean.sum
null.dis.mean.sum <- ggplot(data=null.dis.means, aes(null.dis.means$Mean)) +
geom_histogram(col ="blue", bins = 100, fill="light blue") +
labs(x= "Sample Mean", y= "Frequency") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black")) +
geom_vline(xintercept = ob.mean, color= "maroon")
null.dis.mean.sum
pdf(paste(path.g, "Null.Dis.Patents.pdf"))
plot(null.dis.mean.sum)
dev.off()
# count the number of values in our null distribution higher than our observed
# mean
sum(null.dis.means[null.dis.means$Mean>ob.mean,1])
# count the number of values in our null distribution higher than or equal to
# our observed mean.
sum(null.dis.means[null.dis.means$Mean>=ob.mean,1])
null.dis.means$Mean>=ob.mean,1
null.dis.means[null.dis.means$Mean>=ob.mean,1]
# count the number of values in our null distribution higher than or equal to
# our observed mean.
sum(null.dis.means[null.dis.means$Mean>=ob.mean,1])
# count the number of values in our null distribution higher than or equal to
# our observed mean.
num.grater.equal <- sum(null.dis.means[null.dis.means$Mean>=ob.mean,1])
# we will then multiply this by two, add 1 and divide by the "number.sample.p".
# This will give us the two-tailed p-value associated with our Observed Mean
# given our null distribution.
p.value <- ((num.grater.equal*2)+1)/(number.sample.p)
p.value
# we will then multiply this by two, add 1 and divide by the "number.sample.p".
# This will give us the two-tailed p-value associated with our Observed Mean
# given our null distribution.
p.value <- ((num.grater.equal*2)+1)/(null.run.times)
p.value