# deep_learning_pub_analysis.R
#Deep learning trends and keyword analysis. This script assumes its associated
#deep_learning_corpus R project is open and that the directory structure has been preserved.
#Search terms used in Scopus:
##Environmental science subject area:
# TITLE-ABS-KEY ( ( "deep learn*" AND ecol* ) OR ( "machine learn*" AND ecol* ) OR ( "artificial intelligence" AND ecol* ) ) AND SUBJAREA (envi)
##Agriculture subject area:
# TITLE-ABS-KEY ( ( "deep learn*" AND ecol* ) OR ( "machine learn*" AND ecol* ) OR ( "artificial intelligence" AND ecol* ) ) AND SUBJAREA (agri)
# Libraries ---------------------------------------------------------------
#Data manipulation
library(janitor)
library(dplyr)
library(tidyr)
library(tidytext)
#Keyword network
library(akc)
#Graphing
library(ggplot2)
library(viridis)
# Data Preparation --------------------------------------------------------
#ARTICLES DOWNLOADED ON 13/10/2021 (12:08 NZT)
#Articles pulled by the comprehensive search of artificial intelligence, machine learning and deep learning in the ecological literature (environmental science and agriculture subject areas of scopus)
#envi subject area of scopus
aiScopus_df <- read.csv("./data/enviScopus.csv") %>%
  #Including the papers in the agri subject area
  bind_rows(read.csv("./data/agriScopus.csv")) %>%
  #Tidying column names for convenience
  clean_names() %>%
  #De-duplicating on title because not all records have a DOI - removes papers indexed in both agri and envi
  distinct(title, .keep_all = TRUE)
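#Optional sanity check (a sketch; exact counts depend on the local CSV exports):
#how many records were indexed in both the envi and agri subject areas and
#hence dropped by distinct() above. Uncomment to inspect:
#nrow(read.csv("./data/enviScopus.csv")) + nrow(read.csv("./data/agriScopus.csv")) - nrow(aiScopus_df)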
#Just the papers returned by searching deep learning in the above subject areas
dlScopus_df <- read.csv("./data/dlScopus.csv") %>%
  clean_names() %>%
  #Summarising deep learning counts by year
  group_by(year) %>%
  summarise(dlCount = n()) %>%
  #Normalising against a baseline of 1, as the first DL paper (by the title/abstract/keyword search) was published in 2006
  mutate(dl_nn = dlCount / 1)
#Total count of articles indexed by Scopus per year
allScopus_df <- data.frame(year = 1971:2021,
                           all_articles = read.csv("./data/allArticles_03062022.csv"),
                           envi = read.csv("./data/allEnvArticles.csv")) %>%
  mutate(envi_nn = envi / 55700) #55700 = the number of envi articles published in 2000
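#Optional guard (an assumption: each CSV holds exactly one row per year for
#1971-2021). data.frame() recycles shorter inputs silently, so a length
#mismatch here would misalign years without raising an error.
#stopifnot(nrow(read.csv("./data/allArticles_03062022.csv")) == length(1971:2021))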
# Trends in ML -----------------------------------------------------------
#Summarising counts by year for the trend graph
aiSum_df <- aiScopus_df %>%
  #Counts by year
  group_by(year) %>%
  summarise(aiCount = n()) %>%
  #Merging data frames for plotting
  left_join(allScopus_df, by = "year") %>%
  left_join(dlScopus_df, by = "year") %>%
  #Changing NAs to 0 for plotting
  mutate(dl_nn = replace(dl_nn, is.na(dl_nn), 0))
#Find the baseline count to normalise against (the year-2000 AI count; this is where the number "7" came from)
aiStandard <- aiSum_df %>%
  filter(year == 2000) %>%
  pull(aiCount)
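#Optional guard: in the export used here this baseline was 7 articles, so
#asserting it flags a refreshed download that would shift the normalisation.
#stopifnot(aiStandard == 7)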
#Manipulating to create the normalised column
aiTrends_df <- aiSum_df %>%
  #Standardising against the year-2000 baseline computed above
  mutate(ai_nn = aiCount / aiStandard) %>%
  #Pivoting to long format for plotting
  pivot_longer(cols = c(dl_nn, ai_nn, envi_nn), names_to = "article_type", values_to = "article_nn") %>%
  #More informative labels
  mutate(article_type = ifelse(article_type == "dl_nn", "Deep Learning",
                               ifelse(article_type == "ai_nn", "Artificial Intelligence", "Environmental Science"))) %>%
  #Ordering the legend on the plot
  mutate(article_type = factor(article_type, levels = c("Deep Learning", "Artificial Intelligence", "Environmental Science")))
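#Optional consistency check (a sketch): each normalised series should equal 1
#in its baseline year (2000 for AI and environmental science, 2006 for deep learning).
#filter(aiTrends_df, (year == 2000 & article_type != "Deep Learning") |
#         (year == 2006 & article_type == "Deep Learning"))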
#Plotting trends through time to visualise how they differ from one another
ggplot(filter(aiTrends_df, year != 2022),
       aes(x = year,
           y = article_nn,
           colour = article_type)) +
  geom_line(size = 1.15) +
  scale_colour_manual(values = viridis::magma(3, begin = 0.2, end = 0.8, direction = -1), name = "Article Type") +
  labs(y = "Normalised Article Count",
       x = "Year") +
  xlim(c(1980, 2021)) +
  geom_vline(xintercept = 2000, linetype = "dashed") + #normalisation baseline year
  geom_vline(xintercept = 2006, linetype = "dotted") + #first deep learning paper
  theme_classic()
#Saving it in two formats for the publisher
ggsave("./graphs/publishingTrends.pdf", width = 7.75, height = 4.5)
ggsave("./graphs/publishingTrends.png", width = 7.75, height = 4.5)
# Keyword analysis (Deep Learning) ----------------------------------------
data(stop_words) #tidytext stop words to be removed from the scopus database
#Extra words that André chose on the basis of repeat groupings
sillyWords <- data.frame(word = c("model", "models", "modelling", "modeling", "learn", "learning",
                                  "machine", "environment", "intelligence", "deep", "result", "method",
                                  "â", "1", "study", "factor", "index", "base", "analysis", "artificial",
                                  "algorithm", "research", "performance", "approach", "develop", "accuracy",
                                  "train", "neural", "process", "provide", "feature", "sample", "dataset",
                                  "network", "net", "article", "data set", "guangdong", "china",
                                  "south africa", "united states", "sacramento-san joaquin delta",
                                  "malaysia", "poland", "brain", "mean square error", "root mean square error",
                                  "root mean square errors", "priority journal", "bayesian analysis",
                                  "netherlands", "india", "madre de dios", "indicator indicator",
                                  "clinical evaluation", "least squares method", "information management",
                                  "comparative study", "controlled study", "errors", "support vector machine",
                                  "support vector machines", "decision trees", "major clinical study",
                                  "lanzhou", "cellular automaton", "light detection and ranging",
                                  "correlation", "hubei", "south korea", "three gorges reservoir",
                                  "principle component analysis", "beijing [china]", "random forest",
                                  "parameter estimation", "emotion"),
                         lexicon = "Andre") %>%
  #Combining it with the tidytext library of unhelpful words
  bind_rows(stop_words, .) %>%
  #Standardising all words to upper case to avoid case mismatching
  mutate(word = toupper(word))
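#Optional check that the custom lexicon merged with tidytext's stop words and
#that everything is upper case (lexicon counts are a sketch; they vary with the
#tidytext version).
#count(sillyWords, lexicon, sort = TRUE)
#stopifnot(all(sillyWords$word == toupper(sillyWords$word)))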
#Keyword data cleaning
dlKeywords_df <- read.csv("./data/dlScopus.csv") %>%
  #Tidying column names to a known format
  clean_names() %>%
  #Using index_keywords as these are less varied than author keywords
  select(c(title, year, index_keywords)) %>%
  #akc needs tibbles for the keyword_* functions to work
  tibble() %>%
  #Cleaning and separating the keywords
  keyword_clean(id = "title",
                keyword = "index_keywords",
                lemmatize = FALSE) %>%
  #Making them upper case for uniformity
  mutate(keyword = toupper(keyword)) %>%
  #Removing unhelpful words
  anti_join(sillyWords,
            by = c("keyword" = "word")) %>%
  #Replacing synonymous keywords to avoid spurious connections and redundancy in the network graph
  mutate(keyword = replace(keyword, keyword == "ECOSYSTEMS", "ECOSYSTEM"),
         keyword = replace(keyword, keyword == "CONVOLUTIONAL NEURAL NETWORKS", "CONVOLUTIONAL NEURAL NETWORK"),
         keyword = replace(keyword, keyword == "CONVOLUTION", "CONVOLUTIONAL NEURAL NETWORK"),
         keyword = replace(keyword, keyword == "ECOSYSTEM SERVICES", "ECOSYSTEM SERVICE"),
         keyword = replace(keyword, keyword == "HUMANS", "HUMAN"),
         keyword = replace(keyword, keyword == "ANIMALS", "ANIMAL"),
         keyword = replace(keyword, keyword == "ANIMALIA", "ANIMAL"),
         keyword = replace(keyword, keyword == "GREENHOUSE GAS", "GREENHOUSE GASES"),
         keyword = replace(keyword, keyword == "PUBLIC ATTITUDE", "PUBLIC OPINION"),
         keyword = replace(keyword, keyword == "FORECASTING METHOD", "FORECASTING"),
         keyword = replace(keyword, keyword == "ACCURACY ASSESSMENT", "PERFORMANCE ASSESSMENT"),
         keyword = replace(keyword, keyword == "RIVER WATER", "WATER"),
         keyword = replace(keyword, keyword %in% c("CHEMICAL OXYGEN DEMAND", "BIOCHEMICAL OXYGEN DEMAND"), "OXYGEN DEMAND"),
         keyword = replace(keyword, keyword %in% c("DEEP NEURAL NETWORKS", "NEURAL NETWORKS", "ARTIFICIAL NEURAL NETWORK",
                                                   "MACHINE LEARNING", "NEURAL NETWORKS, COMPUTER", "LEARNING NEURAL NETWORKS",
                                                   "NEURAL NETWORK MODEL", "LEARNING SYSTEMS", "NEURAL-NETWORKS",
                                                   "MACHINE-LEARNING", "LEARNING ALGORITHMS", "LEARNING MODELS",
                                                   "ARTIFICIAL INTELLIGENCE", "NEURAL NETWORK METHOD",
                                                   "RESIDUAL NEURAL NETWORKS"), "DEEP LEARNING"))
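#The chain of replace() calls above could equivalently be written as a named
#lookup applied with dplyr::recode() - a sketch, with the mapping abridged:
#synonyms <- c("ECOSYSTEMS" = "ECOSYSTEM", "HUMANS" = "HUMAN", "ANIMALS" = "ANIMAL")
#dlKeywords_df <- mutate(dlKeywords_df, keyword = recode(keyword, !!!synonyms))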
#Graphing the deep learning index keyword associations
dlKeywords_df %>%
  keyword_group(id = "id",
                keyword = "keyword") %>%
  #akc visualisation function, limiting the number of nodes viewed to avoid clutter
  keyword_vis(max_nodes = 10, facet = TRUE) +
  scale_fill_manual(values = viridis::viridis(n = 3,
                                              begin = 0.5,
                                              end = 1)) +
  scale_alpha_manual(values = 0.3)
#Saving
ggsave("./graphs/keywordNetwork.pdf", width = 15, height = 7.15)
ggsave("./graphs/keywordNetwork.png", width = 15, height = 7.15)