This repository has been archived by the owner on Apr 20, 2022. It is now read-only.
forked from pachadotdev/tradestatistics-hs12-historic-series
-
Notifications
You must be signed in to change notification settings - Fork 0
/
03-EDA.R
114 lines (99 loc) · 3.75 KB
/
03-EDA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
library(arrow)
library(dplyr)
library(tidyr)
library(purrr)
library(readr)
library(stringr)
library(ggplot2)
library(forcats)
# Open the raw and the imputed trade tables as Arrow datasets.
# The two directory levels are exposed as the columns `year` and
# `reporter_iso`. The filters applied to these datasets compare them against
# strings such as "year=2002", which suggests the partition values keep the
# folder-name prefix.
draw <- open_dataset(
  sources = "hs12-visualization/yrpc",
  partitioning = c("year", "reporter_iso")
)

dimputed <- open_dataset(
  sources = "hs12-visualization/yrpc-imputed",
  partitioning = c("year", "reporter_iso")
)
#' Sum Venezuela's trade values per year from a partitioned Arrow dataset
#'
#' The same query is needed for the raw and for the imputed data, so it lives
#' in one helper instead of two copies of the same pipeline.
#'
#' @param dataset Arrow dataset exposing the partition columns `year` and
#'   `reporter_iso` plus `trade_value_usd_exp` and `trade_value_usd_imp`.
#' @param years Years to read; each one is filtered and collected separately.
#' @return A tibble with one row per year that has data: `year` (still in its
#'   "year=YYYY" form) and the summed export and import values.
sum_ven_trade_by_year <- function(dataset, years = 2002:2020) {
  yearly_totals <- map(
    years,
    function(y) {
      message(y) # progress indicator, one line per year
      dataset %>%
        filter(
          year == paste0("year=", y),
          reporter_iso == "reporter_iso=ven"
        ) %>%
        select(year, reporter_iso, trade_value_usd_exp, trade_value_usd_imp) %>%
        collect() %>%
        group_by(year) %>%
        # reporter_iso is not numeric, so it drops out of the summary
        summarise(across(where(is.numeric), sum))
    }
  )

  # map() + bind_rows() is what the superseded map_df() did internally
  bind_rows(yearly_totals)
}

dvenraw <- sum_ven_trade_by_year(draw)
dvenimputed <- sum_ven_trade_by_year(dimputed)
#' Reshape yearly trade totals into long format for plotting
#'
#' @param yearly_totals Data frame with `year` (strings like "year=2002"),
#'   `trade_value_usd_exp` and `trade_value_usd_imp`.
#' @param label Text appended to the series names, e.g. "Raw" gives
#'   "Exports, Raw" and "Imports, Raw".
#' @return A tibble with the columns `year` (integer), `group` and `value`.
ven_totals_to_long <- function(yearly_totals, label) {
  yearly_totals %>%
    # drop the lowercase letters and "=" of the partition prefix
    mutate(year = as.integer(str_remove_all(year, "[a-z=]"))) %>%
    rename(
      Exports = trade_value_usd_exp,
      Imports = trade_value_usd_imp
    ) %>%
    pivot_longer(-year, names_to = "group", values_to = "value") %>%
    mutate(group = paste0(group, ", ", label)) %>%
    # keep the row order gather() used to give: one series after the other
    arrange(group, year)
}

# Raw and imputed series stacked into one long table:
# one row per year and series.
dven <- bind_rows(
  ven_totals_to_long(dvenraw, "Raw"),
  ven_totals_to_long(dvenimputed, "Imputed")
)
# Four-colour palette shared by the plots in this script.
cols <- c("#93b4f1", "#454548", "#67bb7b", "#eea55e")

# Line chart with point markers for the two raw series.
dven %>%
  filter(str_detect(group, "Raw")) %>%
  ggplot(aes(x = year, y = value, color = group)) +
  geom_line() +
  geom_point() +
  expand_limits(x = 2002:2020) + # keep the full 2002-2020 range on the x axis
  scale_color_manual(values = cols) +
  labs(title = "Imports and exports of Venezuela 2002-2020, Raw Data") +
  theme_minimal(base_size = 13, base_family = "Roboto Condensed")
# Line chart with point markers for the two imputed series.
dven %>%
  filter(str_detect(group, "Impu")) %>%
  ggplot(aes(x = year, y = value, color = group)) +
  geom_line() +
  geom_point() +
  expand_limits(x = 2002:2020) + # keep the full 2002-2020 range on the x axis
  scale_color_manual(values = cols) +
  labs(title = "Imports and exports of Venezuela 2002-2020, Imputed Data") +
  theme_minimal(base_size = 13, base_family = "Roboto Condensed")
# Load the per-year imputation counts and rank the flags.
# NOTE(review): the path points into a user's home directory, so the script
# only runs where that checkout exists.
rows_with_imputation_per_year <- read_csv(
  "~/github/un_escap/baci-like-hs12/rows_with_imputation_per_year.csv"
) %>%
  mutate(
    # flag2 is the position of the flag in this list (1-4);
    # any other flag value becomes NA
    flag2 = as.numeric(match(
      flag,
      c(
        "exports, no change",
        "imports, divided by cif/fob ratio",
        "imports, divided by cif/fob ratio due to large exp/imp mismatch",
        "imports, divided by 1 + model constant due to large exp/imp mismatch"
      )
    ))
  ) %>%
  # sort first so the factor levels created below follow the flag ranking
  arrange(year, flag2) %>%
  mutate(flag = as_factor(flag))
# Collapse the three import-based flags (codes 2-4) into a single code 2, add
# up `m` per year for each of the two resulting codes, then average those
# yearly sums across years. The result is not assigned to a variable.
rows_with_imputation_per_year %>%
  mutate(flag2 = pmin(flag2, 2)) %>%
  group_by(year, flag2) %>%
  summarise(m = sum(m)) %>%
  group_by(flag2) %>%
  summarise(m = mean(m))
# Bar chart of the `n` column per year, filled by imputation flag.
rows_with_imputation_per_year %>%
  ggplot(aes(x = year, y = n, fill = flag)) +
  geom_col() +
  scale_fill_manual(values = cols) +
  labs(
    title = "Number of observations by imputation status",
    y = "No. obs",
    x = "Year"
  ) +
  theme_minimal(base_size = 13, base_family = "Roboto Condensed") +
  # legend tweaks must come after theme_minimal(), which resets the theme
  theme(
    legend.position = "bottom",
    legend.direction = "vertical",
    legend.title = element_blank()
  )
# Bar chart of the `m` column per year, filled by imputation flag.
rows_with_imputation_per_year %>%
  ggplot(aes(x = year, y = m, fill = flag)) +
  geom_col() +
  scale_fill_manual(values = cols) +
  labs(
    title = "Share of observations by imputation status",
    y = "%. obs",
    x = "Year"
  ) +
  theme_minimal(base_size = 13, base_family = "Roboto Condensed") +
  # legend tweaks must come after theme_minimal(), which resets the theme
  theme(
    legend.position = "bottom",
    legend.direction = "vertical",
    legend.title = element_blank()
  )