- Eksplorasi Data Menggunakan R ================ Muhammad Luqman 2018-04-29
Dalam statistik, Exploratory Data Analysis (EDA) adalah teknik untuk
menganalisis data dengan tujuan untuk melihat karakteristik data
tersebut. EDA seringkali dilakukan dengan menggunakan teknik
visualisasi. Dengan menggunakan library ggplot2
, kita dapat melakukan
visualisasi data dalam R untuk melakukan EDA.
library(dplyr)
library(ggplot2)
library(hflights)
hflights = as_tibble(hflights)
hflights
## # A tibble: 227,496 × 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <chr> <int>
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## 7 2011 1 7 5 1359 1509 AA 428
## 8 2011 1 8 6 1355 1454 AA 428
## 9 2011 1 9 7 1443 1554 AA 428
## 10 2011 1 10 1 1443 1553 AA 428
## # ℹ 227,486 more rows
## # ℹ 13 more variables: TailNum <chr>, ActualElapsedTime <int>, AirTime <int>,
## # ArrDelay <int>, DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
str(hflights)
## tibble [227,496 × 21] (S3: tbl_df/tbl/data.frame)
## $ Year : int [1:227496] 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Month : int [1:227496] 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int [1:227496] 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int [1:227496] 6 7 1 2 3 4 5 6 7 1 ...
## $ DepTime : int [1:227496] 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
## $ ArrTime : int [1:227496] 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
## $ UniqueCarrier : chr [1:227496] "AA" "AA" "AA" "AA" ...
## $ FlightNum : int [1:227496] 428 428 428 428 428 428 428 428 428 428 ...
## $ TailNum : chr [1:227496] "N576AA" "N557AA" "N541AA" "N403AA" ...
## $ ActualElapsedTime: int [1:227496] 60 60 70 70 62 64 70 59 71 70 ...
## $ AirTime : int [1:227496] 40 45 48 39 44 45 43 40 41 45 ...
## $ ArrDelay : int [1:227496] -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
## $ DepDelay : int [1:227496] 0 1 -8 3 5 -1 -1 -5 43 43 ...
## $ Origin : chr [1:227496] "IAH" "IAH" "IAH" "IAH" ...
## $ Dest : chr [1:227496] "DFW" "DFW" "DFW" "DFW" ...
## $ Distance : int [1:227496] 224 224 224 224 224 224 224 224 224 224 ...
## $ TaxiIn : int [1:227496] 7 6 5 9 9 6 12 7 8 6 ...
## $ TaxiOut : int [1:227496] 13 9 17 22 9 13 15 12 22 19 ...
## $ Cancelled : int [1:227496] 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr [1:227496] "" "" "" "" ...
## $ Diverted : int [1:227496] 0 0 0 0 0 0 0 0 0 0 ...
lookup = c(A = "Carrier", B = "Weather", C = "National Air System", D = "Security")
hflights = hflights %>%
mutate(CancellationReason = lookup[CancellationCode])
ct = table(hflights$UniqueCarrier, hflights$Cancelled)
ct
##
## 0 1
## AA 3184 60
## AS 365 0
## B6 677 18
## CO 69557 475
## DL 2599 42
## EV 2128 76
## F9 832 6
## FL 2118 21
## MQ 4513 135
## OO 15837 224
## UA 2038 34
## US 4036 46
## WN 44640 703
## XE 71921 1132
## YV 78 1
prop.table(ct)
##
## 0 1
## AA 1.399585e-02 2.637409e-04
## AS 1.604424e-03 0.000000e+00
## B6 2.975876e-03 7.912227e-05
## CO 3.057504e-01 2.087949e-03
## DL 1.142438e-02 1.846186e-04
## EV 9.354011e-03 3.340718e-04
## F9 3.657207e-03 2.637409e-05
## FL 9.310054e-03 9.230932e-05
## MQ 1.983771e-02 5.934170e-04
## OO 6.961441e-02 9.846327e-04
## UA 8.958399e-03 1.494532e-04
## US 1.774097e-02 2.022014e-04
## WN 1.962232e-01 3.090164e-03
## XE 3.161418e-01 4.975912e-03
## YV 3.428632e-04 4.395682e-06
prop.table(ct, margin = 1)
##
## 0 1
## AA 0.981504316 0.018495684
## AS 1.000000000 0.000000000
## B6 0.974100719 0.025899281
## CO 0.993217386 0.006782614
## DL 0.984096933 0.015903067
## EV 0.965517241 0.034482759
## F9 0.992840095 0.007159905
## FL 0.990182328 0.009817672
## MQ 0.970955250 0.029044750
## OO 0.986053172 0.013946828
## UA 0.983590734 0.016409266
## US 0.988731014 0.011268986
## WN 0.984495953 0.015504047
## XE 0.984504401 0.015495599
## YV 0.987341772 0.012658228
prop.table(ct, margin = 2)
##
## 0 1
## AA 0.0141811752 0.0201816347
## AS 0.0016256686 0.0000000000
## B6 0.0030152813 0.0060544904
## CO 0.3097989961 0.1597712748
## DL 0.0115756515 0.0141271443
## EV 0.0094778709 0.0255634040
## F9 0.0037056337 0.0020181635
## FL 0.0094333320 0.0070635721
## MQ 0.0201003906 0.0454086781
## OO 0.0705362034 0.0753447696
## UA 0.0090770211 0.0114362597
## US 0.0179758867 0.0154725866
## WN 0.1988215016 0.2364614867
## XE 0.3203279842 0.3807601749
## YV 0.0003474032 0.0003363606
ggplot(hflights, aes(x = Cancelled)) +
geom_bar()
hflights %>%
filter(Cancelled == 1) %>%
ggplot(aes(x = CancellationReason)) +
geom_bar()
ggplot(hflights, aes(x = UniqueCarrier, fill = Cancelled)) +
geom_bar(position = "dodge")
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
ggplot(hflights, aes(x = UniqueCarrier, fill = Cancelled)) +
geom_bar(position = "fill")
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
hflights %>%
filter(Cancelled == 1) %>%
ggplot(aes(x = UniqueCarrier,
fill = CancellationCode)) +
geom_bar(position = "fill")
ggplot(hflights, aes(x = AirTime)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3622 rows containing non-finite values (`stat_bin()`).
ggplot(hflights, aes(x = AirTime)) +
geom_histogram(binwidth = 60)
## Warning: Removed 3622 rows containing non-finite values (`stat_bin()`).
ggplot(hflights, aes(x = AirTime)) +
geom_histogram() +
facet_wrap(~Origin)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3622 rows containing non-finite values (`stat_bin()`).
ggplot(hflights, aes(x = AirTime)) +
geom_density()
## Warning: Removed 3622 rows containing non-finite values (`stat_density()`).
ggplot(hflights, aes(x = AirTime, fill = Origin)) +
geom_density(alpha = 0.3)
## Warning: Removed 3622 rows containing non-finite values (`stat_density()`).
ggplot(hflights, aes(x = 1, y = Distance)) +
geom_boxplot()
ggplot(hflights, aes(x = Origin, y = Distance)) +
geom_boxplot()
ggplot(hflights, aes(x = Distance, y = AirTime)) +
geom_point()
## Warning: Removed 3622 rows containing missing values (`geom_point()`).
library(gapminder)
gap2007 = gapminder %>%
filter(year == 2007, continent != "Oceania")
gap2007
## # A tibble: 140 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 2007 43.8 31889923 975.
## 2 Albania Europe 2007 76.4 3600523 5937.
## 3 Algeria Africa 2007 72.3 33333216 6223.
## 4 Angola Africa 2007 42.7 12420476 4797.
## 5 Argentina Americas 2007 75.3 40301927 12779.
## 6 Austria Europe 2007 79.8 8199783 36126.
## 7 Bahrain Asia 2007 75.6 708573 29796.
## 8 Bangladesh Asia 2007 64.1 150448339 1391.
## 9 Belgium Europe 2007 79.4 10392226 33693.
## 10 Benin Africa 2007 56.7 8078314 1441.
## # ℹ 130 more rows
gap2007 %>%
ggplot(aes(x = continent, y = lifeExp)) +
geom_boxplot()
gap2007 %>%
ggplot(aes(x = lifeExp, fill = continent)) +
geom_density(alpha = 0.3)
gap2007 %>%
group_by(continent) %>%
summarize(mean(lifeExp),
median(lifeExp))
## # A tibble: 4 × 3
## continent `mean(lifeExp)` `median(lifeExp)`
## <fct> <dbl> <dbl>
## 1 Africa 54.8 52.9
## 2 Americas 73.6 72.9
## 3 Asia 70.7 72.4
## 4 Europe 77.6 78.6
gap2007 %>%
group_by(continent) %>%
summarize(varians = var(lifeExp),
std = sd(lifeExp),
iqr = IQR(lifeExp),
n_data = n())
## # A tibble: 4 × 5
## continent varians std iqr n_data
## <fct> <dbl> <dbl> <dbl> <int>
## 1 Africa 92.8 9.63 11.6 52
## 2 Americas 19.7 4.44 4.63 25
## 3 Asia 63.4 7.96 10.2 33
## 4 Europe 8.88 2.98 4.78 30