-
Notifications
You must be signed in to change notification settings - Fork 0
/
R-Excel tutorial.R
77 lines (72 loc) · 3.27 KB
/
R-Excel tutorial.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#Welcome to How to Incorporate (or Replace) Your Excel Tasks with R
#Author: Mitchell D. Shuey
#------
install.packages("tidyverse")
install.packages("xlsx")
#The following function returns where your directory is. setwd("path") will naturally set it.
getwd()
#PHASE ONE: Importing data. Easiest way is to have the file, with a short name, already in your working directory.
#shift+right click your file in Windows Explorer, then choose "Copy as path".
#Replace backslashes with forward ones via Ctrl-R
#Don't forget to select sheet and skip!
library(readxl)
WinePrd = read_excel("C:\Users\LENOVO\Documents\GitHub\Tutorials\Megafile_of_global_wine_data_1835_to_2016_1217.xlsx",
sheet = "T6 Wine production", skip = 1)
WineCon = read_excel(,
sheet = "T34 Wine consumption vol", skip = 1)
View(WinePrd)
colnames(WinePrd)[1]="Year"
View(WineCon)
#Scroll to the rightmost edge of the sheet. Notice Norway.
#This will be a recurring piece of the data to account for as we transform.
#It's a great example of quirks in datasets that are best accounted for early.
#For now, we rearrange:
ncol(WineCon)
WineCon=WineCon[,c(1,ncol(WineCon), 2:57)]
colnames(WineCon)[1]="Year"
#------
#PHASE TWO: Slicing data into cross-sections
library(tidyr)
#converting from wide to tall
tidy.prd = gather(WinePrd, key=Country, value=Vol, -Year, -X__2, -ncol(WinePrd))
#removing extra column
tidy.prd = tidy.prd[,-2]
#repeating for wine consumption (WineCon.
tidy.con =
tidy.con = tidy.con[,-2]
library(dplyr)
tidy.wine=merge(tidy.prd, tidy.con, by=c("Year", "Country"), suffixes=c(".prd",".con"), all=T)
View(tidy.wine)
#Hold on, we're missing something!
#Now we want better factors to organize our countries with.
tidy.wine[,7] = tidy.wine$Region
tidy.wine$Region[tidy.wine$Country %in% colnames(WineCon[,c(2:27)])] = "Europe"
tidy.wine$Region[tidy.wine$Country %in% colnames(WineCon[,c(28:29)])] = "Oceania"
tidy.wine$Region[tidy.wine$Country %in% colnames(WineCon[,c(30,31)])] = "NorthAm"
tidy.wine$Region[tidy.wine$Country %in% colnames(WineCon[,c(32:37)])] = "LatAm"
tidy.wine$Region[tidy.wine$Country %in% colnames(WineCon[,c(38:43)])] = "AME"
tidy.wine$Region[tidy.wine$Country %in% colnames(WineCon[,c(44:54)])] = "AsiaPac"
tidy.wine$Region[tidy.wine$Country %in% colnames(WineCon[,55])] ="Other"
tidy.wine$Region[tidy.wine$Country %in% colnames(WineCon[,56])] = "WORLD"
tidy.wine$Region=as.factor(tidy.wine$Region)
#Did we factor everything as we wanted to?
summary(tidy.wine$Region)
#We're ready to move on!
#----
#PHASE THREE: Graphs and subset analysis
library(ggplot2)
tidy.wine = tidy.wine %>% mutate(`net`=`Vol.prd`-`Vol.con`)
tiny.wine$net = `Vol.prd`
Region.Prd.Year=spread(tidy.wine, Region, -2)
pt.y=ggplot(tidy.wine[tidy.wine$Country=="Portugal",], aes(Year, sum(`Vol.prd`)))
pt.y+geom_line()
Regionsums= summarise(group_by(tidy.wine, Year, Region), sum.prd=sum(Vol.prd))
View(Regionsums)
#Can anyone figure out what's wrong here?
#Introducing the pipe operator, to make our code more readable
Regionsums= tidy.wine %>%
na.omit() %>%
group_by(Year, Region) %>%
summarise(sum.prd=sum(Vol.prd))
r.y= ggplot(Regionsums, aes(Year, sum.prd, group=Region, col=Region))
r.y+geom_line()+labs("Yearly Wine Production by Global Region", "Year","Production")