diff --git a/assets/pptx/First-steps-with-R_day2_afternoon.pptx b/assets/pptx/First-steps-with-R_day2_afternoon.pptx index 5c260af..92a5267 100644 Binary files a/assets/pptx/First-steps-with-R_day2_afternoon.pptx and b/assets/pptx/First-steps-with-R_day2_afternoon.pptx differ diff --git a/assets/pptx/First-steps-with-R_day2_morning.pptx b/assets/pptx/First-steps-with-R_day2_morning.pptx index e5ad684..d1f3141 100644 Binary files a/assets/pptx/First-steps-with-R_day2_morning.pptx and b/assets/pptx/First-steps-with-R_day2_morning.pptx differ diff --git a/assets/pptx/First-steps-with-R_practicals_day1.pptx b/assets/pptx/First-steps-with-R_practicals_day1.pptx index e428845..8f2a471 100644 Binary files a/assets/pptx/First-steps-with-R_practicals_day1.pptx and b/assets/pptx/First-steps-with-R_practicals_day1.pptx differ diff --git a/assets/pptx/First-steps-with-R_practicals_day2.pptx b/assets/pptx/First-steps-with-R_practicals_day2.pptx index 0c9c46a..8cf1c3c 100644 Binary files a/assets/pptx/First-steps-with-R_practicals_day2.pptx and b/assets/pptx/First-steps-with-R_practicals_day2.pptx differ diff --git a/assets/pptx/exam_R.pptx b/assets/pptx/exam_R.pptx index a0ace4c..18c8f23 100644 Binary files a/assets/pptx/exam_R.pptx and b/assets/pptx/exam_R.pptx differ diff --git a/code/day1_code.R b/code/day1_code.R index f8a5022..745b1ba 100644 --- a/code/day1_code.R +++ b/code/day1_code.R @@ -230,15 +230,19 @@ data(sleep) head(sleep) tapply(X=sleep$extra, INDEX=sleep$group, FUN=mean) +## table +table(snps$chr) +table(snps$minor, snps$major) + ## adding rows snps_updated <- rbind(snps, data.frame(chr=22, pos=1723369, minor="A", major="T")) snps_updated ## adding columns -tested <- rep( c("yes",'no') , nrow(snps)/2) -tested -snps_mod <- cbind(snps, tested) +majorGC <- snps$major %in% c("G","C") +majorGC +snps_mod <- cbind(snps, majorGC) snps_mod ## reshaping dataframe diff --git a/code/day2_code.R b/code/day2_code.R index 45479a4..46d643b 100644 --- a/code/day2_code.R +++ b/code/day2_code.R @@ -38,15 +38,41 @@ legend(x="bottomright", bg="gray90") -## aparte - get data for practice -## iris dataset +## coloring data(iris) -head(iris) -## generating random numbers -rnorm(10) -rnorm(10 , mean=10 , sd=2) +plot(iris$Sepal.Length, + iris$Sepal.Width , + col = c('red','green','blue')[ iris$Species ] , + pch=19) + + +## coloring trick +iris$Species +as.numeric(iris$Species) +c('red','green','blue')[ c(1,1,1,2,2,2,3,3,3) ] + +c('red','green','blue')[ iris$Species ] + +## lines +data(airquality) +plot(airquality$Wind, airquality$Ozone, pch=20, + xlab="Wind (mph)",ylab="Ozone (ppb)") + +## horizontal and vertical lines +abline(h=60, col="red", lty="dashed") +abline(v=seq(3,21,3), col="grey", lty="dotdash") + +## tendency line +abline(lm(airquality$Ozone ~ airquality$Wind), + col=2, lwd=2) + +legend("topright", legend= c("measures","fitted line"), + pch= c(20, NA), lty = c(0, 1), lwd=c(NA, 2), + col = c(1, 2), bg = "gray90") + + ## histogram x <- rnorm(10000) @@ -92,28 +118,11 @@ points(thickness ~ status, data=Melanoma, col="blue", pch=19) #adds the actual data points to the plot -## lines -data(airquality) -plot(airquality$Wind, airquality$Ozone, pch=20, - xlab="Wind (mph)",ylab="Ozone (ppb)") - -## horizontal and vertical lines -abline(h=60, col="red", lty="dashed") -abline(v=seq(3,21,3), col="grey", lty="dotdash") - -## tendency line -abline(lm(airquality$Ozone ~ airquality$Wind), - col=2, lwd=2) - -legend("topright", legend= c("measures","fitted line"), - pch= c(20, NA), lty = c(0, 1), lwd=c(NA, 2), - col = c(1, 2), bg = "gray90") - ## pairplot data(iris) #contains 4 measurements for 150 flowers from 3 species of iris (Iris setosa, versicolor and virginica)​ pairs(iris[,1:4], main="Edgar Anderson's Iris Data", - pch=21, bg=c("red", "green3", "blue")[iris$Species]) + pch=19, col=c("red", "green3", "blue")[iris$Species]) iris$Species as.numeric(iris$Species) @@ -122,57 +131,19 @@ c('red','green','blue')[ c(1,1,1,2,2,2,3,3,3) ] c('red','green','blue')[iris$Species] -## barplot - -course_data <- c(25, 35, 50, 100) -barplot(course_data, main="Number of requests for R courses", - names.arg=c("2011", "2012","2013", "2014"), - col=c("yellow", "orange","red", "blue")) - - -## more complex barplot -df <- data.frame(year = c("2011","2012","2013","2014", - "2011","2012","2013","2014", - "2011","2012","2013","2014", - "2011","2012","2013","2014"), - city = c("A","A","A","A", "B","B","B","B", - "C","C","C","C", "D","D","D","D"), - nb_requests_courses = c(30,36,50,98, 26,35,54,101, - 28,38,51,105, 29,40,55,125)) - -# Check what is inside -df - -mean_nb <- tapply(df$nb_requests_courses, df$year, mean) -sd_nb <- tapply(df$nb_requests_courses, df$year, sd) -n_values <- tapply(df$nb_requests_courses, df$year, length) -mean_nb -sd_nb -n_values - - -mids <- barplot(mean_nb, - xlab="year",ylab="Number of requests for courses", - ylim=c(0,120), - col=c("yellow", "orange","red", "blue"), - cex=1.5, cex.axis=1.5, cex.main=1.5, cex.names=1.5, - main= "Number of requests for R") -mids - -arrows(mids, mean_nb-sd_nb, # coordinates of lower point - mids, mean_nb+sd_nb, # coordinates of upper point - code=3, # type of arrow: "head at both ends" - angle=90, # angle between shaft and head of arrow - length=0.1, # length of edges of arrow head - lwd=2) # line width - - -# Add text at the midpoints and at height 5 on the y-axis: number of observations​ -text(x=mids, y=5, paste("n =",n_values), cex=2) +## par +par(mfrow=c(1,2)) +plot(-90:90 , (-90:90)**2 ) +plot(-90:90 , (-90:90)**3 ) +par(mfrow=c(2,2)) +plot(-90:90 , (-90:90)**1 ) +plot(-90:90 , (-90:90)**2 ) +plot(-90:90 , (-90:90)**3 ) +plot(-90:90 , (-90:90)**4 ) plot( rnorm(10) , rnorm(10) ) par(col="red", pch=15) @@ -205,40 +176,57 @@ dev.off() ### stats ### -data(sleep) -head(sleep) +data(iris) +# we limit the data to 2 species +iris_f = iris[ iris$Species %in% c('versicolor','virginica') , ] +iris_f$Species = factor(iris_f$Species) + +tapply( iris_f$Petal.Length ,iris_f$Species , mean ) + +plen_versicolor = iris_f$Petal.Length[iris_f$Species=='versicolor'] +plen_virginica = iris_f$Petal.Length[iris_f$Species=='virginica'] -summary(sleep) par(mfrow=c(2,2)) -hist(sleep$extra[sleep$group==1], - freq=FALSE, xlab="Drug 1", - main=" Extra sleep on drug 1") -qqnorm(sleep$extra[sleep$group==1]) -qqline(sleep$extra[sleep$group==1]) - -hist(sleep$extra[sleep$group==2], - freq=FALSE, xlab="Drug 2", - main=" Extra sleep on drug 2") -qqnorm(sleep$extra[sleep$group==2]) -qqline(sleep$extra[sleep$group==2]) +hist(plen_versicolor, + xlab="versicolor", + main="petal length - versicolor") +qqnorm(plen_versicolor) +qqline(plen_versicolor) + +hist(plen_virginica, + xlab="virginica", + main="petal length - virginica") +qqnorm(plen_virginica) +qqline(plen_virginica) dev.off() -boxplot(extra ~ group, data=sleep, col=c("orange", "pink"), ylab="Extra sleep", xlab="Drug received") -points(extra ~ group, data = sleep, col="black",pch = 19) +shapiro.test( plen_versicolor ) +shapiro.test( plen_virginica ) + + +boxplot(Petal.Length ~ Species, data=iris_f, col=c("cornflowerblue", "pink"), + ylab="petal length", xlab="species") +points( Petal.Length ~ Species, data=iris_f, col="black",pch = 19) -t.test(sleep$extra[sleep$group==1], - sleep$extra[sleep$group==2]) +t.test(plen_versicolor, + plen_virginica) -t.test(extra ~ group, data=sleep) #equivalent to the above +t.test(Petal.Length ~ Species, data=iris_f) #equivalent to the above -test_res = t.test(sleep$extra[sleep$group==1], - sleep$extra[sleep$group==2]) +test_res = t.test(Petal.Length ~ Species, + data=iris_f) names(test_res) test_res[['p.value']] + +## paired t-test + +data(sleep) +head(sleep) + interaction.plot(response=sleep$extra, x.factor=sleep$group, trace.factor=sleep$ID, legend=FALSE, type="b", lty=1, pch=16, xlab="Drug received", ylab="Extra sleep") @@ -297,15 +285,20 @@ class_data <- read.csv("course_datasets/class.csv") class_data$Gender = as.factor(class_data$Gender) #dataset* of 19 students' measurements summary(class_data) -pairs(class_data) +pairs(class_data[2:4] , col = class_data$Gender, pch=19 ) +# create model model_height_age <- lm(Height~Age, data=class_data) model_height_age +plot(Height~Age, data=class_data) +abline(model_height_age , col='red') + +residuals(model_height_age) + ## check model assumptions -par(mfrow=c(1,2)) -plot(model_height_age, which=1) -plot(model_height_age, which=2) +par(mfrow=c(2,2)) +plot(model_height_age) dev.off() ## plot model @@ -322,3 +315,20 @@ abline(a=0,b=1,lty=2) summary( model_height_age ) + +## model with 2 parameters + + +# create model +model2 <- lm(Height~Age+Gender, data=class_data) + +par(mfrow=c(2,2)) +plot(model2) +dev.off() +summary(model2) + +plot( Height~Age, col=c('turquoise','purple')[Gender], pch=19, data=class_data ) +C = coef( model2 ) +abline(a=C[1],b=C[2] , col='turquoise',lwd=2) +abline(a=C[1]+C[3],b=C[2] , col='purple',lwd=2) +legend('bottomright',c('female','male'),pch=19,lwd=2,col=c('turquoise','purple')) diff --git a/slides/First-steps-with-R_day2_afternoon.pdf b/slides/First-steps-with-R_day2_afternoon.pdf index fb62831..3812d17 100644 Binary files a/slides/First-steps-with-R_day2_afternoon.pdf and b/slides/First-steps-with-R_day2_afternoon.pdf differ diff --git a/slides/First-steps-with-R_day2_morning.pdf b/slides/First-steps-with-R_day2_morning.pdf index d002696..95a87ff 100644 Binary files a/slides/First-steps-with-R_day2_morning.pdf and b/slides/First-steps-with-R_day2_morning.pdf differ diff --git a/slides/First-steps-with-R_practicals_day1.pdf b/slides/First-steps-with-R_practicals_day1.pdf index cda1e28..e6917c6 100644 Binary files a/slides/First-steps-with-R_practicals_day1.pdf and b/slides/First-steps-with-R_practicals_day1.pdf differ diff --git a/slides/First-steps-with-R_practicals_day2.pdf b/slides/First-steps-with-R_practicals_day2.pdf index 0c6a1a3..b4f1079 100644 Binary files a/slides/First-steps-with-R_practicals_day2.pdf and b/slides/First-steps-with-R_practicals_day2.pdf differ diff --git a/slides/exam_R.pdf b/slides/exam_R.pdf index 1286640..ed606c7 100644 Binary files a/slides/exam_R.pdf and b/slides/exam_R.pdf differ diff --git a/solutions/R_practice10_solution.R b/solutions/R_practice10_solution.R index c9bc42d..9dc898f 100644 --- a/solutions/R_practice10_solution.R +++ b/solutions/R_practice10_solution.R @@ -1,31 +1,95 @@ - -# First Steps in R, Practice 10 - -# 1) Load the package MASS using library(). (You may need to install it first). -# Load the dataset Pima.tr using data(). Use ? to get an idea which variables it contains. - -library(MASS) -data(Pima.tr) -?Pima.tr - - -#2) Hypothesis: Blood glucose level (glu) is associated with diastolic blood pressure (bp). -# Run a linear model to test the hypothesis. - -model_fit <- lm(bp~glu, data=Pima.tr) -summary(model_fit) - - -# the linear association between glucose and blood pressure is significant at the 0.05 alpha level -# (p = 0.000115) - -# 3) Visualize the fit with a scatter plot and a trend line. -par(mfrow=c(1,1)) -plot(Pima.tr$glu, Pima.tr$bp, pch=20, xlab="blood glucose", ylab="blood pressure") -abline(model_fit, col='red' , lwd=3) - -# 4) Check the assumption of normality of the residuals graphically. -par(mfrow=c(2,1)) -plot(model_fit, which=1) -plot(model_fit, which=2) -# Looks quite ok +# First Steps in R, Practice 10 + +# Come back to the mice data-set stored in the "mice_data" data frame. + +mice_data <- read.table( "course_datasets/mice_data.csv" , header=T , sep=",", + colClasses=c("factor", "factor", "numeric")) # define classes for columns ) + +mice_data$genotype <- factor(mice_data$genotype, levels=c("WT", "KO")) + + +# 1) Considering WT mice weight and KO mice weight separately, check the assumption of normality graphically. + + +KO_weights = mice_data$weight[ mice_data$genotype=="KO"] +WT_weights = mice_data$weight[ mice_data$genotype=="WT"] + +par(mfrow=c(2,2)) + +hist(KO_weights, freq=FALSE, + xlab="KO", main="weights of KO mice") +qqnorm(KO_weights) +qqline(KO_weights) + +hist(WT_weights, freq=FALSE, + xlab="WT", main="weights of WT mice") +qqnorm(WT_weights) +qqline(WT_weights) + +par(mfrow=c(1,1)) # removes setting in par + + +# KO mice: normality assumption seems ok + +# WT mice: +# histogram - suggests no normality (but small sample size for this) +# QQ Plot - quite ok + + + +# Welch's t-test is fairly robust if the deviation from normality is not extreme, +# so let's continue + +# 2) Make an appropriate plot to visualize the mouse weights grouped by genotype. + +boxplot(weight ~ genotype, data=mice_data , + col=c('orange','blue'), + main="mice weight by genotype") + +# 3) Perform a test to see whether the mouse weight is different between the two genotypes. +t.test(KO_weights, WT_weights) #NB : here the data is not paired : WT and KO mice are different individuals +# we do not reject HO : the is no significant difference between WT and KO mice mean weights for any reasonable significance level +# (p=0.8514) + + +# Alternative: Use non-parametric test +# Does not assume normally distributed data +# However, assumes the same standard deviation in all groups (while Welch's t test does not) +# Therefore, probably not better + +wilcox.test(weight ~ genotype, data=mice_data) +# no significative difference between the mean weights of WT and KO mice + + + + +# 4) Repeat step 1 to 3 for the diet variable. + +HFD_weights = mice_data$weight[mice_data$diet=="HFD"] +CHOW_weights = mice_data$weight[mice_data$diet=="CHOW"] + +par(mfrow=c(2,2)) + +hist(HFD_weights, freq=FALSE, + xlab="HFD", main="weights of HFD mice") +qqnorm(HFD_weights) +qqline(HFD_weights) + +hist(CHOW_weights, freq=FALSE, + xlab="CHOW", main="weights of CHOW mice") +qqnorm(CHOW_weights) +qqline(CHOW_weights) + +par(mfrow=c(1,1)) +# all seems OK + +boxplot(weight ~ diet, data=mice_data, col=c('orange','blue'), main="Mouse Weight by Diet") + +t.test(HFD_weights, CHOW_weights ) #NB : here the data is not paired : HFD and CHOW mice are different individuals +# we can reject HO : there is a significant difference between HFD and CHOW mice mean weights at significance level p=0.05 +# (CHOW weight are on average lower) + + +# Out of curiosity, let's see what result we would get with the non-parametric Whitney Mann U test (implemented in wilcox.test) +wilcox.test(weight ~ diet, data=mice_data) + diff --git a/solutions/R_practice11_solution.R b/solutions/R_practice11_solution.R new file mode 100644 index 0000000..fbbff86 --- /dev/null +++ b/solutions/R_practice11_solution.R @@ -0,0 +1,31 @@ + +# First Steps in R, Practice 11 + +# 1) Load the package MASS using library(). (You may need to install it first). +# Load the dataset Pima.tr using data(). Use ? to get an idea which variables it contains. + +library(MASS) +data(Pima.tr) +?Pima.tr + + +#2) Hypothesis: Blood glucose level (glu) is associated with diastolic blood pressure (bp). +# Run a linear model to test the hypothesis. + +model_fit <- lm(bp~glu, data=Pima.tr) +summary(model_fit) + + +# the linear association between glucose and blood pressure is significant at the 0.05 alpha level +# (p = 0.000115) + +# 3) Visualize the fit with a scatter plot and a trend line. +par(mfrow=c(1,1)) +plot(Pima.tr$glu, Pima.tr$bp, pch=20, xlab="blood glucose", ylab="blood pressure") +abline(model_fit, col='red' , lwd=3) + +# 4) Check the assumption of normality of the residuals graphically. +par(mfrow=c(2,1)) +plot(model_fit, which=1) +plot(model_fit, which=2) +# Looks quite ok diff --git a/solutions/R_practice7_solution.R b/solutions/R_practice7_solution.R index 4c5dd82..546583e 100644 --- a/solutions/R_practice7_solution.R +++ b/solutions/R_practice7_solution.R @@ -1,90 +1,48 @@ -# First Steps in R, Practice 7 - -# Import the mouse data from the file mice_data_mod.csv. -# - - -# 1) Check your data frame: did it load correctly? Make sure genotype and diet are factor variables. - -getwd() # check where you are. If you didn't change anything, you will be in the folder with the .Rproj file (rpoject root) - -mice_data <- read.table("course_datasets/mice_data_mod.csv", header=TRUE, sep=",") # define classes for columns - -str(mice_data) - - -# 2) Convert genotype and diet to factor variables -# define the order of factor levels -mice_data$genotype <- factor(mice_data$genotype, levels=c("WT", "KO")) -mice_data$diet <- factor(mice_data$diet, levels=c("CHOW", "HFD")) - -# - -# 3) Plot an histogram of mouse weight and customize it with colours, labels, title and represent the density line on top. -hist(mice_data$weight, - freq=FALSE, breaks=8, - main="Mouse Weight", - col="orange" , - xlab="weight") -lines(density(mice_data$weight), col='blue') -# Note: freq=FALSE makes the histogram density based, which makes it scale well with the density line - - -# 4) Make a scatter plot of respiratory rate against mouse weights using the function plot(). -# Function arguments: -# - use solid circles as plotting symbol -# - add a title -# - customise the axis labels (“Weight [g]”, “Respiratory Rate [bpm]”) -# - colour the points by genotype. -# Add a legend for the genotype. - - -plot(mice_data$weight,mice_data$respiratoryRate, - pch=19, - main="Respiratory Rate vs Weight in Mice", - xlab="Weight [g]", ylab="Respiratory Rate [bpm]", - col=c("orange", "blue")[mice_data$genotype] - ) - -legend("bottomright", - legend=levels(mice_data$genotype), - col=c("orange","blue"), - pch=19) - -abline(lm(mice_data$respiratoryRate ~ mice_data$weight), - col="black", lwd=1.5) - -# 5) Make boxplots of weights from WT and KO mice. Customize with title, labels, colors. - -boxplot(weight ~ genotype, data= mice_data, - col=c("orange", "blue"), - main="Mouse Weight by Genotype" -) -points(weight ~ genotype, data= mice_data) - - -# 6) Optional: Repeat 4 with diet instead of genotype. - - -plot(mice_data$weight,mice_data$respiratoryRate, - pch=19, - main="Respiratory Rate vs Weight in Mice", - xlab="Weight [g]", ylab="Respiratory Rate [bpm]", - col=c("darkred", "goldenrod")[mice_data$diet] -) - -legend("bottomright", - legend=levels(mice_data$diet), - col=c("darkred","goldenrod"), - pch=19) - -abline(lm(mice_data$respiratoryRate ~ mice_data$weight), - col="black", lwd=1.5) - -# 7) Optional: Repeat 5 with diet instead of genotype. - -boxplot(weight ~ diet, data= mice_data, - col=c("darkred", "goldenrod"), - main="Mouse Weight by Diet" -) -points(weight ~ diet, data= mice_data) \ No newline at end of file +# First Steps in R, Practice 7 + +# Import the mouse data from the file mice_data_mod.csv. +# + + +# 1) Check your data frame: did it load correctly? Make sure genotype and diet are factor variables. + +getwd() # check where you are. If you didn't change anything, you will be in the folder with the .Rproj file (rpoject root) + +mice_data <- read.table("course_datasets/mice_data_mod.csv", header=TRUE, sep=",") # define classes for columns + +str(mice_data) + + +# 2) Convert genotype and diet to factor variables +# define the order of factor levels +mice_data$genotype <- factor(mice_data$genotype, levels=c("WT", "KO")) +mice_data$diet <- factor(mice_data$diet, levels=c("CHOW", "HFD")) + + + + +# 3) Make a scatter plot of respiratory rate against mouse weights using the function plot(). +# Function arguments: +# - use solid circles as plotting symbol +# - add a title +# - customise the axis labels (“Weight [g]”, “Respiratory Rate [bpm]”) +# - colour the points by genotype. + +plot(mice_data$weight,mice_data$respiratoryRate, + pch=19, + main="Respiratory Rate vs Weight in Mice", + xlab="Weight [g]", ylab="Respiratory Rate [bpm]", + col=c("orange", "blue")[mice_data$genotype] + ) + +# 4) Fit a trend line using the function abline() +abline(lm(mice_data$respiratoryRate ~ mice_data$weight), + col="black", lwd=1.5) + +# 5) Add a legend for the genotype. +legend("bottomright", + legend=levels(mice_data$genotype), + col=c("orange","blue"), + pch=19) + + diff --git a/solutions/R_practice8_solution.R b/solutions/R_practice8_solution.R index 70c748e..beef408 100644 --- a/solutions/R_practice8_solution.R +++ b/solutions/R_practice8_solution.R @@ -1,84 +1,30 @@ -# First Steps in R, Practice 8 - -# 1) Make a multi-panel figure with the four graphics on one page, exporting the figure to a png file. -# Set width and height arguments in the call to png() to make it look nice. - -pdf("mice_data_plots_by_genotype.pdf", width=7, height=7, paper="a4") - -# we want to make 4 plots on the same panel -> 2 rows and 2 columns -par(mfrow=c(2,2)) - -# Plot 1 -plot(mice_data$weight,mice_data$respiratoryRate, - pch=19, - main="Respiratory Rate vs Weight in Mice", - xlab="Weight [g]", ylab="Respiratory Rate [bpm]", - col=c("orange", "blue")[mice_data$genotype] -) - -legend("bottomright", - legend=levels(mice_data$genotype), - col=c("orange","blue"), - pch=19) - -abline(lm(mice_data$respiratoryRate ~ mice_data$weight), - col="black", lwd=1.5) - -# Plot 2 -boxplot(weight ~ genotype, data= mice_data, - col=c("orange", "blue"), - main="Mouse Weight by Genotype" -) -points(weight ~ genotype, data= mice_data) - - -# Plot 3 -plot(mice_data$weight,mice_data$respiratoryRate, - pch=19, - main="Respiratory Rate vs Weight in Mice", - xlab="Weight [g]", ylab="Respiratory Rate [bpm]", - col=c("darkred", "goldenrod")[mice_data$diet] -) - -legend("bottomright", - legend=levels(mice_data$diet), - col=c("darkred","goldenrod"), - pch=19) - -abline(lm(mice_data$respiratoryRate ~ mice_data$weight), - col="black", lwd=1.5) - -# Plot4 -boxplot(weight ~ diet, data= mice_data, - col=c("darkred", "goldenrod"), - main="Mouse Weight by Diet" -) -points(weight ~ diet, data= mice_data) - -dev.off() - - -# 2) Optional: Export the histogram (3 from previous exercise) to a png file. -# Set width and height arguments in the call to png() to make it look nice. - -# png: width and height are in pixels by default - -png("hist_weight.png", width=800, height=600) -hist(mice_data$weight, - freq=FALSE, breaks=8, - main="Mouse Weight", - col="orange" , - xlab="weight") -lines(density(mice_data$weight), col='blue') -dev.off() - -# 3) Look at the multi-panel figure. Are your impressions about mouse weight from yesterday's exploration of data summaries confirmed by today's visualizations? -# Yes. -# Genotpye: The two genotypes (KO v WT) have very similar mean weights. -# Diet: Mice on HFD are heavier on average than mice on CHOW diet, and their weights are more variable. - - - - - - +mice_data <- read.table("course_datasets/mice_data_mod.csv", header=TRUE, sep=",") # define classes for columns +mice_data$genotype <- factor(mice_data$genotype, levels=c("WT", "KO")) +mice_data$diet <- factor(mice_data$diet, levels=c("CHOW", "HFD")) + + +# 1) Plot an histogram of mouse weight and customize it with colours, labels, title and represent the density line on top. +hist(mice_data$weight, + freq=FALSE, breaks=8, + main="Mouse Weight", + col="orange" , + xlab="weight") +lines(density(mice_data$weight), col='blue') +# Note: freq=FALSE makes the histogram density based, which makes it scale well with the density line + +# 2) Make boxplots of weights from WT and KO mice. Customize with title, labels, colors. + +boxplot(weight ~ genotype, data= mice_data, + col=c("orange", "blue"), + main="Mouse Weight by Genotype" +) +points(weight ~ genotype, data= mice_data) + + +# 3) Optional: Repeat 2 with diet instead of genotype. + +boxplot(weight ~ diet, data= mice_data, + col=c("darkred", "goldenrod"), + main="Mouse Weight by Diet" +) +points(weight ~ diet, data= mice_data) \ No newline at end of file diff --git a/solutions/R_practice9_solution.R b/solutions/R_practice9_solution.R index 69cc540..30f0b53 100644 --- a/solutions/R_practice9_solution.R +++ b/solutions/R_practice9_solution.R @@ -1,95 +1,84 @@ -# First Steps in R, Practice 9 - -# Come back to the mice data-set stored in the "mice_data" data frame. - -mice_data <- read.table( "course_datasets/mice_data.csv" , header=T , sep=",", - colClasses=c("factor", "factor", "numeric")) # define classes for columns ) - -mice_data$genotype <- factor(mice_data$genotype, levels=c("WT", "KO")) - - -# 1) Considering WT mice weight and KO mice weight separately, check the assumption of normality graphically. - - -KO_weights = mice_data$weight[ mice_data$genotype=="KO"] -WT_weights = mice_data$weight[ mice_data$genotype=="WT"] - -par(mfrow=c(2,2)) - -hist(KO_weights, freq=FALSE, - xlab="KO", main="weights of KO mice") -qqnorm(KO_weights) -qqline(KO_weights) - -hist(WT_weights, freq=FALSE, - xlab="WT", main="weights of WT mice") -qqnorm(WT_weights) -qqline(WT_weights) - -par(mfrow=c(1,1)) # removes setting in par - - -# KO mice: normality assumption seems ok - -# WT mice: -# histogram - suggests no normality (but small sample size for this) -# QQ Plot - quite ok - - - -# Welch's t-test is fairly robust if the deviation from normality is not extreme, -# so let's continue - -# 2) Make an appropriate plot to visualize the mouse weights grouped by genotype. - -boxplot(weight ~ genotype, data=mice_data , - col=c('orange','blue'), - main="mice weight by genotype") - -# 3) Perform a test to see whether the mouse weight is different between the two genotypes. -t.test(KO_weights, WT_weights) #NB : here the data is not paired : WT and KO mice are different individuals -# we do not reject HO : the is no significant difference between WT and KO mice mean weights for any reasonable significance level -# (p=0.8514) - - -# Alternative: Use non-parametric test -# Does not assume normally distributed data -# However, assumes the same standard deviation in all groups (while Welch's t test does not) -# Therefore, probably not better - -wilcox.test(weight ~ genotype, data=mice_data) -# no significative difference between the mean weights of WT and KO mice - - - - -# 4) Repeat step 1 to 3 for the diet variable. - -HFD_weights = mice_data$weight[mice_data$diet=="HFD"] -CHOW_weights = mice_data$weight[mice_data$diet=="CHOW"] - -par(mfrow=c(2,2)) - -hist(HFD_weights, freq=FALSE, - xlab="HFD", main="weights of HFD mice") -qqnorm(HFD_weights) -qqline(HFD_weights) - -hist(CHOW_weights, freq=FALSE, - xlab="CHOW", main="weights of CHOW mice") -qqnorm(CHOW_weights) -qqline(CHOW_weights) - -par(mfrow=c(1,1)) -# all seems OK - -boxplot(weight ~ diet, data=mice_data, col=c('orange','blue'), main="Mouse Weight by Diet") - -t.test(HFD_weights, CHOW_weights ) #NB : here the data is not paired : HFD and CHOW mice are different individuals -# we can reject HO : there is a significant difference between HFD and CHOW mice mean weights at significance level p=0.05 -# (CHOW weight are on average lower) - - -# Out of curiosity, let's see what result we would get with the non-parametric Whitney Mann U test (implemented in wilcox.test) -wilcox.test(weight ~ diet, data=mice_data) - +# First Steps in R, Practice 9 + +# 1) Make a multi-panel figure with the four graphics on one page, exporting the figure to a png file. +# Set width and height arguments in the call to png() to make it look nice. + +pdf("mice_data_plots_by_genotype.pdf", width=7, height=7, paper="a4") + +# we want to make 4 plots on the same panel -> 2 rows and 2 columns +par(mfrow=c(2,2)) + +# Plot 1 +plot(mice_data$weight,mice_data$respiratoryRate, + pch=19, + main="Respiratory Rate vs Weight in Mice", + xlab="Weight [g]", ylab="Respiratory Rate [bpm]", + col=c("orange", "blue")[mice_data$genotype] +) + +legend("bottomright", + legend=levels(mice_data$genotype), + col=c("orange","blue"), + pch=19) + +abline(lm(mice_data$respiratoryRate ~ mice_data$weight), + col="black", lwd=1.5) + +# Plot 2 +boxplot(weight ~ genotype, data= mice_data, + col=c("orange", "blue"), + main="Mouse Weight by Genotype" +) +points(weight ~ genotype, data= mice_data) + + +# Plot 3 +plot(mice_data$weight,mice_data$respiratoryRate, + pch=19, + main="Respiratory Rate vs Weight in Mice", + xlab="Weight [g]", ylab="Respiratory Rate [bpm]", + col=c("darkred", "goldenrod")[mice_data$diet] +) + +legend("bottomright", + legend=levels(mice_data$diet), + col=c("darkred","goldenrod"), + pch=19) + +abline(lm(mice_data$respiratoryRate ~ mice_data$weight), + col="black", lwd=1.5) + +# Plot4 +boxplot(weight ~ diet, data= mice_data, + col=c("darkred", "goldenrod"), + main="Mouse Weight by Diet" +) +points(weight ~ diet, data= mice_data) + +dev.off() + + +# 2) Optional: Export the histogram (3 from previous exercise) to a png file. +# Set width and height arguments in the call to png() to make it look nice. + +# png: width and height are in pixels by default + +png("hist_weight.png", width=800, height=600) +hist(mice_data$weight, + freq=FALSE, breaks=8, + main="Mouse Weight", + col="orange" , + xlab="weight") +lines(density(mice_data$weight), col='blue') +dev.off() + +# 3) Look at the multi-panel figure. Are your impressions about mouse weight from yesterday's exploration of data summaries confirmed by today's visualizations? +# Yes. +# Genotpye: The two genotypes (KO v WT) have very similar mean weights. +# Diet: Mice on HFD are heavier on average than mice on CHOW diet, and their weights are more variable. + + + + + +