From d20b0bd3983ed93ced9075495a9c73a42b58ac3a Mon Sep 17 00:00:00 2001 From: Sue McClatchy Date: Fri, 1 Nov 2024 12:10:26 -0400 Subject: [PATCH] reformat text and code --- episodes/differential-expression-analysis.Rmd | 541 ++++++++++-------- 1 file changed, 316 insertions(+), 225 deletions(-) diff --git a/episodes/differential-expression-analysis.Rmd b/episodes/differential-expression-analysis.Rmd index 7d3892e..f7329ed 100644 --- a/episodes/differential-expression-analysis.Rmd +++ b/episodes/differential-expression-analysis.Rmd @@ -53,17 +53,17 @@ suppressPackageStartupMessages(library("tidyverse")) suppressPackageStartupMessages(library("dplyr")) ``` -**Reading Gene Expression Count matrix from Previous Lesson** +**Reading Gene Expression Count matrix from previous lesson** -In this lesson, we will use the raw counts matrix and metadata -downloaded in the previous lesson and will perform differential -expression analysis. +In this lesson, we will use the raw counts matrix and metadata downloaded in the +previous lesson and will perform differential expression analysis. ```{r} counts <- read.delim("data/htseqcounts_5XFAD.txt", check.names = FALSE) ``` **Reading Sample Metadata from Previous Lesson** + ```{r} covars <- readRDS("data/covars_5XFAD.rds") ``` @@ -71,16 +71,19 @@ covars <- readRDS("data/covars_5XFAD.rds") Let’s explore the data: Let’s look at the top of the metadata. + ```{r} head(covars) ``` identify distinct groups using sample metadata + ```{r} distinct(covars, sex, genotype, timepoint) ``` How many mice were used to produce this data? + ```{r} covars %>% group_by(sex, genotype, timepoint) %>% @@ -88,74 +91,77 @@ covars %>% ``` How many rows and columns are there in counts? + ```{r} dim(counts) ``` -In the counts matrix, genes are in rows and samples are in columns. -Let’s look at the first few rows. +In the counts matrix, genes are in rows and samples are in columns. Let’s look +at the first few rows. + ```{r} head(counts, n=5) ``` -As you can see, the gene ids are ENSEMBL IDs. There is some risk that -these may not be unique. Let’s check whether any of the gene symbols are -duplicated. We will sum the number of duplicated gene symbols. +As you can see, the gene ids are ENSEMBL IDs. There is some risk that these may +not be unique. Let’s check whether any of the gene symbols are duplicated. We +will sum the number of duplicated gene symbols. + ```{r} sum(duplicated(rownames(counts))) ``` -The sum equals zero, so there are no duplicated gene symbols, which is -good. Similarly, samples should be unique. Once again, let’s verify -this: +The sum equals zero, so there are no duplicated gene symbols, which is good. +Similarly, samples should be unique. Once again, let’s verify this: + ```{r} sum(duplicated(colnames(counts))) ``` **Formatting the count matrix** -Now, as we see that gene_id is in first column of count matrix, but we -will need only count data in matrix, so we will change the gene_id -column to rownames. Converting the gene_id as rownames of count matrix +Now, as we see that `gene_id` is in first column of count matrix, but we will +need only count data in matrix, so we will change the `gene_id` column to +`rownames`. ```{r} +# Converting the `gene_id` as `rownames` of `counts` matrix counts <- counts %>% column_to_rownames(., var="gene_id") %>% as.data.frame() ``` -let’s confirm if change is done correctly +Let’s confirm if change is done correctly. ```{r} head(counts, n=5) ``` -As you can see from count table there are some genes that start with -“ENSG” and others start with “ENSMUSG”. “ENSG” referes to human gene -ENSEMBL id and “ENSMUSG” refer to mouse ENSEMBL id. Let’s check how many -gene_ids are NOT from the mouse genome by searching for the string “MUS” -(as in Mus musculus) in the rownames of count matrix. +As you can see from count table there are some genes that start with `ENSG` and +others start with `ENSMUSG`. `ENSG` refers to human gene ENSEMBL id and +`ENSMUSG` refer to mouse ENSEMBL id. Let’s check how many `gene_ids` are NOT +from the mouse genome by searching for the string "MUS" (as in *Mus musculus*) +in the `rownames` of the `counts` matrix. ```{r} counts[, 1:6] %>% filter(!str_detect(rownames(.), "MUS")) ``` -Ok, so we see there are two human genes in out count matrix. Why? What -genes are they? +Ok, so we see there are two human genes in out count matrix. Why? What genes are +they? -Briefly, 5xFAD mouse strain harbors two human transgenes APP -(“ENSG00000142192”) and PSEN1 (“ENSG00000080815”) and inserted into exon -2 of the mouse Thy1 gene. To validate 5XFAD strain and capture -expression of human transgene APP and PS1, a custom mouse genomic -sequence was created and we quantified expression of human as well as -mouse App (“ENSMUSG00000022892”) and Psen1 (“ENSMUSG00000019969”) genes -by our MODEL-AD RNA-Seq pipeline. +Briefly, the 5xFAD mouse strain harbors two human transgenes APP +(`ENSG00000142192`) and PSEN1 (`ENSG00000080815`) and inserted into exon 2 of +the mouse Thy1 gene. To validate 5XFAD strain and capture expression of human +transgene APP and PS1, a custom mouse genomic sequence was created and we +quantified expression of human as well as mouse App (`ENSMUSG00000022892`) and +Psen1 (`ENSMUSG00000019969`) genes by our MODEL-AD RNA-Seq pipeline. **Validation of 5xFAD mouse strain** -First we convert the dataframe to -longer format and join our covariates by MouseID +First we convert the dataframe to longer format and join our covariates by +`MouseID`. ```{r} count_tpose <- counts %>% @@ -197,85 +203,85 @@ Visualize orthologous genes. ```{r} #Create simple box plots showing normalized counts by genotype and time point, faceted by sex. count_tpose %>% - ggplot(aes(x=timepoint, y=counts, color=genotype)) + - geom_boxplot() + - geom_point(position=position_jitterdodge()) + - facet_wrap(~sex+gene_id) +theme_bw() + ggplot(aes(x = timepoint, y = counts, color = genotype)) + + geom_boxplot() + + geom_point(position = position_jitterdodge()) + + facet_wrap(~ sex + gene_id) + + theme_bw() ``` -You will notice expression of Human APP is higher in 5XFAD carriers but -lower in non-carriers. However mouse App expressed in both 5XFAD carrier -and non-carrier. +You will notice expression of Human APP is higher in 5XFAD carriers but lower in +non-carriers. However mouse App expressed in both 5XFAD carrier and non-carrier. + +We are going to sum the counts from both orthologous genes (human APP and mouse +App; human PSEN1 and mouse Psen1) and save the summed expression as expression +of mouse genes, respectively to match with gene names in control mice. -We are going to sum the counts from both ortholgous genes (human APP and -mouse App; human PSEN1 and mouse Psen1) and save summed expression as -expression of mouse genes, respectively to match with gene names in -control mice. ```{r} # merge mouse and human APP gene raw count -counts[rownames(counts) %in% "ENSMUSG00000022892",] <- -counts[rownames(counts) %in% "ENSMUSG00000022892",] + -counts[rownames(counts) %in% "ENSG00000142192",] +counts[rownames(counts) %in% "ENSMUSG00000022892", ] <- + counts[rownames(counts) %in% "ENSMUSG00000022892", ] + + counts[rownames(counts) %in% "ENSG00000142192", ] + counts <- counts[!rownames(counts) %in% c("ENSG00000142192"), ] # merge mouse and human PS1 gene raw count -counts[rownames(counts) %in% "ENSMUSG00000019969",] <- -counts[rownames(counts) %in% "ENSMUSG00000019969",] + -counts[rownames(counts) %in% "ENSG00000080815",] +counts[rownames(counts) %in% "ENSMUSG00000019969", ] <- + counts[rownames(counts) %in% "ENSMUSG00000019969", ] + + counts[rownames(counts) %in% "ENSG00000080815", ] + counts <- counts[!rownames(counts) %in% c("ENSG00000080815"), ] ``` Let’s verify if expression of both human genes have been merged or not: + ```{r} counts[, 1:6] %>% filter(!str_detect(rownames(.), "MUS")) ``` What proportion of genes have zero counts in all samples? + ```{r} gene_sums <- data.frame(gene_id = rownames(counts), sums = Matrix::rowSums(counts)) sum(gene_sums$sums == 0) ``` -We can see that 9691 (17%) genes have no reads at all associated with -them. In the next lesson, we will remove genes that have no counts in -any samples. - +We can see that 9,691 (17%) genes have no reads at all associated with them. In +the next lesson, we will remove genes that have no counts in any samples. ## Differential Analysis using DESeq2 -Now, after exploring and formatting -the data, We will look for differential expression between the control -and 5xFAD mice at different ages for both sexes. The differentially -expressed genes (DEGs) can inform our understanding of how the 5XFAD -mutation affect the biological processes. +Now, after exploring and formatting the data, We will look for differential +expression between the control and 5xFAD mice at different ages for both sexes. +The differentially expressed genes (DEGs) can inform our understanding of how +the 5XFAD mutation affects biological processes. -DESeq2 analysis consist of multiple steps. We are going to briefly -understand some of the important steps using subset of data and then we -will perform differential analysis on whole dataset. +DESeq2 analysis consist of multiple steps. We are going to briefly understand +some of the important steps using a subset of data and then we will perform differential analysis on the whole dataset. -First, order the data (so counts and metadata specimenID orders match) and save -as another variable name. +First, order the data (so counts and metadata `specimenID` orders match) and +save as another variable name. ```{r} rawdata <- counts[, sort(colnames(counts))] metadata <- covars[sort(rownames(covars)), ] ``` -subset the counts matrix and sample metadata to include only 12 month -old male mice. You can amend the code to compare wild type and 5XFAD -mice from either sex, at any time point. +Subset the counts matrix and sample metadata to include only 12-month old male +mice. You can amend the code to compare wild type and 5XFAD mice from either +sex, at any time point. ```{r} -meta.12M.Male <- metadata[(metadata$sex=="male" & - metadata$timepoint=='12 mo'), ] +meta.12M.Male <- metadata[(metadata$sex == "male" & + metadata$timepoint == "12 mo"), ] meta.12M.Male ``` ```{r} -dat <- as.matrix(rawdata[,colnames(rawdata) %in% rownames(meta.12M.Male)]) +dat <- as.matrix(rawdata[ , colnames(rawdata) %in% rownames(meta.12M.Male)]) colnames(dat) ``` @@ -284,14 +290,15 @@ rownames(meta.12M.Male) ``` ```{r} -match(colnames(dat),rownames(meta.12M.Male)) +match(colnames(dat), rownames(meta.12M.Male)) ``` -Next, we build the DESeqDataSet using the following function: +Next, we build the `DESeqDataSet` using the following function: + ```{r} -ddsHTSeq <- DESeqDataSetFromMatrix(countData=dat, - colData=meta.12M.Male, - design = ~genotype) +ddsHTSeq <- DESeqDataSetFromMatrix(countData = dat, + colData = meta.12M.Male, + design = ~ genotype) ``` ```{r} @@ -300,86 +307,90 @@ ddsHTSeq **Pre-filtering** -While it is not necessary to pre-filter low count genes -before running the DESeq2 functions, there are two reasons which make -pre-filtering useful: by removing rows in which there are very few -reads, we reduce the memory size of the dds data object, and we increase -the speed of the transformation and testing functions within DESeq2. It -can also improve visualizations, as features with no information for -differential expression are not plotted. +While it is not necessary to pre-filter low count genes before running the +DESeq2 functions, there are two reasons which make pre-filtering useful: by +removing rows in which there are very few reads, we reduce the memory size of +the dds data object, and we increase the speed of the transformation and testing +functions within DESeq2. It can also improve visualizations, as features with no +information for differential expression are not plotted. + +Here we perform a minimal pre-filtering to keep only rows that have at least 10 +reads total. -Here we perform a minimal pre-filtering to keep only rows that have at -least 10 reads total. ```{r} -ddsHTSeq <- ddsHTSeq[rowSums(counts(ddsHTSeq)) >= 10,] +ddsHTSeq <- ddsHTSeq[rowSums(counts(ddsHTSeq)) >= 10, ] ddsHTSeq ``` **Reference level** -By default, R will choose a reference level for factors -based on alphabetical order. Then, if you never tell the DESeq2 -functions which level you want to compare against (e.g. which level -represents the control group), the comparisons will be based on the -alphabetical order of the levels. +By default, R will choose a reference level for factors based on alphabetical +order. Then, if you never tell the `DESeq2` functions which level you want to +compare against (*e.g.* which level represents the control group), the +comparisons will be based on the alphabetical order of the levels. -specifying the reference-level to 5XFAD_noncarrier: ```{r} -ddsHTSeq$genotype <- relevel(ddsHTSeq$genotype,ref="5XFAD_noncarrier") +# specifying the reference-level to `5XFAD_noncarrier` +ddsHTSeq$genotype <- relevel(ddsHTSeq$genotype, ref = "5XFAD_noncarrier") ``` Run the standard differential expression analysis steps that is wrapped -into a single function, DESeq. +into a single function, `DESeq`. + ```{r} -dds <- DESeq(ddsHTSeq,parallel = TRUE) +dds <- DESeq(ddsHTSeq, parallel = TRUE) ``` -Results tables are generated using the function results, which extracts -a results table with log2 fold changes, p values and adjusted p values. -By default the argument alpha is set to 0.1. If the adjusted p value -cutoff will be a value other than 0.1, alpha should be set to that -value: +Results tables are generated using the function results, which extracts a +results table with log2 fold changes, p-values and adjusted p-values. By default +the argument `alpha` is set to 0.1. If the adjusted p-value cutoff will be a +value other than 0.1, alpha should be set to that value: + ```{r} -res <- results(dds,alpha=0.05) # setting 0.05 as significant threshold +res <- results(dds, alpha=0.05) # setting 0.05 as significant threshold res ``` -We can order our results table by the smallest p value: +We can order our results table by the smallest p-value: + ```{r} -resOrdered <- res[order(res$pvalue),] +resOrdered <- res[order(res$pvalue), ] -head(resOrdered,n=10) +head(resOrdered, n=10) ``` -we can summarize some basic tallies using the summary function. +We can summarize some basic tallies using the summary function. + ```{r} summary(res) ``` How many adjusted p-values were less than 0.05? + ```{r} sum(res$padj < 0.05, na.rm=TRUE) ``` How many adjusted p-values were less than 0.1? + ```{r} sum(res$padj < 0.1, na.rm=TRUE) ``` **Function to convert ensembleIDs to common gene names** -We’ll use a package -to translate mouse ENSEMBL IDS to gene names. Run this function and they -will be called up when assembling results from the differential -expression analysis +We’ll use a package to translate mouse ENSEMBL IDS to gene names. Run this +function and they will be called up when assembling results from the +differential expression analysis. + ```{r} map_function.df <- function(x, inputtype, outputtype) { mapIds( org.Mm.eg.db, - keys = row.names(x), - column = outputtype, - keytype = inputtype, + keys = row.names(x), + column = outputtype, + keytype = inputtype, multiVals = "first" ) } @@ -387,25 +398,39 @@ map_function.df <- function(x, inputtype, outputtype) { **Generating Results table** -Here we will call the function to get the 'symbol' names of the genes incorporated into the results table, along with the columns we are most interested in +Here we will call the function to get the `symbol` names of the genes +incorporated into the results table, along with the columns we are most +interested in. + ```{r} All_res <- as.data.frame(res) %>% - mutate(symbol = map_function.df(res,"ENSEMBL","SYMBOL")) %>% ##run map_function to add symbol of gene corresponding to ENSEBL ID - mutate(EntrezGene = map_function.df(res,"ENSEMBL","ENTREZID")) %>% ##run map_function to add Entrez ID of gene corresponding to ENSEBL ID - dplyr::select("symbol", "EntrezGene","baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj") + # run map_function to add symbol of gene corresponding to ENSEMBL ID + mutate(symbol = map_function.df(res, "ENSEMBL", "SYMBOL")) %>% + # run map_function to add Entrez ID of gene corresponding to ENSEMBL ID + mutate(EntrezGene = map_function.df(res, "ENSEMBL", "ENTREZID")) %>% + dplyr::select("symbol", + "EntrezGene", + "baseMean", + "log2FoldChange", + "lfcSE", + "stat", + "pvalue", + "padj") head(All_res) ``` **Extracting genes that are significantly expressed** -Let's subset all the genes with a pvalue < 0.05 +Let's subset all the genes with a p-value < 0.05. + ```{r} dseq_res <- subset(All_res[order(All_res$padj), ], padj < 0.05) ``` Wow! We have a lot of genes with apparently very strong statistically significant differences between the control and 5xFAD carrier. + ```{r} dim(dseq_res) ``` @@ -418,71 +443,84 @@ head(dseq_res) **Exporting results to CSV files** -we can save results file into a csv file like this: +We can save results file into a csv file like this: + ```{r} -write.csv(All_res,file="results/All_5xFAD_12months_male.csv") -write.csv(dseq_res,file="results/DEG_5xFAD_12months_male.csv") +write.csv(All_res, file="results/All_5xFAD_12months_male.csv") +write.csv(dseq_res, file="results/DEG_5xFAD_12months_male.csv") ``` **Volcano plot** -We can visualize the differential expression results using -Volcano plot function from EnhancedVolcano package. For the most basic -volcano plot, only a single data-frame, data-matrix, or tibble of test -results is required, containing point labels, log2FC, and adjusted or -unadjusted P values. The default cut-off for log2FC is \>\|2\|; the -default cut-off for P value is 10e-6. +We can visualize the differential expression results using the volcano plot +function from `EnhancedVolcano` package. For the most basic volcano plot, only a +single data frame, data matrix, or tibble of test results is required, +containing point labels, log2FC, and adjusted or unadjusted p-values. The +default cut-off for log2FC is \>\|2\|; the default cut-off for p-value is 10e-6. + ```{r} EnhancedVolcano(All_res, - lab = (All_res$symbol), - x = 'log2FoldChange', - y = 'padj',legendPosition = 'none', - title = 'Volcano plot:Differential Expression Results', - subtitle = '', - FCcutoff = 0.1, - pCutoff = 0.05, - xlim = c(-3, 6)) + lab = (All_res$symbol), + x = 'log2FoldChange', + y = 'padj', + legendPosition = 'none', + title = 'Volcano plot:Differential Expression Results', + subtitle = '', + FCcutoff = 0.1, + pCutoff = 0.05, + xlim = c(-3, 6)) ``` -You can see that some top significantly expressed are -immune/inflammation-related genes such as Ctsd, C4b, Csf1 etc. These -genes are upregulated in the 5XFAD strain. - +You can see that some top significantly expressed are +immune/inflammation-related genes such as Ctsd, C4b, Csf1 etc. These genes are +upregulated in the 5XFAD strain. **Principal component plot of the samples** -Principal component analysis is a dimension reduction technique that reduces the dimensionality of these large matrixes into a linear coordinate system, so that we can more easily visualize what factors are contributing the most to variation in the dataset by graphing the principal components. +Principal component analysis is a dimension reduction technique that reduces the dimensionality of these large matrixes into a linear coordinate system, so that +we can more easily visualize what factors are contributing the most to variation +in the dataset by graphing the principal components. + ```{r} -ddsHTSeq <- DESeqDataSetFromMatrix(countData=as.matrix(rawdata), colData=metadata, design= ~ genotype) +ddsHTSeq <- DESeqDataSetFromMatrix(countData = as.matrix(rawdata), + colData = metadata, + design = ~ genotype) ``` ```{r} -ddsHTSeq <- ddsHTSeq[rowSums(counts(ddsHTSeq)>1) >= 10, ] -dds <- DESeq(ddsHTSeq,parallel = TRUE) +ddsHTSeq <- ddsHTSeq[rowSums(counts(ddsHTSeq) > 1) >= 10, ] +dds <- DESeq(ddsHTSeq, parallel = TRUE) ``` ```{r} -vsd <- varianceStabilizingTransformation(dds, blind=FALSE) +vsd <- varianceStabilizingTransformation(dds, blind = FALSE) -plotPCA(vsd, intgroup=c("genotype", "sex","timepoint")) +plotPCA(vsd, intgroup = c("genotype", "sex", "timepoint")) ``` -We can see that clustering is occurring, though it's kind of hard to see exactly how they are clustering in this visualization. -It is also possible to customize the PCA plot using the ggplot function. +We can see that clustering is occurring, though it's kind of hard to see exactly +how they are clustering in this visualization. + +It is also possible to customize the PCA plot using the `ggplot` function. + ```{r} -pcaData <- plotPCA(vsd, intgroup=c("genotype", "sex","timepoint"), returnData=TRUE) +pcaData <- plotPCA(vsd, + intgroup = c("genotype", "sex","timepoint"), + returnData=TRUE) + percentVar <- round(100 * attr(pcaData, "percentVar")) -ggplot(pcaData, aes(PC1, PC2,color=genotype, shape=sex)) + - geom_point(size=3) + - geom_text(aes(label=timepoint),hjust=0.5, vjust=2,size =3.5) + - labs(x= paste0("PC1: ",percentVar[1],"% variance"), y= paste0("PC2: ",percentVar[2],"% variance")) +ggplot(pcaData, aes(PC1, PC2,color = genotype, shape = sex)) + + geom_point(size=3) + + geom_text(aes(label = timepoint), hjust=0.5, vjust=2, size =3.5) + + labs(x = paste0("PC1: ", percentVar[1], "% variance"), + y = paste0("PC2: ", percentVar[2], "% variance")) ``` -PCA identified genotype and sex being a major source of variation in -between 5XFAD and WT mice. Female and male samples from the 5XFAD carriers -clustered distinctly at all ages, suggesting the presence of sex-biased -molecular changes in animals. +PCA identified genotype and sex being a major source of variation in between +5XFAD and WT mice. Female and male samples from the 5XFAD carriers clustered +distinctly at all ages, suggesting the presence of sex-biased molecular changes +in animals. **Function for Differential analysis using DESeq2** @@ -490,84 +528,111 @@ molecular changes in animals. Finally, we can build a function for differential analysis that consists of all above discussed steps. It will require to input sorted raw count matrix, sample metadata and define the reference group. + ```{r} -DEG <- function(rawdata, meta, - include.batch = FALSE, - ref = ref) { - dseq_res <- data.frame() - All_res <- data.frame() +DEG <- function(rawdata, meta, include.batch = FALSE, ref = ref) { + dseq_res <- data.frame() + All_res <- data.frame() + + if (include.batch) { + cat("Including batch as covariate\n") + design_formula <- ~ Batch + genotype + } + else { + design_formula <- ~ genotype + } + + dat2 <- as.matrix(rawdata[, colnames(rawdata) %in% + rownames(meta)]) - if (include.batch) { - cat("Including batch as covariate\n") - design_formula <- ~ Batch + genotype - } - else{ - design_formula <- ~ genotype - } + ddsHTSeq <- DESeqDataSetFromMatrix(countData = dat2, + colData = meta, + design = design_formula) + + ddsHTSeq <- ddsHTSeq[rowSums(counts(ddsHTSeq)) >= 10, ] - dat2 <- as.matrix(rawdata[, colnames(rawdata) %in% rownames(meta)]) - ddsHTSeq <- - DESeqDataSetFromMatrix(countData = dat2, - colData = meta, - design = design_formula) - ddsHTSeq <- ddsHTSeq[rowSums(counts(ddsHTSeq)) >= 10, ] - ddsHTSeq$genotype <- relevel(ddsHTSeq$genotype, ref = ref) - dds <- DESeq(ddsHTSeq, parallel = TRUE) + ddsHTSeq$genotype <- relevel(ddsHTSeq$genotype, ref = ref) + + dds <- DESeq(ddsHTSeq, parallel = TRUE) - res <- results(dds, alpha = 0.05) - #summary(res) + res <- results(dds, alpha = 0.05) + #summary(res) - res$symbol <- map_function.df(res,"ENSEMBL","SYMBOL") + res$symbol <- map_function.df(res,"ENSEMBL","SYMBOL") - res$EntrezGene <- map_function.df(res,"ENSEMBL","ENTREZID") + res$EntrezGene <- map_function.df(res,"ENSEMBL","ENTREZID") - All_res <<- as.data.frame(res[, c("symbol", "EntrezGene","baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]) - -} + All_res <<- as.data.frame(res[, c("symbol", + "EntrezGene", + "baseMean", + "log2FoldChange", + "lfcSE", + "stat", + "pvalue", + "padj")]) + } ``` Let’s use this function to analyze all groups present in our data. **Differential Analysis of all groups** -First, we add a Group column to our metadata table that will combine all -variables of interest (genotype, sex, and timepoint) for each sample. +First, we add a `Group` column to our metadata table that will combine all +variables of interest (`genotype`, `sex`, and `timepoint`) for each sample. + ```{r} -metadata$Group <- paste0(metadata$genotype,"-",metadata$sex,"-",metadata$timepoint) +metadata$Group <- paste0(metadata$genotype, + "-", + metadata$sex, + "-", + metadata$timepoint) unique(metadata$Group) ``` -Next, we create a comparison table that has all cases and controls that -we would like to compare with each other. Here I have made comparison -groups for age and sex-matched 5xFAD carriers vs 5xFAD_noncarriers, with -carriers as the cases and noncarriers as the controls: +Next, we create a comparison table that has all cases and controls that we would +like to compare with each other. Here I have made comparison groups for age and +sex-matched 5xFAD carriers vs 5xFAD_noncarriers, with carriers as the cases and +noncarriers as the controls: + ```{r} -comparisons <- data.frame(control=c("5XFAD_noncarrier-male-4 mo", "5XFAD_noncarrier-female-4 mo", "5XFAD_noncarrier-male-6 mo", - "5XFAD_noncarrier-female-6 mo","5XFAD_noncarrier-male-12 mo", "5XFAD_noncarrier-female-12 mo"), - case=c("5XFAD_carrier-male-4 mo", "5XFAD_carrier-female-4 mo", "5XFAD_carrier-male-6 mo", - "5XFAD_carrier-female-6 mo","5XFAD_carrier-male-12 mo", "5XFAD_carrier-female-12 mo") - ) +comparisons <- data.frame(control = c("5XFAD_noncarrier-male-4 mo", + "5XFAD_noncarrier-female-4 mo", + "5XFAD_noncarrier-male-6 mo", + "5XFAD_noncarrier-female-6 mo", + "5XFAD_noncarrier-male-12 mo", + "5XFAD_noncarrier-female-12 mo"), + case = c("5XFAD_carrier-male-4 mo", + "5XFAD_carrier-female-4 mo", + "5XFAD_carrier-male-6 mo", + "5XFAD_carrier-female-6 mo", + "5XFAD_carrier-male-12 mo", + "5XFAD_carrier-female-12 mo")) ``` ```{r} comparisons ``` -Finally, we implement our DEG function on each case/control comparison of +Finally, we implement our `DEG` function on each case/control comparison of interest and store the result table in a list and data frame: + ```{r} # initiate an empty list and data frame to save results DE_5xFAD.list <- list() -DE_5xFAD.df <- data.frame() +DE_5xFAD.df <- data.frame() for (i in 1:nrow(comparisons)) { meta <- metadata[metadata$Group %in% comparisons[i,],] - DEG(rawdata,meta,ref = "5XFAD_noncarrier") + DEG(rawdata, meta, ref = "5XFAD_noncarrier") - #append results in data frame - DE_5xFAD.df <- rbind(DE_5xFAD.df,All_res %>% mutate(model="5xFAD",sex=unique(meta$sex),age=unique(meta$timepoint))) + # append results in data frame + DE_5xFAD.df <- rbind(DE_5xFAD.df, + All_res %>% + mutate(model = "5xFAD", + sex = unique(meta$sex), + age = unique(meta$timepoint))) #append results in list DE_5xFAD.list[[i]] <- All_res @@ -576,89 +641,115 @@ for (i in 1:nrow(comparisons)) ``` Let’s explore the result stored in our list: + ```{r} names(DE_5xFAD.list) ``` We can easily extract the result table for any group of interest by using \$ -and name of group. Let’s check top few rows from 5XFAD_carrier-male-4 mo +and name of group. Let’s check top few rows from `5XFAD_carrier-male-4 mo` group: + ```{r} head(DE_5xFAD.list$`5XFAD_carrier-male-4 mo`) ``` Let’s check the result stored as dataframe: + ```{r} head(DE_5xFAD.df) ``` Check if result is present for all ages: + ```{r} unique((DE_5xFAD.df$age)) ``` Check if result is present for both sexes: + ```{r} unique((DE_5xFAD.df$sex)) ``` Check number of genes in each group: + ```{r} -dplyr::count(DE_5xFAD.df,model,sex,age) +dplyr::count(DE_5xFAD.df, model, sex, age) ``` -Check number of genes significantly differentially expressed in all -cases compared to age and sex-matched controls: +Check number of genes significantly differentially expressed in all cases +compared to age and sex-matched controls: + ```{r} -degs.up <- map(DE_5xFAD.list, ~length(which(.x$padj<0.05 & .x$log2FoldChange>0))) -degs.down <- map(DE_5xFAD.list, ~length(which(.x$padj<0.05 & .x$log2FoldChange<0))) -deg <- data.frame(Cases=names(degs.up), Up_DEGs.pval.05=as.vector(unlist(degs.up)),Down_DEGs.pval.05=as.vector(unlist(degs.down))) +degs.up <- map(DE_5xFAD.list, + ~ length(which(.x$padj < 0.05 & + .x$log2FoldChange > 0))) + +degs.down <- map(DE_5xFAD.list, + ~ length(which(.x$padj < 0.05 & + .x$log2FoldChange < 0))) + +deg <- data.frame(Case = names(degs.up), + Up_DEGs.pval.05 = as.vector(unlist(degs.up)), + Down_DEGs.pval.05 = as.vector(unlist(degs.down))) + knitr::kable(deg) ``` Interestingly, in females more genes are differentially expressed at all age -groups, and more genes are differentially expressed the older the mice get -in both sexes. +groups, and more genes are differentially expressed the older the mice get in +both sexes. ## Pathway Enrichment -We may wish to look for enrichment of biological -pathways in a list of differentially expressed genes. Here we will test -for enrichment of KEGG pathways using using enrichKEGG function in -clusterProfiler package. +We may wish to look for enrichment of biological pathways in a list of +differentially expressed genes. Here we will test for enrichment of KEGG +pathways using using the `enrichKEGG` function in the `clusterProfiler` package. + ```{r} -dat <- list(FAD_M_4=subset(DE_5xFAD.list$`5XFAD_carrier-male-4 mo`[order(DE_5xFAD.list$`5XFAD_carrier-male-4 mo`$padj), ], padj < 0.05) %>% pull(EntrezGene), - FAD_F_4=subset(DE_5xFAD.list$`5XFAD_carrier-female-4 mo`[order(DE_5xFAD.list$`5XFAD_carrier-female-4 mo`$padj), ], padj < 0.05) %>% pull(EntrezGene)) +dat <- list(FAD_M_4 = subset(DE_5xFAD.list$`5XFAD_carrier-male-4 mo`[order(DE_5xFAD.list$`5XFAD_carrier-male-4 mo`$padj), ], + padj < 0.05) %>% + pull(EntrezGene), + FAD_F_4 = subset(DE_5xFAD.list$`5XFAD_carrier-female-4 mo`[order(DE_5xFAD.list$`5XFAD_carrier-female-4 mo`$padj), ], + padj < 0.05) %>% + pull(EntrezGene)) ## perform enrichment analysis enrich_pathway <- compareCluster(dat, fun = "enrichKEGG", - pvalueCutoff = 0.05, - organism = "mmu" + pvalueCutoff = 0.05, + organism = "mmu" ) -enrich_pathway@compareClusterResult$Description <- gsub(" - Mus musculus \\(house mouse)","",enrich_pathway@compareClusterResult$Description) +enrich_pathway@compareClusterResult$Description <- + gsub(" - Mus musculus \\(house mouse)", + "", + enrich_pathway@compareClusterResult$Description) ``` -Let’s plot top enriched functions using dotplot function of -clusterProfiler package. +Let’s plot top enriched functions using the `dotplot` function of the +`clusterProfiler` package. + ```{r} -clusterProfiler::dotplot(enrich_pathway,showCategory=10,font.size = 14,title="Enriched KEGG Pathways") +clusterProfiler::dotplot(enrich_pathway, + showCategory = 10, + font.size = 14, + title = "Enriched KEGG Pathways") ``` What does this plot infer? - ## Save Data for Next Lesson -We will use the results data in the next -lesson. Save it now and we will load it at the beginning of the next -lesson. We will use R’s save command to save the objects in compressed, -binary format. The save command is useful when you want to save multiple -objects in one file. +We will use the results data in the next lesson. Save it now and we will load it +at the beginning of the next lesson. We will use R’s `save` command to save the +objects in compressed, binary format. The `save` command is useful when you want +to save multiple objects in one file. + ```{r} -save(DE_5xFAD.df,DE_5xFAD.list,file="results/DEAnalysis_5XFAD.Rdata") +save(DE_5xFAD.df,DE_5xFAD.list, file="results/DEAnalysis_5XFAD.Rdata") ``` ## Session Info