-
Notifications
You must be signed in to change notification settings - Fork 0
/
cuffdiff_analysis.Rmd
105 lines (86 loc) · 3.93 KB
/
cuffdiff_analysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
---
title: "Cuffdiff Analysis for AD sample RNA-seq Data"
output: html_document
---
Set project working directory and read in `gene_exp.diff` file from cuffdiff output directory
```{r}
# setwd(dir = "C:\\Users/sujay/Desktop/USC Assignments and Material/TRGN515/RNAseq Project/")
diff_exp_genes = read.table(file = "../cuffdiff results/Cuffdiff_wStats/gene_exp.diff", header = TRUE, sep = "\t")
```
Filter only those genes with a p-value < 0.01, to get a "good-sized" (~50-850) list of significantly expressed genes
```{r}
sig_gene_data = subset(diff_exp_genes, p_value < 0.01)
```
```{r}
head(sig_gene_data)
```
To calclulate the number of significantly expressed genes
```{r}
nrow(sig_gene_data)
```
Read in the `genes_fpkm.tracking` into an R object/dataframe
```{r}
genes_fpkm = read.table(file = "../cuffdiff results/Cuffdiff_NoStats/genes.fpkm_tracking", header = TRUE, sep = "\t")
```
Limit the above `genes_fpkm` dataframes' rows/records to just the genes in `sig_gene_data` to obtain corresponding FPKM values
```{r}
genes_fpkm = genes_fpkm[genes_fpkm$gene_id %in% sig_gene_data$gene_id,]
```
```{r}
head(genes_fpkm)
```
Clean-up `genes_fpkm` by reducing the column list to ones we require
```{r}
library(dplyr)
genes_fpkm_matrix = select(genes_fpkm,
gene_short_name,
Ctrl1_FPKM, Ctrl2_FPKM, Ctrl3_FPKM,
MidAD1_FPKM, MidAD2_FPKM, MidAD3_FPKM,
LateAD1_FPKM, LateAD2_FPKM, LateAD3_FPKM)
```
_Note: To make it more generic you can simply filter out those columns with "FPKM" as a substring in them to refer to the FPKM value-containing columns_
Apply `log10(x + 1)` transformation to all the FPKM value cells in `genes_fpkm_matrix`
```{r}
genes_fpkm_matrix[,2:10] = log10(genes_fpkm_matrix[,2:10] + 1)
```
Transform the matrix to have the gene short names as row names of the matrix
```{r}
genes_fpkm_matrix_clean = genes_fpkm_matrix[,-1]
rownames(genes_fpkm_matrix_clean) = genes_fpkm_matrix[,1]
```
Heatmap generation: define appropriate colour palette and generate heatmap using `heatmap.2()` function
```{r}
library(gplots)
library(RColorBrewer)
palette <- colorRampPalette(brewer.pal(8, "Paired"))(25)
heatmap.2(x = as.matrix(genes_fpkm_matrix_clean), col = palette, margins = c(12, 9), trace = "none")
```
Matrix pre-processing and transpose - for PCA and Cluster Analysis
```{r}
genes_fpkm_matrix_clean = as.data.frame(lapply(genes_fpkm_matrix_clean, as.double))
genes_fpkm_matrix_clean_t = as.data.frame(t(genes_fpkm_matrix_clean))
```
Use `prcomp()` to split the feature set into principal components, and also group the features into the classes on the basis of which the colour coding will be done while generating the PCA plot
```{r}
library(ggfortify)
pca_components = prcomp(genes_fpkm_matrix_clean_t)
genes_fpkm_matrix_clean_t$Samples = rownames(genes_fpkm_matrix_clean_t)
genes_fpkm_matrix_clean_t$Group = ifelse(grepl("Ctrl", genes_fpkm_matrix_clean_t$Samples), "CTRL",
ifelse(grepl("Mid", genes_fpkm_matrix_clean_t$Samples), "Mid AD", "Late AD"))
autoplot(pca_components, data = genes_fpkm_matrix_clean_t, label = TRUE, label.size = 2, colour = "Group")
```
```{r}
library(factoextra)
genes_fpkm_matrix_clean_t[is.na(genes_fpkm_matrix_clean_t)] = 2
dist.eucl = dist(genes_fpkm_matrix_clean_t, method = "euclidean")
hclust.ward.eucl = hclust(d = dist.eucl, method = "ward.D2")
plot(hclust.ward.eucl, main = "Euclidean - Ward's")
```
```{r}
upregulated_genes = subset(sig_gene_data, log2.fold_change. > 0)
write.table(upregulated_genes, "upregulated_genes.txt", sep = "\t", row.names = FALSE, quote = FALSE)
```
```{r}
downregulated_genes = subset(sig_gene_data, log2.fold_change. < 0)
write.table(downregulated_genes, "downregulated_genes.txt", sep = "\t", row.names = FALSE, quote = FALSE)
```