ARG_analysis_figs.Rmd

---
title: "ARG_analysis_figs"
author: "Zeya Xue"
date: "1/1/2020"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## Setting up packages and working directory 
```{r}
library(phyloseq);packageVersion("phyloseq")
library(ggplot2);packageVersion("ggplot2")
library(plyr)
library(dplyr)
library(superheat)
library(vegan)

# Root directory of the analysis; the input and output locations used in the
# chunks below are all built from it with file.path().
path <- "~/Google Drive File Stream/My Drive/Lemay_lab/analysis/"
```

## Import the merged, normalized MEGARes output and make phyloseq object
```{r}
# Prep the count table: read the merged, normalized gene table and turn it
# into a matrix that otu_table() can take.
CountTab <- read.csv(file.path(path,"NovaSeq043/CAZy/CAZy_merged_gene_norm_tab.csv"),
                     row.names = 1, header = TRUE, stringsAsFactors = FALSE)
#CountTab <- CountTab[ , !colnames(CountTab) %in% c("Class","Mechanism","Group","Gene")]
# read.csv() (check.names = TRUE) prepends "X" to column names that start with
# a digit. Strip only that leading prefix; an unanchored gsub("X", "") would
# also delete any "X" that is a genuine part of a column name.
colnames(CountTab) <- sub("^X", "", colnames(CountTab))
## format needed for importing into phyloseq
row.names(CountTab) <- CountTab$Gene
# Drop the first column now that the row names are set.
# NOTE(review): this assumes Gene is the first column after row.names = 1 --
# confirm against the CSV layout.
# drop = FALSE keeps a data frame even when a single column remains.
CountTab <- CountTab[, -1, drop = FALSE]
CountTab <- as.matrix(CountTab)

# Build the annotation table that plays the role of the phyloseq "tax table"
TaxTab <- read.csv(
  file = file.path(path,"R043_merge_norm_final.csv"),
  header = TRUE,
  row.names = 1,
  stringsAsFactors = FALSE
)
# Keep only the annotation columns, in the order they appear in the file
is_annotation <- is.element(colnames(TaxTab),
                            c("Class","Mechanism","Group","Gene", "MEGID"))
TaxTab <- TaxTab[ , is_annotation]
rm(is_annotation)
# Index the rows by MEGID, then drop the first remaining column
# NOTE(review): presumably MEGID is that first column -- confirm
row.names(TaxTab) <- TaxTab[["MEGID"]]
TaxTab <- TaxTab[, -1]
TaxTab <- as.matrix.data.frame(TaxTab)

# Sample metadata: comma-separated table with a header row, indexed by subject_id
samdf <- read.table(
  file = file.path(path,"samdf/CTSC24532USDAWHNRCNu-Block2014FFQ_DATA_2020-01-30_1657_auth.csv"),
  header = TRUE,
  sep = ','
)
#samdf <- samdf[ , !colnames(samdf) %in% c("X","X.1")]
row.names(samdf) <- samdf$subject_id

# Combine counts, annotation and metadata into a single phyloseq object
ps <- phyloseq(
  otu_table(CountTab, taxa_are_rows = TRUE),
  tax_table(TaxTab),
  sample_data(samdf)
)
ps # 840 taxa and 48 samples
#sample_data(ps)$fiber_group <- factor(sample_data(ps)$fiber_group,levels = c("low","adequate"))
```

## Aggregate the ARG by different levels
```{r}
# Collapse the gene-level table to the "Group" annotation rank.
# NArm = FALSE overrides tax_glom()'s default of NArm = TRUE.
ps.group <- tax_glom(physeq = ps, taxrank = "Group", NArm = FALSE)
ps.group # 110 taxa and 48 samples

# Same aggregation at the "Mechanism" rank.
ps.mech <- tax_glom(physeq = ps, taxrank = "Mechanism", NArm = FALSE)
ps.mech # 41 taxa and 48 samples
```

## Make 2-D beta diversity plots
```{r}
# Define function to plot 2D beta diversity dot plot
#
# Ordinates the samples of a phyloseq object, draws them coloured by the
# "fiber_group" sample variable (with stat_ellipse() overlays), and writes the
# figure to a PDF file.
#
# Arguments:
#   ps             phyloseq object to ordinate.
#   ClusterMethod  ordination method passed to ordinate(), e.g. "NMDS".
#   DistMethod     distance passed to ordinate(), e.g. "bray" or "jaccard".
#   outfp          path of the PDF file to write.
#   w, h           width and height passed to pdf().
#
# Returns `outfp` invisibly.
BetaPlot <- function(ps, ClusterMethod, DistMethod, outfp, w, h){
  ps.ord <- ordinate(ps, method = ClusterMethod, distance = DistMethod)

  p <- plot_ordination(ps, ps.ord, color = "fiber_group") +
    geom_point(size = 3) +
    scale_color_brewer(palette = "Paired") +
    stat_ellipse() +
    ggtitle("") +
    theme_bw(base_size = 15)

  # Only relabel the axes as NMDS1/NMDS2 when NMDS was actually requested;
  # for any other method keep the labels plot_ordination() generates, so the
  # figure is never mislabelled.
  if (identical(ClusterMethod, "NMDS")) {
    p <- p + xlab("NMDS1") + ylab("NMDS2")
  }

  pdf(outfp, width = w, height = h)
  # Close the device even when rendering fails, so an error does not leave an
  # open PDF device that silently captures later plots.
  on.exit(dev.off(), add = TRUE)
  print(p)

  invisible(outfp)
}


# NMDS figures for every aggregation level (Group, Mechanism, gene) and for
# both Bray-Curtis and Jaccard distances; each call writes one 5.5 x 4 PDF
# under low_adeq_fiber_figs/.

# Group level
BetaPlot(ps.group, ClusterMethod = "NMDS", DistMethod = "bray",
         outfp=file.path(path,"low_adeq_fiber_figs/nmds_bray_group.pdf"), w=5.5,h=4)
BetaPlot(ps.group, ClusterMethod = "NMDS", DistMethod = "jaccard",
         outfp=file.path(path,"low_adeq_fiber_figs/nmds_jaccard_group.pdf"), w=5.5,h=4)
# Mechanism level
BetaPlot(ps.mech, ClusterMethod = "NMDS", DistMethod = "bray",
         outfp=file.path(path,"low_adeq_fiber_figs/nmds_bray_mechanism.pdf"), w=5.5,h=4)
BetaPlot(ps.mech, ClusterMethod = "NMDS", DistMethod = "jaccard",
         outfp=file.path(path,"low_adeq_fiber_figs/nmds_jaccard_mechanism.pdf"), w=5.5,h=4)
# Gene level (un-aggregated object)
BetaPlot(ps, ClusterMethod = "NMDS", DistMethod = "bray",
         outfp=file.path(path,"low_adeq_fiber_figs/nmds_bray_gene.pdf"), w=5.5,h=4)
BetaPlot(ps, ClusterMethod = "NMDS", DistMethod = "jaccard",
         outfp=file.path(path,"low_adeq_fiber_figs/nmds_jaccard_gene.pdf"), w=5.5,h=4)
```

## PERMANOVA 
```{r}
# PERMANOVA with the vegan adonis() function
#
# Tests whether the Bray-Curtis distances between samples are associated with
# one sample variable.
#
# Arguments:
#   ps   phyloseq object.
#   var  name (single character string) of the sample_data column to use as
#        the predictor.
#
# Returns the object produced by adonis(); callers in this file read its
# `aov.tab` element. Stops with an error when `var` is not a column of the
# sample data.
#
# NOTE(review): recent vegan releases replace adonis() with adonis2(), whose
# result has a different structure (no `aov.tab`) -- confirm the installed
# vegan version before switching.
perm <- function(ps, var){
  # make a data frame from the sample_data
  sampledf <- data.frame(sample_data(ps))
  if (!is.character(var) || length(var) != 1L || !var %in% colnames(sampledf)) {
    stop("`var` must be the name of a single column of sample_data(ps)",
         call. = FALSE)
  }

  # Calculate bray curtis distance matrix
  bray <- phyloseq::distance(ps, method = "bray")

  # Build `bray ~ <var>` from the column name. With `bray ~ get(var)` the model
  # term is labelled "get(var)" in the result table, whichever variable was
  # tested.
  perm_formula <- reformulate(var, response = "bray")

  # Adonis test
  adonis(perm_formula, data = sampledf)
}

# Run the PERMANOVA once per sample_data column of ps.mech. For each column
# the variable name and then its adonis table are appended to a single text
# file (the file keeps growing across runs because append = TRUE).
for (meta_var in colnames(sample_data(ps.mech))) {
  adonis_txt <- file.path(path,"low_adeq_fiber_figs/adonis_output.txt")
  write.table(meta_var, adonis_txt, append = TRUE)
  adonis_fit <- perm(ps.mech, var = meta_var)
  write.table(adonis_fit[1][["aov.tab"]], adonis_txt, append = TRUE)
}

```

## DESeq2 to determine sig diff mechanisms 
Because the count table contains many near-zero values, I would like to find out which mechanisms differ significantly between the low- and high-fiber groups before adjusting the color scale of the heat map.
```{r}
library(DESeq2);packageVersion("DESeq2") 

## Define function to use DESeq2 for differential analysis
## Uses default Benjamini-Hochberg pvalue adjust
##
## Fits a Wald-test DESeq2 model with design ~ fiber_group, draws three
## diagnostic plots on the current graphics device (dispersion estimates,
## MA plot, p-value histogram) and writes the significant features, joined
## with their tax_table annotation, to a CSV file.
##
## Arguments:
##   ps        phyloseq object whose sample data contain `fiber_group`.
##   path.out  CSV file written when at least one feature is significant.
##   alpha     adjusted p-value cut-off; also passed to results() as the
##             target for independent filtering. Default 0.1.
##   lfc       log2 fold change threshold passed to results() as
##             `lfcThreshold`. Default 1.2.
##
## When nothing passes the cut-off a message is printed and no file is written.
fiberDA <- function(ps, path.out, alpha = 0.1, lfc = 1.2) {
  psdds <- phyloseq_to_deseq2(ps, ~ fiber_group)

  dds <- DESeq(psdds, test = "Wald", fitType = "local", sfType = "poscounts")
  plotDispEsts(dds, ylim = c(1e-6, 1e2)) # diagnostic plot

  res <- results(dds, lfcThreshold = lfc, alpha = alpha)
  plotMA(res) # diagnostic plot
  hist(res$pvalue, breaks = 20, col = "grey") # diagnostic plot

  # which() drops the NA padj values (left NA when a feature is filtered out)
  sigtab <- res[which(res$padj < alpha), ]
  if (nrow(sigtab) == 0) {
    print(paste0("There are no significant features whose padj < ", alpha,
                 " and |lfc| > ", lfc))
  } else {
    sigtab <- cbind(as(sigtab, "data.frame"),
                    as(tax_table(ps)[rownames(sigtab), ], "matrix"))
    write.csv(sigtab, path.out)
  }
}

# Mechanism level
fiberDA(ps = ps.mech, path.out = file.path(path,"low_adeq_fiber_figs/mech_deseq.csv"))
# Recorded result: "There are no significant features whose padj < 0.1 and |lfc| > 1.2"

# Gene level
fiberDA(ps = ps, path.out = file.path(path,"low_adeq_fiber_figs/gene_deseq.csv"))
# Recorded result:
#   Error in estimateDispersionsFit(object, fitType = fitType, quiet = quiet) :
#   all gene-wise dispersion estimates are within 2 orders of magnitude
#   from the minimum value, and so the standard curve fitting techniques will not work.

# Group level
fiberDA(ps = ps.group, path.out = file.path(path,"low_adeq_fiber_figs/group_deseq.csv"))
# Recorded result: "There are no significant features whose padj < 0.1 and |lfc| > 1.2"
```

## metagenomeSeq to determine sig diff mechanisms 
DESeq2 requires a non-normalized count table, so the MicrobeCensus-normalized data may not be suitable for DESeq2. I would like to try a different method, such as metagenomeSeq, for differential abundance testing. However, I do not have much hope of discovering significantly different genes, because the NMDS plots showed basically no separation.
https://bioconductor.org/packages/release/bioc/vignettes/metagenomeSeq/inst/doc/metagenomeSeq.pdf 
https://bioconductor.org/packages/release/bioc/manuals/metagenomeSeq/man/metagenomeSeq.pdf
```{r}
library(metagenomeSeq);packageVersion("metagenomeSeq") 

# Differential abundance between fiber groups with metagenomeSeq.
#
# Converts the phyloseq object to a metagenomeSeq object, fits
# fitFeatureModel() on the design ~fiber_group and returns the MRcoefs() table
# of the fit.
#
# Arguments:
#   ps        phyloseq object whose sample data contain `fiber_group`.
#   path.out  accepted but not used by the current body; nothing is written
#             to disk.
fiberDA2 <- function(ps, path.out){
  psme <- phyloseq_to_metagenomeSeq(ps)

  # I used libSize(psme) to check for library size of all samples and the lib sizes are between 40 and 65
  # Skip metagenomeSeq normalization step
  # implement non-wrapped version

  pd <- pData(psme)
  # The design matrix has an intercept column (a column of 1's) and a second
  # column that specifies which samples are in the second group.
  mod <- model.matrix(~fiber_group, data = pd) # Y = X*beta + e linear model matrices

  # coef = 2 selects the fiber_group column of `mod`.
  # The original line ended with a stray comma, which is a parse error.
  # Recorded warning from a previous run: "glm.fit: algorithm did not converge;
  # glm.fit: fitted probabilities numerically 0 or 1 occurred"
  res <- fitFeatureModel(psme, mod, coef = 2)

  MRcoefs(res)
  # plotMRheatmap(psme, n=20, norm = FALSE, log = FALSE)
  # plotCorr(psme, n=20, norm = FALSE, log = FALSE)
  # plotOrd(psme, tran = TRUE, norm = FALSE, log = FALSE, usePCA = FALSE, useDist = TRUE, bg = factor(pData(psme)$fiber_group), pch = 21)
}

# NOTE(review): `lungData` is not created anywhere in the visible code --
# presumably a leftover from the metagenomeSeq vignette example. Confirm it is
# available in the session (e.g. loaded with data(lungData)) or remove the call.
fData(lungData)

```


## Heat map based on ARG Group abundances
```{r}
# Draw a tile heat map of Abundance (x = a sample variable, y = an annotation
# rank) and save it as a PDF.
#
# Note: this definition masks stats::heatmap() for the rest of the session.
#
# Arguments:
#   ps        phyloseq object; melted with psmelt().
#   varx      name of the column shown on the x axis; converted to a factor
#             with levels `x_levels`.
#   vary      name of the column shown on the y axis (also the y-axis label).
#   outfp     path of the PDF file to write.
#   w, h      width and height passed to pdf().
#   x_levels  ordered levels for the x axis. Values of `varx` outside this set
#             become NA. Default c("low", "adequate").
#
# Returns `outfp` invisibly.
#
# NOTE(review): the melted table can hold several rows per (x, y) cell, and
# geom_tile() draws them on top of each other rather than averaging them --
# confirm whether a per-cell mean was intended.
heatmap <- function(ps, varx, vary, outfp, w, h,
                    x_levels = c("low", "adequate")){
  df <- psmelt(ps)
  df2 <- df[, colnames(df) %in% c(varx, vary, "Abundance")]
  df2[[varx]] <- factor(df2[[varx]], levels = x_levels)

  # A variable that shares no value with `x_levels` turns entirely into NA and
  # would give an uninformative single-column figure; make that visible.
  if (nrow(df2) > 0 && all(is.na(df2[[varx]]))) {
    warning("no value of '", varx, "' matches x_levels; ",
            "the x axis will only contain NA", call. = FALSE)
  }

  # .data[[name]] looks the column up by name inside the plot data only,
  # unlike get(), which can also pick up objects outside the data.
  p <- ggplot(df2, aes(x = .data[[varx]], y = .data[[vary]], fill = Abundance)) +
    geom_tile() +
    scale_fill_gradient(low = "#fff7fb", high = "#034e7b") +
    xlab(label = "Fiber intake") +
    ylab(label = vary) +
    theme_bw(base_size = 15)

  pdf(outfp, width = w, height = h)
  # Close the device even when rendering fails.
  on.exit(dev.off(), add = TRUE)
  print(p)

  invisible(outfp)
}

# Group-level heat map.
# heatmap() converts `varx` to a factor with levels "low"/"adequate", so the x
# variable has to be the fiber group (as in the Mechanism call below); the
# previous varx = "dt_fibe" would presumably be coerced to NA for every sample.
# NOTE(review): confirm dt_fibe does not itself hold "low"/"adequate" values.
heatmap(ps.group, varx = "fiber_group", vary = "Group",
        outfp = file.path(path,"low_adeq_fiber_figs/heatmap_group.pdf"),
        w=9,h=12)
# Mechanism-level heat map.
heatmap(ps.mech, varx = "fiber_group", vary = "Mechanism",
        outfp = file.path(path,"low_adeq_fiber_figs/heatmap_mechanism.pdf"),
        w=9,h=8)


# Per-sample heat map (one column per sample, to see the trend rather than an
# average); samples are labelled with hei_asa24_totalscore.
png(
  filename = file.path(path,"low_adeq_fiber_figs/heatmap_mechanism_persam_HEI.png"),
  width = 1200,
  height = 1000
)
print(
  plot_heatmap(
    ps.mech,
    distance = "bray",
    sample.label = "hei_asa24_totalscore",
    taxa.label = "Mechanism",
    taxa.order = "Mechanism",
    #sample.order = "dt_fibe",
    low = "#66CCFF", high = "#000033", na.value = "white"
  ) +
    theme_bw(base_size = 15) +
    theme(axis.text.x = element_text(angle = 90))
)
dev.off()

# Per-sample heat map ordered by NMDS on Bray-Curtis distances; samples are
# labelled with their fiber group. The plot is kept in `p1`.
png(file.path(path,"low_adeq_fiber_figs/heatmap_mechanism_persam_nmds.png"), 1200, 1000)
p1 <- plot_heatmap(ps.mech,
             method = "NMDS", distance = "bray",
             sample.label = "fiber_group",
             taxa.label = "Mechanism",
             low="#66CCFF", high="#000033", na.value="white",
             taxa.order = "Mechanism")+
  theme_bw(base_size = 15)+
  theme(axis.text.x = element_text(angle = 90))
# An assignment is not auto-printed, so the plot has to be printed explicitly;
# otherwise the PNG device is closed without anything drawn on it.
print(p1)
dev.off()

# Per-sample heat map ordered by NMDS on Bray-Curtis distances; samples are
# labelled with fat_total_per_1000kcal.
png(
  filename = file.path(path,"low_adeq_fiber_figs/heatmap_mechanism_persam_nmds_fat.png"),
  width = 1200,
  height = 1000
)
print(
  plot_heatmap(
    ps.mech,
    method = "NMDS",
    distance = "bray",
    sample.label = "fat_total_per_1000kcal",
    taxa.label = "Mechanism",
    taxa.order = "Mechanism",
    low = "#66CCFF", high = "#000033", na.value = "white"
  ) +
    theme_bw(base_size = 15) +
    theme(axis.text.x = element_text(angle = 90))
)
dev.off()


# Per-sample line plot of one variable (meant to sit as an extra panel below
# the heat map). First fix the order of the subjects by turning Subject.ID
# into a factor with an explicit level order.
sample_data(ps.mech)$Subject.ID <- factor(
  sample_data(ps.mech)$Subject.ID,
  levels = c(
    "6003", "5062", "7035", "7009", "7056", "6063",
    "7017", "7115", "6014", "5020", "6009", "7077",
    "7039", "6086", "7087", "6071", "6065", "7109",
    "7112", "7047", "7057", "6010", "5007", "5005",
    "7043", "6085", "7059", "5016", "7114", "5002",
    "5052", "5023", "7032", "6092", "7116", "6064",
    "5059", "5015", "7099", "6095", "7113", "5006",
    "7105", "5060", "6100", "6053", "6032", "7067"
  )
)
# Plain data frame copy of the sample metadata for plotting with ggplot
samdf2 <- data.frame(sample_data(ps.mech))

library(gridExtra)
# Line plot of one metadata variable across subjects.
#
# Arguments:
#   df    data frame with a `Subject.ID` column and the column named in `vary`.
#   vary  name (character string) of the column plotted on the y axis.
#
# Returns the ggplot object visibly, so a top-level call draws the plot.
# (The function used to end on an assignment, which returns the plot invisibly
# and therefore rendered nothing.)
line_grid <- function(df, vary){

  # group = 1 joins all subjects with a single line
  p2 <- ggplot(df, aes(x = Subject.ID, y = .data[[vary]], group = 1)) +
    geom_line() +
    ylab(vary) # label the axis with the variable name
  #grid.arrange(p1, p2, ncol=1)

  #p <- rbind(ggplotGrob(p1), ggplotGrob(p2), size = "second")
  #p$widths <- unit.pmax(p1$widths, p2$widths)
  #grid.newpage()
  #grid.draw(p)

  p2
}

# Call line_grid() on the sample metadata for the total_kcal column
line_grid(samdf2, "total_kcal")

############################################################################################################
############To make heatmap with adjusted color scale. For use after determine which taxa are important.
############################################################################################################
df <- psmelt(ps.group)
# Mean abundance of every Mechanism within each fiber group
taxa.agg <- aggregate(Abundance ~ Mechanism + fiber_group,
                      data = df,
                      FUN = mean)
# Reshape to a wide numeric matrix (rows = Mechanism, columns = fiber_group).
# Done with base R tapply(): dcast() is not exported by any package this file
# attaches, so the previous call could not be resolved.
taxa.cast <- with(taxa.agg,
                  tapply(Abundance, list(Mechanism, fiber_group), mean))

# NOTE(review): heat.pal has two colours while heat.pal.values gives three
# positions -- confirm which of the two is intended.
superheat(taxa.cast,
          # scale the matrix columns
          scale = TRUE,
          # colour ramp from #fff7fb to #034e7b
          heat.pal = c("#fff7fb", "#034e7b"),
          heat.pal.values = c(0, 0.0001,1))
############To make heatmap with adjusted color scale. For use after determine which taxa are important.


```