-
Notifications
You must be signed in to change notification settings - Fork 0
/
bacteria and archaea.Rmd
112 lines (98 loc) · 2.34 KB
/
bacteria and archaea.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
---
title: "Viruses infecting archaea vs. bacteria"
output:
  html_document:
    keep_md: yes
---
```{r setup, include=FALSE}
# Set the default chunk option so that each chunk's source code is echoed in
# the knitted output.
knitr::opts_chunk$set(echo = TRUE)
```
# Loading libraries
```{r}
library(tidyverse)
library(janitor)
library(ggthemes)
library(paletteer)
library(ggplot2)
library(RColorBrewer)
```
# Setting up aesthetics for later
```{r}
# Discrete "Pamplemousse" palette; passed to scale_fill_manual() in the plots.
colors <- LaCroixColoR::lacroix_palette(
  "Pamplemousse",
  type = "discrete"
)
```
# Importing data
```{r}
# Read the virus table from disk and standardise its column names.
viruses <- readr::read_csv("data/viruses.csv") %>%
  clean_names()
```
# Cleaning data
```{r}
# Keep only rows whose `level` is "Complete", then narrow the table to the
# columns used by the rest of the analysis.
viruses <- viruses %>%
  filter(level == "Complete") %>%
  select(
    organism_name,
    organism_groups,
    level,
    size_mb,
    gc_percent,
    host,
    genes
  )
```
```{r}
# Split the ";"-delimited `organism_groups` column into three columns:
# `domain`, `group` and `family`.
viruses <- viruses %>%
  separate(
    col = organism_groups,
    into = c("domain", "group", "family"),
    sep = ";"
  )
```
# Glimpse of data
```{r}
glimpse(viruses)
```
# Comparing viruses with bacterial vs. archaeal hosts

- Viruses with bacterial hosts have larger genomes, more genes, and higher GC content.
- The gene-to-genome-size ratio is similar for both host groups.
```{r}
# Restrict the table to viruses whose host is recorded as "archaea" or
# "bacteria"; rows with any other (or missing) host are dropped.
arch_bact <- viruses %>%
  filter(host %in% c("archaea", "bacteria"))
```
```{r}
# Per-host means of genome size (`size`), GC percentage (`gc`) and gene count
# (`gene`). Missing values are ignored in each mean.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change the meaning of `na.rm`.
data <- arch_bact %>%
  group_by(host) %>%
  summarize(
    size = mean(size_mb, na.rm = TRUE),
    gc = mean(gc_percent, na.rm = TRUE),
    gene = mean(genes, na.rm = TRUE)
  )
```
```{r}
# Bar chart of the mean genome size for each host group.
ggplot(data, aes(x = host, y = size, fill = host)) +
  geom_col() +
  scale_fill_manual(values = colors) +
  labs(
    title = "Genome Size of Viruses by Host",
    x = "Host",
    y = "Genome Size (Mb)"
  ) +
  theme_solarized()
```
```{r}
# Bar chart of the mean GC percentage for each host group.
ggplot(data, aes(x = host, y = gc, fill = host)) +
  geom_col() +
  scale_fill_manual(values = colors) +
  labs(
    title = "GC Content of Viruses by Host",
    x = "Host",
    y = "GC Content (Percent)"
  ) +
  theme_solarized()
```
```{r}
# Bar chart of the mean number of genes for each host group.
ggplot(data, aes(x = host, y = gene, fill = host)) +
  geom_col() +
  scale_fill_manual(values = colors) +
  labs(
    title = "Number of Genes in Virus Genomes by Host",
    x = "Host",
    y = "Number of Genes"
  ) +
  theme_solarized()
```
```{r}
# Bar chart of the mean genes-per-Mb ratio for each host group.
# Rows with a gene count of zero are removed before the ratio is computed, so
# they do not contribute to the mean.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change the meaning of `na.rm`.
arch_bact %>%
  filter(genes != 0) %>%
  mutate(gene_mb_ratio = genes / size_mb) %>%
  group_by(host) %>%
  summarize(gene_content = mean(gene_mb_ratio, na.rm = TRUE)) %>%
  ggplot(aes(x = host, y = gene_content, fill = host)) +
  geom_col() +
  theme_solarized() +
  scale_fill_manual(values = colors) +
  labs(
    title = "Gene to Genome Size Ratio by Host",
    x = "Host",
    y = "Genes to Genome Size (Genes/Mb)"
  )
```