Taxonomic assignments of the contigsΒΆ
library(tidyverse)
library(ggsci)
library(useful)
library(RColorBrewer)
mat = read.table("transcripts.fasta.gc", sep="\t", header=T, comment.char="",
stringsAsFactors=F, check.names=F)
colnames(mat)[1] = "ID"
head(mat)
## ID Length GC
## 1 NODE_1_length_25671_cov_21.867536_g0_i0 25671 0.5940
## 2 NODE_2_length_22538_cov_23.707146_g1_i0 22538 0.5314
## 3 NODE_3_length_22478_cov_23.780597_g1_i1 22478 0.5316
## 4 NODE_4_length_22111_cov_30.963285_g2_i0 22111 0.5897
## 5 NODE_5_length_21962_cov_26.892986_g3_i0 21962 0.5702
## 6 NODE_6_length_19553_cov_12.257742_g4_i0 19553 0.5099
annot = read.table("transcripts.fasta.Coral_Zooxanthellae_Classify.parse.txt",
sep="\t", quote="", stringsAsFactors=F, col.names=c("ID", "Group", "Evalue", "TopHit"))
head(annot)
## ID Group Evalue
## 1 NODE_1_length_25671_cov_21.867536_g0_i0 Zooxanthellae 0
## 2 NODE_2_length_22538_cov_23.707146_g1_i0 Zooxanthellae 0
## 3 NODE_3_length_22478_cov_23.780597_g1_i1 Zooxanthellae 0
## 4 NODE_4_length_22111_cov_30.963285_g2_i0 Zooxanthellae 0
## 5 NODE_5_length_21962_cov_26.892986_g3_i0 Zooxanthellae 0
## 6 NODE_6_length_19553_cov_12.257742_g4_i0 Zooxanthellae 0
## TopHit
## 1 Symbiodinium_sp_clade_C-16463
## 2 Symbiodinium_sp_clade_C-54602
## 3 Symbiodinium_sp_clade_C-54602
## 4 Symbiodinium_sp_clade_C-23670
## 5 Symbiodinium_sp_clade_A3-5501
## 6 Symbiodinium_sp_clade_A3-13600
annot.nr = read.table("transcripts.fasta.nr_diamond.parse.txt",
sep="\t", header=F, check.names=F, stringsAsFactors=F)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## γ―γͺγΌγγ§ε²γΎγγζεεδΈγ«EOFγγγγΎγ
annot.nr = annot.nr %>% select(V1, V5) %>%
mutate(phylum = sapply(V5, function(x) strsplit(x, ";")[[1]][2]),
class = sapply(V5, function(x) strsplit(x, ";")[[1]][3]),
order = sapply(V5, function(x) strsplit(x, ";")[[1]][4]),
family = sapply(V5, function(x) strsplit(x, ";")[[1]][4]),
genus = sapply(V5, function(x) strsplit(x, ";")[[1]][5]),
species = sapply(V5, function(x) strsplit(x, ";")[[1]][6]))
colnames(annot.nr)[1:2] = c("ID", "nr")
head(annot.nr)
## ID
## 1 NODE_1_length_25671_cov_21.867536_g0_i0
## 2 NODE_2_length_22538_cov_23.707146_g1_i0
## 3 NODE_3_length_22478_cov_23.780597_g1_i1
## 4 NODE_4_length_22111_cov_30.963285_g2_i0
## 5 NODE_5_length_21962_cov_26.892986_g3_i0
## 6 NODE_6_length_19553_cov_12.257742_g4_i0
## nr
## 1 Eukaryota;;Dinophyceae;Gonyaulacales;Goniodomataceae;Gambierdiscus;Gambierdiscus polynesiensis
## 2 Eukaryota;Endomyxa;;Plasmodiophorida;Plasmodiophoridae;Plasmodiophora;Plasmodiophora brassicae
## 3 Eukaryota;Endomyxa;;Plasmodiophorida;Plasmodiophoridae;Plasmodiophora;Plasmodiophora brassicae
## 4 Eukaryota;Endomyxa;;Plasmodiophorida;Plasmodiophoridae;Plasmodiophora;Plasmodiophora brassicae
## 5 Eukaryota;;Choanoflagellata;Craspedida;Salpingoecidae;Salpingoeca;Salpingoeca rosetta
## 6 Eukaryota;;Dinophyceae;Suessiales;Symbiodiniaceae;Symbiodinium;Symbiodinium microadriaticum
## phylum class order family genus
## 1 Dinophyceae Gonyaulacales Gonyaulacales Goniodomataceae
## 2 Endomyxa Plasmodiophorida Plasmodiophorida Plasmodiophoridae
## 3 Endomyxa Plasmodiophorida Plasmodiophorida Plasmodiophoridae
## 4 Endomyxa Plasmodiophorida Plasmodiophorida Plasmodiophoridae
## 5 Choanoflagellata Craspedida Craspedida Salpingoecidae
## 6 Dinophyceae Suessiales Suessiales Symbiodiniaceae
## species
## 1 Gambierdiscus
## 2 Plasmodiophora
## 3 Plasmodiophora
## 4 Plasmodiophora
## 5 Salpingoeca
## 6 Symbiodinium
GC content
figmat = mat %>% left_join(annot, by="ID") %>%
mutate(Group = factor(ifelse(is.na(Group), "Unclassified", Group), levels=c("Coral", "Zooxanthellae", "Unclassified") ))
head(figmat)
## ID Length GC Group Evalue
## 1 NODE_1_length_25671_cov_21.867536_g0_i0 25671 0.5940 Zooxanthellae 0
## 2 NODE_2_length_22538_cov_23.707146_g1_i0 22538 0.5314 Zooxanthellae 0
## 3 NODE_3_length_22478_cov_23.780597_g1_i1 22478 0.5316 Zooxanthellae 0
## 4 NODE_4_length_22111_cov_30.963285_g2_i0 22111 0.5897 Zooxanthellae 0
## 5 NODE_5_length_21962_cov_26.892986_g3_i0 21962 0.5702 Zooxanthellae 0
## 6 NODE_6_length_19553_cov_12.257742_g4_i0 19553 0.5099 Zooxanthellae 0
## TopHit
## 1 Symbiodinium_sp_clade_C-16463
## 2 Symbiodinium_sp_clade_C-54602
## 3 Symbiodinium_sp_clade_C-54602
## 4 Symbiodinium_sp_clade_C-23670
## 5 Symbiodinium_sp_clade_A3-5501
## 6 Symbiodinium_sp_clade_A3-13600
ggplot() +
geom_histogram(data=subset(figmat, Group=="Coral"), aes(x=GC*100, y=..density.., fill=Group), bins=50, alpha=0.3) +
geom_histogram(data=subset(figmat, Group=="Zooxanthellae"), aes(x=GC*100, y=..density.., fill=Group), bins=50, alpha=0.3) +
geom_histogram(data=subset(figmat, Group=="Unclassified"), aes(x=GC*100, y=..density.., fill=Group), bins=50, alpha=0.3) +
xlab("GC [%]") + ylab("Density") +
theme_bw() +
scale_fill_manual(values=c("Zooxanthellae"="green", "Coral"="red", "Unclassified"="grey")) +
ggsave("1_Contig_GC.density.pdf", width=5, height=3)
ggplot() +
geom_histogram(data=subset(figmat, Group=="Coral"), aes(x=GC*100, y=..count.., fill=Group), bins=50, alpha=0.3) +
geom_histogram(data=subset(figmat, Group=="Zooxanthellae"), aes(x=GC*100, y=..count.., fill=Group), bins=50, alpha=0.3) +
geom_histogram(data=subset(figmat, Group=="Unclassified"), aes(x=GC*100, y=..count.., fill=Group), bins=50, alpha=0.3) +
xlab("GC [%]") + ylab("Number of contigs") +
theme_bw() +
scale_fill_manual(values=c("Zooxanthellae"="green", "Coral"="red", "Unclassified"="grey")) +
ggsave("1_Contig_GC.count.pdf", width=5, height=3)
Length
stats = mat %>% left_join(annot, by="ID") %>%
mutate(Group=ifelse(is.na(Group), "Unclassified", Group)) %>%
mutate(Length_0_500 = Length >= 100 & Length < 500,
Length_500_1000 = Length >= 500 & Length < 1000,
Length_1000_2000 = Length >= 1000 & Length < 2000,
Length_2000_5000 = Length >= 2000 & Length < 5000,
Length_5000_ = Length >= 5000) %>%
group_by(Group) %>%
summarize(n_0_500 = sum(Length_0_500),
n_500_1000 = sum(Length_500_1000),
n_1000_2000 = sum(Length_1000_2000),
n_2000_5000 = sum(Length_2000_5000),
n_5000_ = sum(Length_5000_)) %>%
arrange(desc(n_5000_))
stats
## # A tibble: 3 x 6
## Group n_0_500 n_500_1000 n_1000_2000 n_2000_5000 n_5000_
## <chr> <int> <int> <int> <int> <int>
## 1 Zooxanthellae 3139 16339 25985 12714 1088
## 2 Coral 12600 38292 36473 17905 999
## 3 Unclassified 131214 21645 6251 940 29
figmat = stats %>%
gather(key="Length", value="n", -Group) %>%
as.data.frame() %>%
mutate(Length = gsub("n_", "", Length))
figmat$Length = factor(figmat$Length, levels=c("0_500", "500_1000", "1000_2000", "2000_5000", "5000_"))
figmat$Group = factor(figmat$Group, levels=c("Coral", "Zooxanthellae", "Unclassified"))
ggplot(figmat) +
geom_bar(aes(x=Group, y=n, fill=Length), stat="identity", position="fill") +
scale_y_continuous(labels = scales::percent) +
xlab(NULL) + ylab("Proportion [%]") +
scale_fill_jco() +
theme_bw() +
theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1, size=10)) +
ggsave("1_Contig_Length.Class.pdf", width=2.8, height=3)
Characteristics of Unclassified contigs
tmp = mat %>% left_join(annot, by="ID") %>%
left_join(annot.nr, by="ID") %>%
mutate(Group=ifelse(is.na(Group), "Unclassified", Group))
tmp %>% filter(Group == "Coral") %>% group_by(phylum, class, order) %>% count() %>% arrange(desc(n)) %>% head()
## # A tibble: 6 x 4
## # Groups: phylum, class, order [6]
## phylum class order n
## <chr> <chr> <chr> <int>
## 1 Cnidaria Anthozoa Scleractinia 52936
## 2 <NA> <NA> <NA> 51359
## 3 Cnidaria Anthozoa Actiniaria 1052
## 4 Cnidaria Anthozoa Alcyonacea 210
## 5 Proteobacteria Gammaproteobacteria "" 115
## 6 Arthropoda Hexanauplia Sessilia 51
tmp %>% filter(Group == "Zooxanthellae") %>% group_by(phylum, class, order) %>% count() %>% arrange(desc(n)) %>% head()
## # A tibble: 6 x 4
## # Groups: phylum, class, order [6]
## phylum class order n
## <chr> <chr> <chr> <int>
## 1 "" Dinophyceae Suessiales 30448
## 2 <NA> <NA> <NA> 23848
## 3 "" "" "" 834
## 4 Perkinsozoa "" Perkinsida 324
## 5 Cnidaria Anthozoa Scleractinia 215
## 6 Haptista Haptophyta Isochrysidales 183
tmp %>% filter(Group == "Unclassified") %>% group_by(phylum, class, order) %>% count() %>% arrange(desc(n)) %>% head()
## # A tibble: 6 x 4
## # Groups: phylum, class, order [6]
## phylum class order n
## <chr> <chr> <chr> <int>
## 1 <NA> <NA> <NA> 159084
## 2 Cnidaria Anthozoa Scleractinia 750
## 3 Cnidaria Anthozoa Actiniaria 28
## 4 Arthropoda Hexanauplia Sessilia 23
## 5 Streptophyta Magnoliopsida Fabales 16
## 6 Chlorophyta Trebouxiophyceae Chlorellales 14