Taxonomic assignments of the contigsΒΆ

1_Taxonomy.utf8
library(tidyverse)
library(ggsci)
library(useful)
library(RColorBrewer)
mat = read.table("transcripts.fasta.gc", sep="\t", header=T, comment.char="", 
                 stringsAsFactors=F, check.names=F)
colnames(mat)[1] = "ID"
head(mat)
##                                        ID Length     GC
## 1 NODE_1_length_25671_cov_21.867536_g0_i0  25671 0.5940
## 2 NODE_2_length_22538_cov_23.707146_g1_i0  22538 0.5314
## 3 NODE_3_length_22478_cov_23.780597_g1_i1  22478 0.5316
## 4 NODE_4_length_22111_cov_30.963285_g2_i0  22111 0.5897
## 5 NODE_5_length_21962_cov_26.892986_g3_i0  21962 0.5702
## 6 NODE_6_length_19553_cov_12.257742_g4_i0  19553 0.5099
annot = read.table("transcripts.fasta.Coral_Zooxanthellae_Classify.parse.txt",
                   sep="\t", quote="", stringsAsFactors=F, col.names=c("ID", "Group", "Evalue", "TopHit"))
head(annot)
##                                        ID         Group Evalue
## 1 NODE_1_length_25671_cov_21.867536_g0_i0 Zooxanthellae      0
## 2 NODE_2_length_22538_cov_23.707146_g1_i0 Zooxanthellae      0
## 3 NODE_3_length_22478_cov_23.780597_g1_i1 Zooxanthellae      0
## 4 NODE_4_length_22111_cov_30.963285_g2_i0 Zooxanthellae      0
## 5 NODE_5_length_21962_cov_26.892986_g3_i0 Zooxanthellae      0
## 6 NODE_6_length_19553_cov_12.257742_g4_i0 Zooxanthellae      0
##                           TopHit
## 1  Symbiodinium_sp_clade_C-16463
## 2  Symbiodinium_sp_clade_C-54602
## 3  Symbiodinium_sp_clade_C-54602
## 4  Symbiodinium_sp_clade_C-23670
## 5  Symbiodinium_sp_clade_A3-5501
## 6 Symbiodinium_sp_clade_A3-13600
annot.nr = read.table("transcripts.fasta.nr_diamond.parse.txt", 
                      sep="\t", header=F, check.names=F, stringsAsFactors=F)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## γ‚―γ‚ͺγƒΌγƒˆγ§ε›²γΎγ‚ŒγŸζ–‡ε­—εˆ—δΈ­γ«EOFγŒγ‚γ‚ŠγΎγ™
annot.nr = annot.nr %>% select(V1, V5) %>% 
  mutate(phylum = sapply(V5, function(x) strsplit(x, ";")[[1]][2]),
         class = sapply(V5, function(x) strsplit(x, ";")[[1]][3]),
         order = sapply(V5, function(x) strsplit(x, ";")[[1]][4]),
         family = sapply(V5, function(x) strsplit(x, ";")[[1]][4]),
         genus = sapply(V5, function(x) strsplit(x, ";")[[1]][5]),
         species = sapply(V5, function(x) strsplit(x, ";")[[1]][6]))
colnames(annot.nr)[1:2] = c("ID", "nr")
head(annot.nr)
##                                        ID
## 1 NODE_1_length_25671_cov_21.867536_g0_i0
## 2 NODE_2_length_22538_cov_23.707146_g1_i0
## 3 NODE_3_length_22478_cov_23.780597_g1_i1
## 4 NODE_4_length_22111_cov_30.963285_g2_i0
## 5 NODE_5_length_21962_cov_26.892986_g3_i0
## 6 NODE_6_length_19553_cov_12.257742_g4_i0
##                                                                                               nr
## 1 Eukaryota;;Dinophyceae;Gonyaulacales;Goniodomataceae;Gambierdiscus;Gambierdiscus polynesiensis
## 2 Eukaryota;Endomyxa;;Plasmodiophorida;Plasmodiophoridae;Plasmodiophora;Plasmodiophora brassicae
## 3 Eukaryota;Endomyxa;;Plasmodiophorida;Plasmodiophoridae;Plasmodiophora;Plasmodiophora brassicae
## 4 Eukaryota;Endomyxa;;Plasmodiophorida;Plasmodiophoridae;Plasmodiophora;Plasmodiophora brassicae
## 5          Eukaryota;;Choanoflagellata;Craspedida;Salpingoecidae;Salpingoeca;Salpingoeca rosetta
## 6    Eukaryota;;Dinophyceae;Suessiales;Symbiodiniaceae;Symbiodinium;Symbiodinium microadriaticum
##     phylum            class            order           family             genus
## 1               Dinophyceae    Gonyaulacales    Gonyaulacales   Goniodomataceae
## 2 Endomyxa                  Plasmodiophorida Plasmodiophorida Plasmodiophoridae
## 3 Endomyxa                  Plasmodiophorida Plasmodiophorida Plasmodiophoridae
## 4 Endomyxa                  Plasmodiophorida Plasmodiophorida Plasmodiophoridae
## 5          Choanoflagellata       Craspedida       Craspedida    Salpingoecidae
## 6               Dinophyceae       Suessiales       Suessiales   Symbiodiniaceae
##          species
## 1  Gambierdiscus
## 2 Plasmodiophora
## 3 Plasmodiophora
## 4 Plasmodiophora
## 5    Salpingoeca
## 6   Symbiodinium

GC content

figmat = mat %>% left_join(annot, by="ID") %>% 
          mutate(Group = factor(ifelse(is.na(Group), "Unclassified", Group), levels=c("Coral", "Zooxanthellae", "Unclassified") ))
head(figmat)
##                                        ID Length     GC         Group Evalue
## 1 NODE_1_length_25671_cov_21.867536_g0_i0  25671 0.5940 Zooxanthellae      0
## 2 NODE_2_length_22538_cov_23.707146_g1_i0  22538 0.5314 Zooxanthellae      0
## 3 NODE_3_length_22478_cov_23.780597_g1_i1  22478 0.5316 Zooxanthellae      0
## 4 NODE_4_length_22111_cov_30.963285_g2_i0  22111 0.5897 Zooxanthellae      0
## 5 NODE_5_length_21962_cov_26.892986_g3_i0  21962 0.5702 Zooxanthellae      0
## 6 NODE_6_length_19553_cov_12.257742_g4_i0  19553 0.5099 Zooxanthellae      0
##                           TopHit
## 1  Symbiodinium_sp_clade_C-16463
## 2  Symbiodinium_sp_clade_C-54602
## 3  Symbiodinium_sp_clade_C-54602
## 4  Symbiodinium_sp_clade_C-23670
## 5  Symbiodinium_sp_clade_A3-5501
## 6 Symbiodinium_sp_clade_A3-13600
ggplot() +
  geom_histogram(data=subset(figmat, Group=="Coral"), aes(x=GC*100, y=..density.., fill=Group), bins=50, alpha=0.3) +
  geom_histogram(data=subset(figmat, Group=="Zooxanthellae"), aes(x=GC*100, y=..density.., fill=Group), bins=50, alpha=0.3) +
  geom_histogram(data=subset(figmat, Group=="Unclassified"), aes(x=GC*100, y=..density.., fill=Group), bins=50, alpha=0.3) +
  xlab("GC [%]") + ylab("Density") +
  theme_bw() +
  scale_fill_manual(values=c("Zooxanthellae"="green", "Coral"="red", "Unclassified"="grey")) +
  ggsave("1_Contig_GC.density.pdf", width=5, height=3)

ggplot() +
  geom_histogram(data=subset(figmat, Group=="Coral"), aes(x=GC*100, y=..count.., fill=Group), bins=50, alpha=0.3) +
  geom_histogram(data=subset(figmat, Group=="Zooxanthellae"), aes(x=GC*100, y=..count.., fill=Group), bins=50, alpha=0.3) +
  geom_histogram(data=subset(figmat, Group=="Unclassified"), aes(x=GC*100, y=..count.., fill=Group), bins=50, alpha=0.3) +
  xlab("GC [%]") + ylab("Number of contigs") +
  theme_bw() +
  scale_fill_manual(values=c("Zooxanthellae"="green", "Coral"="red", "Unclassified"="grey")) +
  ggsave("1_Contig_GC.count.pdf", width=5, height=3)

Length

stats = mat %>% left_join(annot, by="ID") %>% 
          mutate(Group=ifelse(is.na(Group), "Unclassified", Group)) %>%
          mutate(Length_0_500 = Length >= 100 & Length < 500,
                 Length_500_1000 = Length >= 500 & Length < 1000,
                 Length_1000_2000 = Length >= 1000 & Length < 2000,
                 Length_2000_5000 = Length >= 2000 & Length < 5000,
                 Length_5000_ = Length >= 5000) %>%
          group_by(Group) %>% 
          summarize(n_0_500 = sum(Length_0_500),
                    n_500_1000 = sum(Length_500_1000),
                    n_1000_2000 = sum(Length_1000_2000),
                    n_2000_5000 = sum(Length_2000_5000),
                    n_5000_ = sum(Length_5000_)) %>%
          arrange(desc(n_5000_))
stats
## # A tibble: 3 x 6
##   Group         n_0_500 n_500_1000 n_1000_2000 n_2000_5000 n_5000_
##   <chr>           <int>      <int>       <int>       <int>   <int>
## 1 Zooxanthellae    3139      16339       25985       12714    1088
## 2 Coral           12600      38292       36473       17905     999
## 3 Unclassified   131214      21645        6251         940      29
figmat = stats %>% 
           gather(key="Length", value="n", -Group) %>% 
           as.data.frame() %>% 
           mutate(Length = gsub("n_", "", Length))

figmat$Length = factor(figmat$Length, levels=c("0_500", "500_1000", "1000_2000", "2000_5000", "5000_"))
figmat$Group = factor(figmat$Group, levels=c("Coral", "Zooxanthellae", "Unclassified"))
ggplot(figmat) + 
  geom_bar(aes(x=Group, y=n, fill=Length), stat="identity", position="fill") + 
  scale_y_continuous(labels = scales::percent) + 
  xlab(NULL) + ylab("Proportion [%]") +
  scale_fill_jco() +
  theme_bw() + 
  theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1, size=10)) +
  ggsave("1_Contig_Length.Class.pdf", width=2.8, height=3)

Characteristics of Unclassified contigs

tmp = mat %>% left_join(annot, by="ID") %>% 
  left_join(annot.nr, by="ID") %>%
  mutate(Group=ifelse(is.na(Group), "Unclassified", Group)) 

tmp %>% filter(Group == "Coral") %>% group_by(phylum, class, order) %>% count() %>% arrange(desc(n)) %>% head()
## # A tibble: 6 x 4
## # Groups:   phylum, class, order [6]
##   phylum         class               order            n
##   <chr>          <chr>               <chr>        <int>
## 1 Cnidaria       Anthozoa            Scleractinia 52936
## 2 <NA>           <NA>                <NA>         51359
## 3 Cnidaria       Anthozoa            Actiniaria    1052
## 4 Cnidaria       Anthozoa            Alcyonacea     210
## 5 Proteobacteria Gammaproteobacteria ""             115
## 6 Arthropoda     Hexanauplia         Sessilia        51
tmp %>% filter(Group == "Zooxanthellae") %>% group_by(phylum, class, order) %>% count() %>% arrange(desc(n)) %>% head()
## # A tibble: 6 x 4
## # Groups:   phylum, class, order [6]
##   phylum      class       order              n
##   <chr>       <chr>       <chr>          <int>
## 1 ""          Dinophyceae Suessiales     30448
## 2 <NA>        <NA>        <NA>           23848
## 3 ""          ""          ""               834
## 4 Perkinsozoa ""          Perkinsida       324
## 5 Cnidaria    Anthozoa    Scleractinia     215
## 6 Haptista    Haptophyta  Isochrysidales   183
tmp %>% filter(Group == "Unclassified") %>% group_by(phylum, class, order) %>% count() %>% arrange(desc(n)) %>% head()
## # A tibble: 6 x 4
## # Groups:   phylum, class, order [6]
##   phylum       class            order             n
##   <chr>        <chr>            <chr>         <int>
## 1 <NA>         <NA>             <NA>         159084
## 2 Cnidaria     Anthozoa         Scleractinia    750
## 3 Cnidaria     Anthozoa         Actiniaria       28
## 4 Arthropoda   Hexanauplia      Sessilia         23
## 5 Streptophyta Magnoliopsida    Fabales          16
## 6 Chlorophyta  Trebouxiophyceae Chlorellales     14