Differential expression analysisΒΆ

3.2_Coral_Bacteria.knit
library(tidyverse)
library(igraph)
library(openxlsx)
library(useful)
library(edgeR)
library(clusterProfiler)
library(pheatmap)

Load files

rename = function(mat){
  x2 = colnames(mat)
  x2[colnames(mat) == "201602_Is1"] = "201602_Mi1"
  x2[colnames(mat) == "201602_Mi1"] = "201602_Is1"
  colnames(mat) = x2
  mat = mat[,sort(colnames(mat))]
  mat
}
load("../Microbiome/1_BacterialComposition.RData")
genus = t(apply(genustab.c, 1, function(x, s) x/s*100, colSums(genustab.c)))
corner(genus)

load("Expression.UT.RData")
matCount.c = rename(matCount.c)
matFPKM.c = rename(matFPKM.c)
matCount.c = matCount.c[,grep("Ky", colnames(matCount.c), invert=T)]
matFPKM.c = matFPKM.c[,grep("Ky", colnames(matFPKM.c), invert=T)]
corner(matCount.c)

load("../Annotation/coral/Annotation.Coral.UT.RData")
TERM2GENE.GO = gene2GO.c[,c("GO", "Gene")]
genes_not_found = rownames(matCount.c)[!(rownames(matCount.c) %in% TERM2GENE.GO$Gene)]
TERM2GENE.GO = rbind(TERM2GENE.GO, cbind(GO="No hit", Gene=genes_not_found))
head(TERM2GENE.GO)

TERM2GENE.KO = gene2KO.c %>% left_join(ko2pathway) %>% select(Pathway, Gene)
genes_not_found = rownames(matCount.c)[!(rownames(matCount.c) %in% TERM2GENE.KO$Gene)]
TERM2GENE.KO = rbind(TERM2GENE.KO, cbind(Pathway="No hit", Gene=genes_not_found))
head(TERM2GENE.KO)

Functions

make_heatmap = function(mat, up_genes, down_genes, filename, width, height){
  # Sort by colony
  colony = meta[colnames(mat), "Colony"]
  idx = order(colony)
  mat = mat[c(up_genes,down_genes), idx]
  
  #
  gaps_col = cumsum(table(colony))
  gaps_col = gaps_col[1:(length(gaps_col)-1)]

  # Annotation
  annotation_col = data.frame(Colony=colony[idx], row.names=colnames(mat))
  annotation_colors = list(Colony = color.colony)
  
  # Labels
  labels = meta[colnames(mat), "Month"]
  
  # Make figure
  pheatmap(mat,
           scale="row",
           cluster_rows = F,
           cluster_cols = F,
           annotation_col = annotation_col,
           gaps_row = length(up_genes),
           gaps_col = gaps_col,
           labels_row = rep("", nrow(mat)),
           labels_col = rep("", ncol(mat)), #labels,
           annotation_colors = annotation_colors,
           breaks = seq(-2, 2, length=100),
           filename = filename, 
           width = width, 
           height = height)
}
barGSA = function(GSA_result, n, font=3, color_bar, font_color="grey20", border_color="#00000000", limits=c(0, 5)){
  mat = GSA_result[1:min(c(n, nrow(GSA_result))), ]
  figmat = data.frame(P=-log10(mat$p.adjust),
                      Term=Hmisc::capitalize(mat$Description),
                      stringsAsFactors = F)
  figmat$Term = gsub("MRNA", "mRNA", figmat$Term)
  figmat$Term = gsub("RRNA", "rRNA", figmat$Term)
  figmat$Term = gsub("^Ko:", "ko:", figmat$Term)
  figmat$Term = factor(figmat$Term, levels=rev(figmat$Term))
  y.lab = max(figmat$P) * 0.02
  fig = ggplot(figmat) +
    geom_bar(aes(x=Term, y=P), stat="identity", fill=color_bar, color=border_color) +
    geom_text(aes(label=Term, x=Term), y=max(limits)*0.01, hjust=0, size=font, color=font_color) +
    coord_flip() + theme_minimal() + xlab(NULL) + ylab(NULL) +
    theme(axis.text.x = element_text(size=10), axis.text.y = element_blank(),
          axis.ticks.y = element_blank(), axis.title.x = element_text(size=10),
          plot.title = element_blank()) +
    scale_y_continuous(limits=limits)
  fig
}
gene_boxplot = function(mat, gene, title){
  figmat = data.frame(Colony = meta[colnames(mat), "Colony"],
                      Value = as.numeric(mat[gene,]))
  fig = ggplot(figmat) + 
    geom_boxplot(aes(x=Colony, y=Value), coef=10, na.rm=T) + 
    geom_jitter(aes(x=Colony, y=Value, color=Colony), width=0.2, height=0) + 
    scale_color_manual(values=color.colony) +
    theme_minimal() + xlab(NULL) + ylab("log2(FPKM+1)") +
    ggtitle(paste0(gene, "\n", title)) + theme(plot.title = element_text(hjust=0.5, size=12), legend.position="none")
  fig
}

Model 2: Endozoicomonas (Is1,Is2 vs Others)

tmp = meta[colnames(matCount.c),] %>% mutate(Bacteria = Colony %in% c("Is1", "Is2"))
group = tmp$Bacteria

design = model.matrix(~Bacteria, data=tmp)
head(design)

d <- DGEList(counts=matCount.c, group=group)
d <- calcNormFactors(d)
d <- estimateGLMCommonDisp(d, design)
d <- estimateGLMTrendedDisp(d, design)
d <- estimateGLMTagwiseDisp(d, design)

fit <- glmFit(d, design)
lrt <- glmLRT(fit, coef = 2)
result.E = topTags(lrt, p.value = 0.01, n=Inf)

load("../Annotation/coral/Acropora_tenuis_UT.StringTie.transcripts.fasta.transdecoder.pep.swissprot_blastp.parse.Uniprot-mapping.RData")
colnames(mapping)[1] = "Uniprot"
mat = result.E %>% as.data.frame() %>% 
  rownames_to_column("Gene") %>% left_join(gene2Name.c, by="Gene") %>%
  filter(abs(logFC) > 1) %>%
  left_join(mapping, by="Uniprot") %>% unique()
  
write.xlsx(mat, file="3.2_Endozoicomonas_DEG.xlsx")
up_genes = result.E %>% as.data.frame() %>% rownames_to_column("Gene") %>%
  filter(logFC > log2(2) & FDR < 0.01) %>% pull(Gene) 
down_genes = result.E %>% as.data.frame() %>% 
  rownames_to_column("Gene") %>%
  filter(logFC < log2(0.5) & FDR < 0.01) %>% 
  pull(Gene)

make_heatmap(log2(matFPKM.c+1), up_genes, down_genes, "3.2_Endozoicomonas_DEG.Heatmap.pdf", width=3, height=2.5)
# Upregulated genes
gsa.up = enricher(up_genes, 
                  minGSSize=10, maxGSSize=500, 
                  universe=rownames(matFPKM.c), qvalueCutoff=0.1, 
                  TERM2GENE=TERM2GENE.GO)
gsa.up = as.data.frame(gsa.up)
terms = go2term(gsa.up$ID) %>% column_to_rownames("go_id")
gsa.up$Description = terms[gsa.up$ID, 1]
if(nrow(gsa.up) > 0) gsa.up = data.frame(Group="up", gsa.up, row.names=NULL)

# Downregulated genes
gsa.down = enricher(down_genes, 
                    minGSSize=10, maxGSSize=500, 
                    universe=rownames(matFPKM.c), qvalueCutoff=0.1, 
                    TERM2GENE=TERM2GENE.GO)
gsa.down = as.data.frame(gsa.down)
terms = go2term(gsa.down$ID) %>% column_to_rownames("go_id")
gsa.down$Description = terms[gsa.down$ID, 1]
if(nrow(gsa.down) > 0) gsa.down = data.frame(Group="down", gsa.down, row.names=NULL)

gsa = rbind(gsa.up, gsa.down)
write.xlsx(gsa, file="3.2_Endozoicomonas_DEG.GSA.GO.xlsx")

barGSA(gsa.up, n=10, color_bar="coral", limits=c(0,3), font_color="black", font=2.8) + 
  ggsave("3.2_Endozoicomonas_DEG.GSA.GO.Up.pdf", width=3.5, height=2.3)
barGSA(gsa.down, n=10, color_bar="turquoise", limits=c(0,6), font_color="black") + 
  ggsave("3.2_Endozoicomonas_DEG.GSA.GO.Down.pdf", width=3.5, height=1)
# Upregulated genes
gsa.up = enricher(up_genes, 
                  minGSSize=10, maxGSSize=500, 
                  universe=rownames(matFPKM.c), qvalueCutoff=0.1, 
                  TERM2GENE=TERM2GENE.KO)
gsa.up = as.data.frame(gsa.up)
if(nrow(gsa.up) > 0) gsa.up = data.frame(Group="up", gsa.up, row.names=NULL)

# Downregulated genes
gsa.down = enricher(down_genes, 
                    minGSSize=10, maxGSSize=500, 
                    universe=rownames(matFPKM.c), qvalueCutoff=0.1, 
                    TERM2GENE=TERM2GENE.KO)
gsa.down = as.data.frame(gsa.down)
if(nrow(gsa.down) > 0) gsa.down = data.frame(Group="down", gsa.down, row.names=NULL)

gsa = rbind(gsa.up, gsa.down)
write.xlsx(gsa, file="3.2_Endozoicomonas_DEG.GSA.KEGG.xlsx")
# Top hit gene (Ephrin receptor)
fig1 = gene_boxplot(log2(matFPKM.c+1), "MSTRG.5097", "Epha5") 
fig2 = gene_boxplot(log2(matFPKM.c+1), "MSTRG.31636", "Ephb1")
fig3 = gene_boxplot(log2(matFPKM.c+1), "MSTRG.38764", "Epha3")
cowplot::plot_grid(fig1, fig2, fig3, nrow=1) + ggsave("3.2_Endozoicomonas_DEG.Eph_receptor.pdf", width=6, height=2.5)

fig1 = gene_boxplot(log2(matFPKM.c+1), "MSTRG.12568", "P2rx7") 
fig2 = gene_boxplot(log2(matFPKM.c+1), "MSTRG.9878", "P2rx7")
fig3 = gene_boxplot(log2(matFPKM.c+1), "MSTRG.9924", "P2rx7")
cowplot::plot_grid(fig1, fig2, fig3, nrow=1) + ggsave("3.2_Endozoicomonas_DEG.P2X7.pdf", width=6, height=2.5)

Model 2: Anaplasma (Is1,Is3,Mi1,Mi2 vs Others)

tmp = meta[colnames(matCount.c),] %>% mutate(Bacteria = Colony %in% c("Is1", "Is3", "Mi1", "Mi2"))
group = tmp$Bacteria

design = model.matrix(~Bacteria, data=tmp)
head(design)

d <- DGEList(counts=matCount.c, group=group)
d <- calcNormFactors(d)
d <- estimateGLMCommonDisp(d, design)
d <- estimateGLMTrendedDisp(d, design)
d <- estimateGLMTagwiseDisp(d, design)

fit <- glmFit(d, design)
lrt <- glmLRT(fit, coef = 2)
result.A = topTags(lrt, p.value = 0.01, n=Inf)

load("../Annotation/coral/Acropora_tenuis_UT.StringTie.transcripts.fasta.transdecoder.pep.swissprot_blastp.parse.Uniprot-mapping.RData")
colnames(mapping)[1] = "Uniprot"
mat = result.A %>% as.data.frame() %>% 
  rownames_to_column("Gene") %>% left_join(gene2Name.c, by="Gene") %>%
  filter(abs(logFC) > 1) %>%
  left_join(mapping, by="Uniprot") %>% unique()
  
write.xlsx(mat, file="3.2_Anaplasma_DEG.xlsx")
up_genes = result.A %>% as.data.frame() %>% rownames_to_column("Gene") %>%
  filter(logFC > 1 & FDR < 0.01) %>% pull(Gene) 
down_genes = result.A %>% as.data.frame() %>% rownames_to_column("Gene") %>%
  filter(logFC < -1 & FDR < 0.01) %>% pull(Gene)

make_heatmap(log2(matFPKM.c+1), up_genes, down_genes, "3.2_Anaplasma_DEG.Heatmap.pdf", width=3, height=2.5)
# Upregulated genes
gsa.up = enricher(up_genes, 
                  minGSSize=10, maxGSSize=500, 
                  universe=rownames(matFPKM.c), qvalueCutoff=0.1, 
                  TERM2GENE=TERM2GENE.GO)
gsa.up = as.data.frame(gsa.up)
terms = go2term(gsa.up$ID) %>% column_to_rownames("go_id")
gsa.up$Description = terms[gsa.up$ID, 1]
if(nrow(gsa.up) > 0) gsa.up = data.frame(Group="up", gsa.up, row.names=NULL)

# Downregulated genes
gsa.down = enricher(down_genes, 
                    minGSSize=10, maxGSSize=500, 
                    universe=rownames(matFPKM.c), qvalueCutoff=0.1, 
                    TERM2GENE=TERM2GENE.GO)
gsa.down = as.data.frame(gsa.down)
terms = go2term(gsa.down$ID) %>% column_to_rownames("go_id")
gsa.down$Description = terms[gsa.down$ID, 1]
if(nrow(gsa.down) > 0) gsa.down = data.frame(Group="down", gsa.down, row.names=NULL)

gsa = rbind(gsa.up, gsa.down)
write.xlsx(gsa, file="3.2_Anaplasma_DEG.GSA.GO.xlsx")

barGSA(gsa.up, n=10, color_bar="coral", limits=c(0,3), font_color="black", font=2.7) + 
  ggsave("3.2_Anaplasma_DEG.GSA.GO.Up.pdf", width=3, height=2.3)
# Upregulated genes
gsa.up = enricher(up_genes, 
                  minGSSize=10, maxGSSize=500, 
                  universe=rownames(matFPKM.c), qvalueCutoff=0.1, 
                  TERM2GENE=TERM2GENE.KO)
gsa.up = as.data.frame(gsa.up)
if(nrow(gsa.up) > 0) gsa.up = data.frame(Group="up", gsa.up, row.names=NULL)

# Downregulated genes
gsa.down = enricher(down_genes, 
                    minGSSize=10, maxGSSize=500, 
                    universe=rownames(matFPKM.c), qvalueCutoff=0.1, 
                    TERM2GENE=TERM2GENE.KO)
gsa.down = as.data.frame(gsa.down)
if(nrow(gsa.down) > 0) gsa.down = data.frame(Group="down", gsa.down, row.names=NULL)

gsa = rbind(gsa.up, gsa.down)
write.xlsx(gsa, file="3.2_Anaplasma_DEG.GSA.KEGG.xlsx")