Data derived from Hu et al. 2018, complete github can be found here.
background on environment and research questions
summary of metaT data analysis done so far
Question for ordination analysis: How similar are the dominant microeukaryote taxonomic groups at each depth?
In order to appropriately assess across different taxonomic groups and sample depths, we subset the read counts so there were not zeroes. Meaning, we retained read counts of shared KEGG IDs across the 5 taxonomic groups. This subset of shared counts was created first (see 'counts-bytaxa-sharedKeggIDs.csv').
Below R script will walk through importing this data, performing normalization (considering replicates), and downstream ordination.
Load libraries needed.
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("edgeR")
# install.packages("tidyverse")
# install.packages("vegan)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(edgeR)
## Warning: package 'edgeR' was built under R version 4.1.1
## Loading required package: limma
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-7
Import .csv file
countfile <- read.csv("counts-bytaxa-sharedKeggIDs.csv")
# names(countfile) # column names
Normalize with edgeR. Using the DGEList()
command to assign which columns are count data, KEGG/annotation information, and replicates for the 3 sample depths.
dge.metaT.cca <- DGEList(counts = countfile[,3:15],
genes = countfile[1:2],
group=c(rep("Surface",6),
rep("Deep_150m",3),
rep("Deep_890m",4)))
# List library size, and samples as grouped replicates.
dge.metaT.cca$samples
## group lib.size norm.factors
## Sample7 Surface 1578111 1
## Sample8 Surface 841926 1
## Sample9 Surface 2219463 1
## Sample10 Surface 2707843 1
## Sample11 Surface 2225508 1
## Sample12 Surface 1928552 1
## Sample25 Deep_150m 2312033 1
## Sample26 Deep_150m 1484394 1
## Sample27 Deep_150m 2800940 1
## Sample28 Deep_890m 726718 1
## Sample29 Deep_890m 2904457 1
## Sample30 Deep_890m 2200135 1
## Sample31 Deep_890m 2524094 1
Note the last column from the previous output lists the 'norm.factors'. In this next step we will do a normalization called TMM or trimmed mean
#normalize library using trimmed mean of M-values
data <- calcNormFactors(dge.metaT.cca, method = "TMM")
# ?calcNormFactors() #explore other options
Change to normalized counts per million (not logged)
cpm_data <- cpm(data, normalized.lib.sizes=TRUE, log=FALSE)
cpm_data <- as.data.frame(cpm_data)
# Compile with annotation information ($genes)
cca_cpm <- data.frame(data$genes, cpm_data)
#Calculate relative abundance the mean CPM by taxonomic group.
norm.m.data_CPM <- cca_cpm %>%
pivot_longer(starts_with("Sample")) %>%
group_by(Level2, name) %>%
mutate(count_norm = value/sum(value)) %>%
select(-value) %>%
pivot_wider(names_from = K0_number, values_from = count_norm, values_fill = 0) %>%
unite(tmp, Level2, name, sep = "_", remove = FALSE) %>%
select(tmp, Level2, variable = name, everything()) %>%
column_to_rownames(var = "tmp") %>%
data.frame
# head(norm.m.data_CPM)
Add in values for symbols and colors.
#format a key:
key_forPCA<-function(df){
df$shape<-22 #surface
deep150<-c("Sample25", "Sample26", "Sample27")
df$shape[df$variable %in% deep150]=21
deep890<-c("Sample28", "Sample29", "Sample30", "Sample31")
df$shape[df$variable %in% deep890]=24
#select taxa:
df$taxcolor[df$Level2=="Dinoflagellate"]="#c2a5cf"
df$taxcolor[df$Level2=="Ciliate"]="#d53e4f"
df$taxcolor[df$Level2=="Diatom"]="#e6f598"
df$taxcolor[df$Level2=="Chlorophyte"]="#5aae61"
df$taxcolor[df$Level2=="Haptophyte"]="#f46d43"
return(df)
}
cpm_PCA.w.key<-key_forPCA(norm.m.data_CPM)
# Isolate key and count data only
CCA_data <- cpm_PCA.w.key %>% select(-Level2, -variable, -shape, -taxcolor) #only numerica in CCA input information
key <- cpm_PCA.w.key %>% select(Level2, variable, shape, taxcolor)
Input data is all numeric with rownames equal to the categories I want to show in the legend of my ordination. In this case, it is sample (depth) and taxa. Values are normalized counts (Relative abundance), and columns are each of the KEGG identities of the reads.
# head(CCA_data) # View data frame structure this way
set.seed(111) #for reproducibility
ca <- cca(CCA_data, scaling=TRUE)
Using base R to plot outputs.
#CCA ploting set up:
par(xpd = FALSE, oma = c(0,0,0,8))
# Isolate variances for axes
eig<-eigenvals(ca); x<-signif((eig[1]*100),3); y<-signif((eig[2]*100),3) #extract variances for CCA1 and CCA2 axes
plot(ca,type=c("points"),display=c("wa", "sites"),main="",xlab=bquote(.(x)*"%"), ylab=(bquote(.(y)*"%")))
points(ca,pch=key$shape,col="black",bg=key$taxcolor,cex=1.5)
par(xpd=NA)
legend(4,1.5,c("Surface", "150m", "890m"),pch=c(0,1,2),col=c("black"),cex=1,pt.cex=1.6,bty="n",y.intersp=2)
legend(4,-1.5,c("Dinoflagellate","Ciliate","Haptophyte","Diatom","Chlorophyte"),pch=22,pt.bg=c("#c2a5cf","#d53e4f", "#f46d43", "#e6f598", "#5aae61"),col=c("black","black","black","black","black"),y.intersp=2,bty="n",pt.cex=2,cex=1)