MAGs clustered as Dictyochophyceae or putative stramenopiles isolated as a case study to demonstrate how we can use metatranscriptomic data to explore the ecological niches of stramenopiles. Putative stramenopile MAGs were assigned to the broader SAR group (Stramenopiles, Alveolata, and Rhizaria) based on MMSeqs and EUKulele (see Figure 2), but one phylogenetically similar clade was chosen to represent putative heterotrophic stramenopiles likely related to Dictyochophyceae. Core questions to address:
(1) What is the biogeography of these MAGs with respect to region, depth, and size fraction? (2) How does the distribution of MAGs related to predicted trophic mode? (3) What are the core ecological differences among these taxa? (4) Are the currently SAR-assigned MAGs related to the more heterotrophic Dictyochophyceae species?
library(tidyverse); library(broom); library(ggtext); library(directlabels)
library(ggupset); library(cowplot); library(pheatmap)
Import and clean metadata for all MAGs.
<- read.delim("../data/SampleList_2020_metaT.txt")
metaT_metadata <- read.delim("../data/SampleList_2020_metaG.txt")
metaG_metadata # Import Lat & Long
<- read.delim("/Users/sarahhu/Desktop/Projects/eukheist_proj/dictyochophyceae-analysis/input-data/PRJEB4352_metaG_wenv.txt") tmp_metadata
# Separate ERRs into rows & keep only what I need
<- metaG_metadata %>%
metaG_reformat separate_rows(ERR_list, sep = ", ") %>%
separate(Sub_region, c("REGION", "subregion"), sep = "-", remove = FALSE) %>%
separate(Depth_sizefrac, c("DEPTH", "min", "max"), sep = "-", remove = FALSE) %>%
unite(SIZEFRAC, min, max, sep = "-") %>%
select(-Assembly_group, -ERR_count, run_accession = "ERR_list", REGION, DEPTH, SIZEFRAC) %>%
left_join(tmp_metadata %>% select(run_accession, sampleid = Sample.material, Longitude, Latitude, Station)) %>%
distinct()
24 MAGs selected by membership to Dictyochophyceae and closely related heterotroph-predicted stramenopile clade.
<- read.delim("../data/mags-of-interest-24.txt", header = FALSE)
mags_tmp <- as.character(mags_tmp$V1);mags_of_interest mags_of_interest
## [1] "TOPAZ_SAS1_E003" "TOPAZ_MSS1_E028" "TOPAZ_IOS1_E045" "TOPAZ_MSS1_E004"
## [5] "TOPAZ_SAS1_E036" "TOPAZ_SPS1_E066" "TOPAZ_SAS1_E034" "TOPAZ_SPS1_E018"
## [9] "TOPAZ_SPS1_E054" "TOPAZ_MSS1_E011" "TOPAZ_NPS1_E005" "TOPAZ_NAS1_E036"
## [13] "TOPAZ_NAX1_E011" "TOPAZ_MSS1_E023" "TOPAZ_NPS1_E025" "TOPAZ_SPS1_E074"
## [17] "TOPAZ_SPS1_E030" "TOPAZ_MSD1_E003" "TOPAZ_MSS1_E026" "TOPAZ_NAS1_E023"
## [21] "TOPAZ_SAD1_E033" "TOPAZ_SPD1_E019" "TOPAZ_SPS1_E070" "TOPAZ_SAS1_E035"
n = 13 from SAR n = 11 are dictyochophyte MAGs (derived from EUKulele & MMSeqs taxonomy assignment)
Import MAG files, including a name schematic key for the MAG TOPAZ-based naming. Also add the estimated taxonomy assignments.
<- read.csv("/Users/sarahhu/Desktop/Projects/eukheist_proj/dictyochophyceae-analysis/input-data/MAG_tpm.csv")
mag_counts_updated <- read.delim("/Users/sarahhu/Desktop/Projects/eukheist_proj/dictyochophyceae-analysis/input-data/renamed-eukaryotic-mags.tsv", sep = "\t")
mag_rename_key
# Import MAG taxonomic assignments (Feb 2, 2021)
<- read.csv("/Users/sarahhu/Desktop/Projects/eukheist_proj/dictyochophyceae-analysis/input-data/2021-marzoanmmetsp-estimated-taxonomy-levels.csv")
mag_taxonomy # head(mag_taxonomy)
Import list of MAGs with EUKulele taxonomy assignment at lower threshold (n=33); this way it includes more MAGs with dictyochophyte assignments. Incorporate BUSCO completeness and contamination to pare down the list.
<- read.delim("/Users/sarahhu/Desktop/Projects/eukheist_proj/dictyochophyceae-analysis/input-data/list_dictyochophyceae.txt", header = FALSE)
euk_thres <- read.csv("/Users/sarahhu/Desktop/Projects/eukheist_proj/dictyochophyceae-analysis/input-data/EUK_BUSCO_CC.csv") buscos
Isolate TPM MAG abundances.
<- mag_counts_updated %>%
mag_tpm left_join(mag_rename_key, by = c("Genome" = "old_mag_name")) %>%
filter(new_mag_name %in% mags_of_interest) %>%
pivot_longer(cols = starts_with("ERR"), names_to = "run_accession", values_to = "tpm") %>%
left_join(metaG_reformat) %>%
select(mag = new_mag_name, -Genome, everything()) %>%
data.frame
## Joining, by = "run_accession"
# head(mag_tpm)
# length(unique(mag_tpm$mag))
Sum TPM of MAGs based on Size Fraction, Location, and Depth
<- mag_tpm %>%
mag_tpm_location filter(!is.na(REGION)) %>%
group_by(mag, DEPTH, REGION, SIZEFRAC) %>%
summarise(mag_tpm = sum(tpm)) %>%
mutate(log_mag_tpm = log(mag_tpm)) %>%
unite(SAMPLE, REGION, DEPTH, SIZEFRAC, sep = " ", remove = FALSE) %>%
filter(!is.na(mag_tpm))
## `summarise()` has grouped output by 'mag', 'DEPTH', 'REGION'. You can override using the `.groups` argument.
# head(mag_tpm_location)
hist(mag_tpm_location$log_mag_tpm)
Save output R objects as data checkpoints to add to available repo.
# save(mag_tpm_location, mag_tpm, file = "../data/MAG-casestudy-Stramenopile-2021.RData")
Load existing data from R objects.
load("../data/MAG-casestudy-Stramenopile-2021.RData", verbose = TRUE)
## Loading objects:
## mag_tpm_location
## mag_tpm
Visualize MAGs with pheatmap
<- mag_tpm_location %>%
pheat_mags select(mag, SAMPLE, log_mag_tpm) %>%
filter(!is.na(log_mag_tpm)) %>%
mutate_if(is.numeric, function(x) ifelse(is.infinite(x), 0, x)) %>%
pivot_wider(id_cols = SAMPLE, names_from = mag, values_from = log_mag_tpm, values_fill = 0) %>%
column_to_rownames(var = "SAMPLE")
## Adding missing grouping variables: `DEPTH`, `REGION`
## `mutate_if()` ignored the following grouping variables:
## Columns `mag`, `DEPTH`, `REGION`
# Obtain annotation
<- mag_tpm_location %>%
annotate_mags ungroup() %>%
filter(!is.na(REGION)) %>%
select(SAMPLE, REGION, DEPTH, SIZEFRAC) %>%
distinct() %>%
column_to_rownames(var = "SAMPLE")
rownames(annotate_mags) <- rownames(pheat_mags)
# Specify color schema
= list(
annotation_colors REGION = c(IO= "#711518", MS= "#ce536b", NAO= "#c76b4a", NPO= "#dfa837", SAO= "#93b778", SO= "#61ac86", SPO= "#657abb", RS= "#67765b"),
DEPTH = c(DCM= "#74a9cf", FSW= "#969696", SRF= "#d0d1e6", MIX= "#0570b0", MES= "#023858", ZZZ= "#252525"),
SIZEFRAC = c(`0.8-5.00` = "#f7fcb9",`180-2000.00` = "#004529", `20-180.00` = "#41ab5d",`5-20.00` = "#addd8e"))
MAG relative abundace by sample and TPM.
# png("../si-figures/SI-SAR-Dictyocho-MAG-MAG-relabun-bysample.png", h=1100, w=800)
pheatmap(pheat_mags,
annotation_row = annotate_mags,
annotation_colors = annotation_colors,
scale = "none",
cluster_cols = TRUE,
cluster_rows = TRUE,
clustering_method = "average",
cellwidth = 10, cellheight = 10,
main = "MAG Relative Abundance (CPM)")
# dev.off()
Plot MAGs based on global distribution - TPM.
library(rworldmap)
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type : vignette('rworldmap')
Figure out latitute and longitude
<- mag_tpm %>%
mag_tpm_tmp mutate(lat_1 = round(Latitude, digits = 1),
long_1 = round(Longitude, digits = 1),
lat_2 = round(Latitude, digits = 2),
lat_3 = round(Latitude, digits = 3),
long_2 = round(Longitude, digits = 2),
long_3 = round(Longitude, digits = 3))
# length(unique(mag_tpm_tmp$lat_1))
# length(unique(mag_tpm_tmp$long_1))
Make summary data frame by latitute and longitude.
<- mag_tpm_tmp %>%
mag_tpm_map filter(tpm > 0) %>%
filter(!is.na(REGION)) %>%
group_by(mag, DEPTH, REGION, SIZEFRAC, lat_1, long_1) %>%
summarise(mag_tpm_latlong = sum(tpm)) %>%
mutate(log_mag_tpm = log(mag_tpm_latlong)) %>%
unite(SAMPLE, REGION, DEPTH, SIZEFRAC, sep = " ", remove = FALSE) %>%
filter(!is.na(mag_tpm_latlong)) %>%
mutate(Location = paste(lat_1, long_1))
## `summarise()` has grouped output by 'mag', 'DEPTH', 'REGION', 'SIZEFRAC', 'lat_1'. You can override using the `.groups` argument.
Get background world map
<- function(resolution="coarse"){
map_get_world <- rworldmap::getMap(resolution = resolution) # Change to "coarse" for global maps / "low" for regional maps
worldMap <- fortify(worldMap)
world.points $region <- world.points$id
world.points<- world.points[,c("long","lat","group", "region")]
world.df
}# ?rworldmap::getMap
<- function(color_continents = "#969696", color_borders = "#969696", resolution = "coarse") {
map_world <- map_get_world(resolution)
world.df <- ggplot() +
map geom_polygon(data = world.df, aes(x=long, y = lat, group = group), fill=color_continents, color=color_borders) +
# scale_fill_manual(values= color_continents , guide = FALSE) +
scale_x_continuous(breaks = (-4:4) * 45) +
scale_y_continuous(breaks = (-2:2) * 30) +
xlab("Longitude") + ylab("Latitude") +
coord_fixed(1.3) +
theme_bw()
# species_map <- species_map + coord_map () # Mercator projection
# species_map <- species_map + coord_map("gilbert") # Nice for the poles
return(map)
}
Factor for depths for the map.
<- c("DCM", "FSW", "SRF", "MIX", "MES", "ZZZ")
depth_order <- c("#74a9cf", "#969696", "#d0d1e6", "#0570b0", "#023858", "#252525")
depth_color $DEPTH_ORDER <- factor(mag_tpm_map$DEPTH, levels = depth_order)
mag_tpm_map#
<- c("TOPAZ_MSS1_E028","TOPAZ_SAS1_E003","TOPAZ_NAX1_E011","TOPAZ_IOS1_E045","TOPAZ_SAS1_E036","TOPAZ_NAS1_E036","TOPAZ_MSS1_E004","TOPAZ_SAD1_E033","TOPAZ_MSS1_E023","TOPAZ_NPS1_E025","TOPAZ_MSS1_E011","TOPAZ_NPS1_E005","TOPAZ_SPS1_E018","TOPAZ_SPS1_E054","TOPAZ_MSD1_E003","TOPAZ_MSS1_E026","TOPAZ_SAS1_E035","TOPAZ_NAS1_E023","TOPAZ_SPS1_E030","TOPAZ_SPS1_E074","TOPAZ_SPD1_E019","TOPAZ_SPS1_E070","TOPAZ_SAS1_E034","TOPAZ_SPS1_E066")
mag_order $MAG_ORDER <- factor(mag_tpm_map$mag, levels = mag_order) mag_tpm_map
# png("../si-figures/SI-stramenopile-MAG-map.png", w=1100,h=850)
map_world() +
geom_jitter(data = mag_tpm_map,
aes(x = long_1, y = lat_1,
fill = DEPTH_ORDER,
size = mag_tpm_latlong),
shape = 21, color = "black") +
scale_size_continuous(range = c(1,14)) +
scale_fill_manual(values = depth_color) +
facet_wrap(MAG_ORDER ~ ., ncol = 4) +
theme(strip.background = element_blank(),
strip.text = element_text(face = "bold", size = 14, hjust = 1, vjust = 1),
axis.text = element_text(size = 10, hjust = 1, vjust = 1),
legend.title = element_blank())
## Regions defined for each Polygons
## Warning: Removed 24 rows containing missing values (geom_point).
# dev.off()
Import the results from metatranscriptome mapping onto putative dictyochophyte and SAR group MAGs. Explore trophic roles and shared/unique functional profiles.
For loop to determine summed transcript counts that hit the 11 core putative dictyochophyte MAGs.
# For loop to import all data and compile
<- list.files(path = "/Users/sarahhu/Desktop/Projects/eukheist_proj/dictyochophyceae-analysis/input-data",pattern = "tpm.summedcounts")
summed_counts # summed_counts
# rm(imported)
# rm(compiled_metaT_hits)
for (a in summed_counts) {
<- read.csv(paste("/Users/sarahhu/Desktop/Projects/eukheist_proj/dictyochophyceae-analysis/input-data/", a, sep = ""))
imported <- unlist(strsplit(a,".tpm.summedcounts"))
names $MAG <- names[1]
importedif (!exists("compiled_metaT_hits")) {
<- imported
compiled_metaT_hits else {
} <- rbind(compiled_metaT_hits, imported)
compiled_metaT_hits
}rm(imported)
}# rm(compiled_metaT_hits)
# rm(imported)
# Rename MAGs with TOPAZ name schematic
<- compiled_metaT_hits %>%
metaT_mapped left_join(mag_rename_key, by = c("MAG" = "old_mag_name")) %>%
select(-MAG) %>%
select(MAG = new_mag_name, everything()) %>%
data.frame# head(metaT_mapped)
# dim(compiled_metaT_hits)
Checkpoint for saving the metatranscriptome data with TOPAZ naming schematic.
# save(metaT_mapped, mag_rename_key, file = "../data/MAG-casestudy-Stramenopile-metaT-2021.RData")
Match KEGG identities to kegg module information and curated list of kegg pathways of interest.
Import metatranscriptome data from MAGs of interest.
load("../data/MAG-casestudy-Stramenopile-metaT-2021.RData", verbose=T)
## Loading objects:
## metaT_mapped
## mag_rename_key
<- read.csv("../data/mag_heterotrophy_rf_13May2021.csv")
trophy # head(mag_rename_key)
# head(trophy)
<- trophy %>%
trophy_refine left_join(mag_rename_key, by = c("MAGs" ="old_mag_name")) %>%
select(-MAGs)
# head(trophy_refine)
# mag_order
# write_delim(filter(trophy_refine, new_mag_name %in% mag_order) %>% distinct(), path = "trophy-24mags.txt", delim = "\t")
<- metaT_mapped %>%
metaT_mapped_trophy left_join(trophy_refine, by = c("MAG" = "new_mag_name"))
# Get list of phototrophy predicted MAGs
<- metaT_mapped_trophy %>%
tmp filter(PredictedTrophicMode == "Phototroph")
<- as.character(unique(tmp$MAG))
photo_mags # length(photo_mags)
# head(metaT_mapped)
<- metaT_mapped %>%
MAG_dendro_binary type.convert(as.is = TRUE) %>%
select(MAG, KO) %>%
add_column(BIN = 1) %>%
pivot_wider(names_from = "KO", values_from = BIN, values_fill = 0) %>%
column_to_rownames(var = "MAG") %>%
data.frame# ?pivot_wider
# head(MAG_dendro_binary)
library(ggdendro)
<- hclust(dist((MAG_dendro_binary)), method = "average")
cluster_mags <- dendro_data(as.dendrogram(cluster_mags), type = "rectangle")
dendro_mags
# head(dendro_mags)
<- as.character(dendro_mags$labels$label)
labeled_dendro_order # labeled_dendro_order
Add trophy information
<- label(dendro_mags)
labs <- labs %>%
labs_refined mutate(group = case_when(
%in% photo_mags ~ "Phototroph",
label TRUE ~ "Heterotroph"
))# head(labs_refined)
Plot dendrogram - derived from the presence/absence of known orthologs.
<- ggplot(segment(dendro_mags)) +
dendro_mag_plot geom_segment(aes(x = x, y = y, xend = xend,
yend = yend)) +
coord_flip() +
scale_y_reverse(expand = c(1, 1)) +
geom_text(aes(x = x, y = y, label = label, angle = 0,
hjust = 0, color = labs_refined$group),
data = label(dendro_mags), fontface = "bold", size = 5) +
scale_color_manual(values = c("#CC3D6D", "#B4CC3D")) +
theme_dendro() +
theme(legend.position = "bottom", legend.title = element_blank())
# svg("dendrogram-plot-trophy.svg", w=7, h=10)
dendro_mag_plot
# dev.off()
<- rev(labeled_dendro_order) mag_pie_order
Prepare data frame and factor.
<- mag_tpm_location %>%
region_mags_df replace_na(list(mag_tpm=0)) %>%
group_by(REGION, mag) %>%
summarise(mag_sum_tpm = sum(mag_tpm))
## `summarise()` has grouped output by 'REGION'. You can override using the `.groups` argument.
<- c("IO","MS","NAO","NPO","SAO","SO","SPO","RS")
region_order <- c("#711518", "#ce536b", "#c76b4a", "#dfa837", "#93b778", "#61ac86", "#657abb", "#67765b")
region_color $REGION_ORDER <- factor(region_mags_df$REGION, levels = region_order)
region_mags_df$MAG_ORDER <- factor(region_mags_df$mag, levels = mag_pie_order)
region_mags_dfnames(region_color) <- region_order
# View(region_dictyo)
Plot pies by region
<- region_mags_df %>%
region_mag add_column(fake = "fake") %>%
ggplot(aes(x = fake, fill = REGION_ORDER, y = mag_sum_tpm)) +
geom_bar(color = "white", stat = "identity", position = "fill") +
scale_fill_manual(values = region_color) +
# theme_linedraw() +
coord_polar(theta = "y") +
# coord_flip() +
facet_wrap(~MAG_ORDER, ncol = 1) +
# scale_y_continuous(expand = c(0,0)) +
theme_void() +
theme(legend.title = element_blank(),
strip.text = element_text(size = 3),
# plot.margin = element_blank(),
plot.background = element_blank()) +
labs(y = "MAG Abundance TPM", x = "Putative Dictyochophyte MAGs") #+
# theme(strip.text = element_text(size = rel(3.0), vjust = -4.0),
# panel.margin.y = unit(-2, "lines"))
# region_mag
Prepare data frame and factor.
<- mag_tpm_location %>%
depth_mags_df replace_na(list(mag_tpm=0)) %>%
group_by(DEPTH, mag) %>%
summarise(mag_sum_tpm = sum(mag_tpm))
## `summarise()` has grouped output by 'DEPTH'. You can override using the `.groups` argument.
<- c("DCM", "FSW", "SRF", "MIX", "MES", "ZZZ")
depth_order <- c("#74a9cf", "#969696", "#d0d1e6", "#0570b0", "#023858", "#252525")
depth_color $DEPTH_ORDER <- factor(depth_mags_df$DEPTH, levels = depth_order)
depth_mags_df
$MAG_ORDER <- factor(depth_mags_df$mag, levels = mag_pie_order)
depth_mags_dfnames(depth_color) <- depth_order
# View(depth_dictyo)
Plot pies by depth
<- depth_mags_df %>%
depth_mag add_column(fake = "fake") %>%
ggplot(aes(x = fake, fill = DEPTH_ORDER, y = mag_sum_tpm)) +
geom_bar(color = "white", stat = "identity", position = "fill") +
scale_fill_manual(values = depth_color) +
# theme_linedraw() +
coord_polar(theta = "y") +
# coord_flip() +
facet_wrap(~MAG_ORDER, ncol = 1) +
# scale_y_continuous(expand = c(0,0)) +
theme_void() +
theme(legend.title = element_blank(),
strip.text = element_text(size = 4)) +
labs(y = "MAG Abundance TPM", x = "Putative Dictyochophyte MAGs")
Prepare data frame and factor.
<- mag_tpm_location %>%
frac_mags_df replace_na(list(mag_tpm=0)) %>%
group_by(SIZEFRAC, mag) %>%
summarise(mag_sum_tpm = sum(mag_tpm))
## `summarise()` has grouped output by 'SIZEFRAC'. You can override using the `.groups` argument.
<- c("0.8-5.00", "5-20.00", "20-180.00", "180-2000.00")
frac_order <- c("#f7fcb9", "#addd8e", "#41ab5d", "#004529")
frac_color $FRAC_ORDER <- factor(frac_mags_df$SIZEFRAC, levels = frac_order)
frac_mags_df
$MAG_ORDER <- factor(frac_mags_df$mag, levels = mag_pie_order)
frac_mags_dfnames(frac_color) <- frac_order
# View(frac_dictyo)
Plot pies by size fraction
<- frac_mags_df %>%
frac_mag add_column(fake = "fake") %>%
ggplot(aes(x = fake, fill = FRAC_ORDER, y = mag_sum_tpm)) +
geom_bar(color = "white", stat = "identity", position = "fill") +
scale_fill_manual(values = frac_color) +
# theme_linedraw() +
coord_polar(theta = "y") +
# coord_flip() +
facet_wrap(~MAG_ORDER, ncol = 1) +
# scale_y_continuous(expand = c(0,0)) +
theme_void() +
theme(legend.title = element_blank(),
strip.text = element_text(size = 4)) +
labs(y = "MAG Abundance TPM", x = "Putative Dictyochophyte MAGs")
Prepare data frame and factor.
head(mag_tpm_location)
## # A tibble: 6 x 7
## # Groups: mag, DEPTH, REGION [2]
## mag SAMPLE DEPTH REGION SIZEFRAC mag_tpm log_mag_tpm
## <fct> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 TOPAZ_IOS1_E045 IO DCM 0.8-5.00 DCM IO 0.8-5.00 8598. 9.06
## 2 TOPAZ_IOS1_E045 IO DCM 180-2000.00 DCM IO 180-2000.… 2176. 7.69
## 3 TOPAZ_IOS1_E045 IO DCM 20-180.00 DCM IO 20-180.00 3185. 8.07
## 4 TOPAZ_IOS1_E045 IO DCM 5-20.00 DCM IO 5-20.00 162. 5.09
## 5 TOPAZ_IOS1_E045 MS DCM 0.8-5.00 DCM MS 0.8-5.00 1648. 7.41
## 6 TOPAZ_IOS1_E045 MS DCM 180-2000.00 DCM MS 180-2000.… 1950. 7.58
<- mag_tpm_location %>%
total_relabun_mag replace_na(list(mag_tpm=0)) %>%
group_by(mag) %>%
summarise(mag_sum_tpm = sum(mag_tpm))
$MAG_ORDER <- factor(total_relabun_mag$mag, levels = mag_pie_order) total_relabun_mag
<- total_relabun_mag %>%
rel_abun add_column(fake = "fake") %>%
ggplot(aes(x = fake, size = mag_sum_tpm, y = mag)) +
geom_point(color = "black") +
scale_size_continuous(range = c(1, 17)) +
facet_wrap(~MAG_ORDER, ncol = 1) +
# scale_y_continuous(expand = c(0,0)) +
theme_void() +
theme(legend.title = element_blank(),
strip.text = element_text(size = 4))
Compiled plots. Same order as the dendrogram above, so these pie charts can be lined up with the dendrogram.
# svg("pies-mag-abun-withoutlabels.svg", h = 20, w = 15)
# plot_grid(region_mag, depth_mag, frac_mag, rel_abun, nrow = 1)
plot_grid(region_mag + theme(strip.text = element_text(size = rel(3.0), vjust = -4.0),
panel.margin.y = unit(-2, "lines")),
+ theme(strip.text = element_text(size = rel(3.0), vjust = -4.0),
depth_magpanel.margin.y = unit(-2, "lines")),
+ theme(strip.text = element_text(size = rel(3.0), vjust = -4.0),
frac_magpanel.margin.y = unit(-2, "lines")),
rel_abun,nrow = 1)
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
# dev.off()
First, incorporate KEGG module information into mapped metatranscriptome data.
<- select(metaT_mapped, KO) %>% distinct()
keggs_to_match # head(keggs_to_match)
Import provided KEGG information (module information) and list of KEGG IDs selected to model trophic mode.
<- read.csv("../data/vita_vars.csv")
kegg_vita load(file = "../data/KEGG-key.RData", verbose = TRUE)
## Loading objects:
## kegg
## K0_gene_wMods
<- keggs_to_match %>%
kegg_key_dictyo left_join(kegg_vita %>% add_column(VITA = "vita"), by = c("KO" = "Selected.KOs")) %>%
left_join(kegg, by = c("KO" = "KO_number")) %>%
data.frame
# head(kegg_key_dictyo)
# dim(keggs_to_match);dim(kegg_key_dictyo)
Supplementary figure showing shared and unique metatranscriptome reads among selected MAGs.
# png("../si-figures/SI-SAR-Dictyocho-MAG-KEGG-intersection.png", h=1000, w=900)
%>%
metaT_mapped_trophy type.convert(as.is = TRUE) %>%
pivot_longer(cols = starts_with("ERR"), names_to = "SAMPLEID", values_to = "transcript_tpm") %>%
left_join(kegg_key_dictyo %>% select(KO, B) %>% distinct()) %>%
group_by(MAG, KO, B) %>%
summarise(transcript_tpm_sum = sum(transcript_tpm)) %>%
ungroup() %>%
group_by(KO, B) %>%
summarise(transcript_to_mag = list(MAG)) %>%
ggplot(aes(transcript_to_mag)) +
# geom_bar(color = "black", fill = "#AC7070", width = 0.7) +
geom_bar(color = "black", width = 0.7, aes(fill = B)) +
scale_fill_brewer(palette = "Dark2") +
scale_x_upset(order_by = "freq",
n_intersections = 30, position = "top") +
scale_y_continuous(expand = c(0,0)) +
theme_classic() +
theme_combmatrix(combmatrix.label.text = element_text(color = "black", size = 14),
combmatrix.label.extra_spacing = 10) +
theme(axis.text = element_text(color = "black", face = "bold", size = 12),
legend.title = element_blank()) +
labs(x = "Frequency across MAGs",
y = "Shared KEGG IDs",
title = "")
## Joining, by = "KO"
## `summarise()` has grouped output by 'MAG', 'KO'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'KO'. You can override using the `.groups` argument.
## Warning: Removed 3805 rows containing non-finite values (stat_count).
# dev.off()
Following selection of specific MAGs that may represent phototrophic dictyochophyceae, mixotrophic dictyochophyceae, and MAST - look at the relative TPM expression of these core gene set (see above section for selection) across different ocean regions and depths.
Select MAGs of interest
<- c("TOPAZ_MSS1_E011","TOPAZ_SAS1_E035","TOPAZ_SPS1_E066","TOPAZ_NAX1_E011")
mags_to_highlight library(gghighlight)
# mags_to_highlight <- c("TOPAZ_MSS1_E011","TOPAZ_SAS1_E035","TOPAZ_SPS1_E066","TOPAZ_NAX1_E011")
<- function(mag){
fxn_pca_highlightMAG ggplot(pca_shared_all, aes(x = PC1, y = PC2, size = HeterotrophyScore, fill = MAG)) +
geom_point(aes(fill = MAG), shape = 21,
stroke = 0.6) +
scale_size_binned(nice.breaks = TRUE, n.breaks = 8, range = c(0.4,8)) +
theme_bw() +
scale_fill_manual(values = c("#fb6a4a")) +
theme(axis.text = element_text(size = 10, color = "black")) + guides(fill = guide_legend(ncol = 2)) +
gghighlight(MAG == mag)
}# svg("highlight-pca.svg", h = 14, w = 5.5)
# plot_grid(fxn_pca_highlightMAG("TOPAZ_MSS1_E011"),
# fxn_pca_highlightMAG("TOPAZ_SAS1_E035"),
# fxn_pca_highlightMAG("TOPAZ_SPS1_E066"),
# fxn_pca_highlightMAG("TOPAZ_NAX1_E011"),
# ncol = 1, nrow = 4, labels = c("Phototroph predicted TOPAZ_MSS1_E011",
# "Heterotroph predicted TOPAZ_SAS1_E035",
# "Putative mixotroph TOPAZ_SPS1_E066",
# "Putative mixotroph TOPAZ_NAX1_E011"),
# label_size = 8)
# dev.off()
Subset genes that are more prominent in certain categories. While the intention is to isolate genes that may be indicative of a more phototrophic versus heterotrophic lifestyle, it is likely that there will be a gene suite that is more prominent in one of those categories. BUT as a whole, analysis of all the genes will be the most appropriate avenue for determining potential trophic strategies.
Biogeography of hypothesized MAST & Dictyochophyceae MAGs Following selection of specific MAGs that may represent phototrophic dictyochophyceae, mixotrophic dictyochophyceae, and MAST - look at the relative TPM expression of these core gene set (see above section for selection) across different ocean regions and depths.
Generate lollipop plot of metaT CPM derived from more phototrophy vs heterotrophy associated KEGG IDs. Map these across samples.
<- read.csv("../data/kegg_training_scores.csv")
trophy_contri # head(trophy_contri)
Taking input scores for vita-selected KEGG IDs, and working to distinguish if each ID contributes to phototrophy versus heterotrophy.
<- as.character(unique(kegg_vita$Selected.KOs))
vita_keggs_list <- trophy_contri %>%
trophy_contri_mod select(Predicted.trophic.mode, KEGG_ID_NUM, Ratio) %>%
pivot_wider(names_from = Predicted.trophic.mode, values_from = Ratio) %>%
mutate(VITA = case_when(
%in% vita_keggs_list ~ "vita"
KEGG_ID_NUM %>%
)) # Placement based on whichever has highest ratio
mutate(HIGHEST = case_when(
> Phototroph ~ "Heterotrophy",
Heterotroph TRUE ~ "Phototrophy"
),# Placement based on the ID having a >0.6 ratio
THRES_0.6 = case_when(
>= 0.6 ~ "Heterotrophy",
Heterotroph >= 0.6 ~ "Phototrophy",
Phototroph TRUE ~ "else"
),# Placement based on the ID having a >0.8 ratio
THRES_0.8 = case_when(
>= 0.8 ~ "Heterotrophy",
Heterotroph >= 0.8 ~ "Phototrophy",
Phototroph TRUE ~ "else"
))# head(trophy_contri_mod)
# table(trophy_contri_mod$HIGHEST)
# table(trophy_contri_mod$THRES_0.8)
# MetaT mapped to MAGs of interest, summed (CPM) to region, depth, and size fraction
<- metaT_mapped_trophy %>%
sum_metaT_by_region pivot_longer(cols = starts_with("ERR"), names_to = "run_accession", values_to = "transcript_tpm") %>%
left_join(metaT_reformat) %>%
group_by(MAG, KO, REGION, DEPTH, SIZEFRAC, PredictedTrophicMode, HeterotrophyScore) %>%
summarise(SUM = sum(transcript_tpm)) %>%
unite(MAG_troph, MAG, PredictedTrophicMode, sep = " ", remove = FALSE)
## Joining, by = "run_accession"
## `summarise()` has grouped output by 'MAG', 'KO', 'REGION', 'DEPTH', 'SIZEFRAC', 'PredictedTrophicMode'. You can override using the `.groups` argument.
Isolate specific MAGs to plot
<- c("TOPAZ_MSS1_E011","TOPAZ_SAS1_E035","TOPAZ_SPS1_E066","TOPAZ_NAX1_E011", "TOPAZ_IOS1_E045", "TOPAZ_SAS1_E003", "TOPAZ_NPS1_E025", "TOPAZ_MSS1_E023")
mags_to_highlight <- c("SRF", "DCM")
depths
<- sum_metaT_by_region %>%
subset_lolli_input type.convert(as.is = TRUE) %>%
filter(MAG %in% mags_to_highlight) %>%
filter(DEPTH %in% depths) %>%
unite(MAG_troph, MAG, PredictedTrophicMode, HeterotrophyScore, sep = " ", remove = FALSE) %>%
left_join(trophy_contri_mod, by = c("KO" = "KEGG_ID_NUM")) %>%
unite(SAMPLE_INFO, REGION, DEPTH, SIZEFRAC, sep = " ", remove = FALSE) %>%
group_by(SAMPLE_INFO, MAG) %>% #relative abundance is based on...
mutate(TOTAL_MAG_TPM = sum(SUM)) %>%
ungroup() %>%
group_by(MAG, PredictedTrophicMode, MAG_troph, SAMPLE_INFO, REGION, DEPTH, SIZEFRAC, HIGHEST) %>%
summarise(SUM_HIGHEST = sum(SUM),
REL_ABUN = SUM_HIGHEST/TOTAL_MAG_TPM) %>%
#Use Relative abundance
mutate(VALUE = case_when(
== "Phototrophy" ~ (-1)*REL_ABUN,
HIGHEST == "Heterotrophy" ~ REL_ABUN
HIGHEST %>%
)) select(TROPHY = HIGHEST, everything()) %>%
distinct()
## `summarise()` has grouped output by 'MAG', 'PredictedTrophicMode', 'MAG_troph', 'SAMPLE_INFO', 'REGION', 'DEPTH', 'SIZEFRAC', 'HIGHEST'. You can override using the `.groups` argument.
# Factorization
<- c("IO","MS","NAO","NPO","SAO","SO","SPO","RS")
region_order <- c("#711518", "#ce536b", "#c76b4a", "#dfa837", "#93b778", "#61ac86", "#657abb", "#67765b")
region_color <- c("0.8-5.00", "5-20.00", "20-180.00", "180-2000.00")
frac_order <- c("#f7fcb9", "#addd8e", "#41ab5d", "#004529")
frac_color <- c("DCM", "FSW", "SRF", "MIX", "MES", "ZZZ")
depth_order <- c("#74a9cf", "#969696", "#d0d1e6", "#0570b0", "#023858", "#252525")
depth_color
$REGION_ORDER <- factor(subset_lolli_input$REGION, levels = region_order)
subset_lolli_inputnames(region_color) <- region_order
$DEPTH_ORDER <- factor(subset_lolli_input$DEPTH, levels = depth_order)
subset_lolli_inputnames(depth_color) <- depth_order
$SIZEFRAC_ORDER <- factor(subset_lolli_input$SIZEFRAC, levels = frac_order)
subset_lolli_inputnames(frac_color) <- frac_order
# "TOPAZ_MSS1_E011","TOPAZ_SAS1_E035","TOPAZ_SPS1_E066","TOPAZ_NAX1_E011"
<- subset_lolli_input %>%
lolli_metaT filter(PredictedTrophicMode == "Phototroph") %>%
# filter(PredictedTrophicMode != "Phototroph") %>%
ggplot(aes(x = SAMPLE_INFO, y = VALUE, fill = TROPHY)) +
geom_hline(yintercept = 0) +
geom_segment(aes(x = SAMPLE_INFO, xend = SAMPLE_INFO,
y = 0, yend = VALUE, color = TROPHY),
lineend = "butt", size = 1) +
geom_point(size = 2, shape = 19, aes(color = TROPHY)) +
scale_color_manual(values = c("#CC3D6D", "#B4CC3D")) +
theme_bw() +
# scale_y_continuous(limits = c(-0.2,0.2)) +
facet_grid(SIZEFRAC_ORDER + DEPTH_ORDER + REGION_ORDER ~ MAG_troph, space = "free", scales = "free_y") +
theme(axis.line.y = element_line(color = "black"),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
panel.spacing.x = unit(0, "lines"),panel.spacing.y = unit(0, "lines"),
panel.border = element_blank(),
panel.grid = element_blank(),
strip.background.x = element_blank(),
strip.text.y = element_text(size = 10),
legend.position = "none") +
coord_flip() +
labs(x = "", y ="")
# lolli_metaT
Plot with color bar (*note there is a bug in R that does not allow me to remove the strip text bar). Repeat below for phototrophic and heterotrophic subset of MAGs.
# svg("heterotrophs-lolli-29052021.svg", h=12, w=18)
# svg("phototrophs-lolli-29052021.svg", h=12, w=18)
plot_grid(lolli_metaT,
ggplot(subset_lolli_input, aes(x = SAMPLE_INFO, fill = SIZEFRAC_ORDER, y = 1)) +
geom_tile() +
scale_fill_manual(values = frac_color) +
theme_bw() +
facet_grid(SIZEFRAC_ORDER + DEPTH_ORDER + REGION_ORDER ~ ., space = "free", scales = "free_y") +
theme(axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
panel.spacing.x = unit(0, "lines"),panel.spacing.y = unit(0, "lines"),
panel.border = element_blank(),
panel.grid = element_blank(),
strip.background.x = element_blank(),
# strip.text = element_blank(),
legend.position = "none") +
coord_flip() +
labs(x = "", y =""),
##
ggplot(subset_lolli_input, aes(x = SAMPLE_INFO, fill = DEPTH_ORDER, y = 0.1)) +
geom_tile() +
scale_fill_manual(values = depth_color) +
theme_bw() +
facet_grid(SIZEFRAC_ORDER + DEPTH_ORDER + REGION_ORDER ~ ., space = "free", scales = "free_y") +
theme(axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
panel.spacing.x = unit(0, "lines"),panel.spacing.y = unit(0, "lines"),
panel.border = element_blank(),
panel.grid = element_blank(),
strip.background.x = element_blank(),
# strip.text = element_blank(),
legend.position = "none") +
coord_flip() +
labs(x = "", y =""),
##
ggplot(subset_lolli_input, aes(x = SAMPLE_INFO, fill = REGION_ORDER, y = 0.1)) +
geom_tile() +
scale_fill_manual(values = region_color) +
theme_bw() +
facet_grid(SIZEFRAC_ORDER + DEPTH_ORDER + REGION_ORDER ~ ., space = "free", scales = "free_y") +
theme(axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
panel.spacing.x = unit(0, "lines"),panel.spacing.y = unit(0, "lines"),
panel.border = element_blank(),
panel.grid = element_blank(),
strip.background.x = element_blank(),
# strip.text = element_blank(),
legend.position = "none") +
coord_flip() +
labs(x = "", y =""),
align = "h", axis = "t", nrow = 1,
rel_widths = c(1,0.1, 0.1, 0.1))
## Warning: Removed 224 rows containing missing values (geom_segment).
## Warning: Removed 224 rows containing missing values (geom_point).
# dev.off()
# ?plot_grid()
Try x-y scatter plot on this
# head(subset_lolli_input)
# subset_lolli_input %>%
# filter(PredictedTrophicMode == "Phototroph") %>%
# # filter(PredictedTrophicMode != "Phototroph") %>%
# filter(!is.na(TROPHY)) %>%
# select(-SUM_HIGHEST, -VALUE) %>%
# pivot_wider(names_from = TROPHY, values_from = REL_ABUN) %>%
# ggplot(aes(x = Heterotrophy, y = Phototrophy, fill = MAG_troph)) +
# geom_point(shape = 21, size = 2, color = "black") +
# theme_linedraw() +
# # scale_y_continuous(limits = c(0,0.2)) +
# # scale_x_continuous(limits = c(0,0.2))
# scale_y_log10() + scale_x_log10()
# head(sum_metaT_by_region)
# head(trophy_contri_mod)
# head(sum_metaT_by_region)
# df_pheat_subset <- sum_metaT_by_region %>%
# type.convert(as.is = TRUE) %>%
# unite(MAG_troph, MAG, PredictedTrophicMode, HeterotrophyScore, sep = " ", remove = FALSE) %>%
# left_join(trophy_contri_mod, by = c("KO" = "KEGG_ID_NUM")) %>%
# filter(VITA == "vita") %>%
# unite(SAMPLE_INFO, REGION, DEPTH, SIZEFRAC, sep = " ", remove = FALSE) %>%
# group_by(MAG, PredictedTrophicMode, MAG_troph, SAMPLE_INFO, REGION, DEPTH, SIZEFRAC, HIGHEST) %>%
# summarise(SUM_HIGHEST = sum(SUM)) %>%
# distinct()
# head(df_pheat_subset)
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS/LAPACK: /Users/sarahhu/anaconda3/envs/r_3.6.0/lib/R/lib/libRblas.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gghighlight_0.3.1 ggdendro_0.1-20 rworldmap_1.3-6
## [4] sp_1.4-2 pheatmap_1.0.12 cowplot_1.0.0
## [7] ggupset_0.3.0 directlabels_2021.1.13 ggtext_0.1.0
## [10] broom_0.7.5 forcats_0.5.0 stringr_1.4.0
## [13] dplyr_1.0.5 purrr_0.3.4 readr_1.3.1
## [16] tidyr_1.1.3 tibble_3.1.0 ggplot2_3.3.1
## [19] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.2 maps_3.3.0 jsonlite_1.6.1
## [4] dotCall64_1.0-0 modelr_0.1.8 assertthat_0.2.1
## [7] highr_0.8 tensorA_0.36.1 blob_1.2.1
## [10] cellranger_1.1.0 robustbase_0.93-6 yaml_2.2.1
## [13] pillar_1.5.1 backports_1.2.1 lattice_0.20-41
## [16] glue_1.4.2 quadprog_1.5-8 digest_0.6.27
## [19] RColorBrewer_1.1-2 gridtext_0.1.3 rvest_0.3.5
## [22] colorspace_2.0-0 htmltools_0.5.1.1 pkgconfig_2.0.3
## [25] haven_2.3.1 scales_1.1.1 generics_0.1.0
## [28] farver_2.1.0 ellipsis_0.3.1 withr_2.4.1
## [31] cli_2.4.0 magrittr_2.0.1 crayon_1.4.1
## [34] readxl_1.3.1 maptools_1.0-1 evaluate_0.14
## [37] fs_1.4.1 fansi_0.4.2 MASS_7.3-51.6
## [40] compositions_1.40-5 xml2_1.3.2 foreign_0.8-76
## [43] tools_3.6.1 hms_0.5.3 lifecycle_1.0.0
## [46] munsell_0.5.0 reprex_0.3.0 compiler_3.6.1
## [49] rlang_0.4.10 debugme_1.1.0 grid_3.6.1
## [52] rstudioapi_0.11 spam_2.6-0 labeling_0.4.2
## [55] rmarkdown_2.6 gtable_0.3.0 DBI_1.1.0
## [58] R6_2.5.0 bayesm_3.1-4 lubridate_1.7.8
## [61] knitr_1.31 utf8_1.2.1 stringi_1.5.3
## [64] Rcpp_1.0.5 fields_11.6 vctrs_0.3.7
## [67] DEoptimR_1.0-8 dbplyr_1.4.4 tidyselect_1.1.0
## [70] xfun_0.20