“Publication Figure 5”
百度云盘链接: https://pan.baidu.com/s/15g7caZp354zIWktpnWzWhQ
提取码: 4sh7
Libraries
Standard Import
library(tidyverse)
library(cowplot)
library(scales)
library(ggpubr)
Special
library(caret)
library(randomForest)
library(vip)
library(ggrepel)
library(grid)
library(tidytext)
library(tidymodels)
library(plotROC)
Paths
bin_dir = '../bin/'
data_dir = '../data/'
results_dir = '../results/'
Custom Scripts / Theme
source(paste0(bin_dir, 'theme_settings.R'))
source (paste0(bin_dir, 'utilities.R'))
source(paste0(bin_dir, 'species_competition_functions.R'))
source(paste0(bin_dir, 'distmat_utils.R'))
source(paste0(bin_dir, 'analysis_metadata.R'))
date <- format(Sys.time(), "%d%m%y")
Import Tables
Metadata
samples.metadata <- read_tsv(paste0(data_dir, 'samples.metadata.tsv'))
microbe.taxonomy <- read_tsv(paste0(data_dir, 'microbe.taxonomy.tsv'))
microbe.metadata <- microbe.taxonomy %>%
right_join(., read_tsv(paste0(data_dir, 'microbe.metadata.tsv'))) %>%
mutate(gram.bool = ifelse(gram_stain == 'positive', T,
ifelse(gram_stain == 'negative', F, NA)),
spores.bool = ifelse(spore_forming == 'spore-forming', T,
ifelse(spore_forming == 'non-spore-forming', F, NA)))
Taxonomy
MetaPhlAn2 tables
combine with taxonomy
mp_species.long <- microbe.taxonomy %>%
right_join(., read_tsv(paste0(data_dir, 'samples.mp_profiles.species.tsv')),
by = 'species') %>%
left_join(., samples.metadata, by = 'Name')
rCDI subset
annotate with metadata
mp_species.rcdi.long <-
mp_species.long %>%
filter(Study %in% c(rcdi.studies))
Case Summary
SameStr Case-Summary table
sstr_cases <- read_tsv(paste0(data_dir, 'samples.case_summary.tsv')) %>%
left_join(., microbe.taxonomy, by = 'species')
Set Figure
fig = paste0('Fig_5.')
Strain-level Source (rCDI)
Strain-level plot of per-Case post-FMT rel. abund. by source
Format Data
sstr_cases.rcdi.metrics <-
sstr_cases %>%
filter(Study %in% c('ALM', 'FRICKE')) %>%
filter(Case_Name %in% cases.full) %>%
tag_species_competition(.) %>%
mutate(n = 1) %>%
mutate(source = ifelse(grepl(species, pattern = 'unclassified'), 'Unclassified', source)) %>%
mutate(source = case_when(
analysis_level == 'species' & source == 'self' ~ 'Self Sp.',
analysis_level == 'species' & source == 'donor' ~ 'Donor Sp.',
analysis_level == 'species' & source == 'unique' ~ 'Unique Sp.',
T ~ source
)) %>%
group_by(Study_Type, Case_Name, source, Days_Since_FMT.post, fmt_success.label) %>%
summarize(n = sum(n, na.rm = T),
rel_abund = sum(rel_abund.post, na.rm = T) / 100) %>%
group_by(Case_Name, Days_Since_FMT.post) %>%
mutate(f = n / sum(n, na.rm = T)) %>%
ungroup() %>%
group_by(Case_Name) %>%
mutate(source = case_when(source == 'Unclassified' ~ 'Unclassified Sp.',
source == 'same_species' ~ 'Same Sp.',
source == 'unique' ~ 'Unique to Time Point',
source == 'self' ~ 'Recipient / Initial Sample',
source == 'donor' ~ 'Donor',
source == 'both' ~ 'Coexistence',
T ~ source)) %>%
pivot_wider(names_from = 'source',
values_from = c('rel_abund','f', 'n'), names_sep = '___') %>%
mutate_at(.vars = vars(contains('___')),
.funs = funs(replace_na(., 0))) %>%
pivot_longer(cols = contains("___"),
names_to = c("metric", "source"),
names_sep = '___', values_drop_na = F)
Bar Chart, per Case
pseudo = 0.5
bb = c(pseudo, 2, 7, 14, 35, 84, 365)
strain_order <- c('Donor', 'Donor Sp.', 'Coexistence', 'Recipient / Initial Sample', 'Self Sp.', 'Unique to Time Point', 'Unique Sp.', 'Same Sp.')
strain_colors <- c(colors.discrete[c(6,1,4, 8,3, 7,2, 5,10)])
use_vars = list(strain_order, strain_colors)
plot.rcdi <-
sstr_cases.rcdi.metrics %>%
filter(metric == 'rel_abund') %>%
ungroup() %>%
mutate(ordering = -as.numeric(as.factor(Case_Name)) + (Days_Since_FMT.post/1000)) %>%
ggplot(aes(
fct_reorder(paste0('D', Days_Since_FMT.post, ' | ', str_remove(Case_Name, pattern = 'Case_')), ordering),
y = value,
fill = fct_relevel(source, use_vars[[1]]))) +
geom_bar(stat = 'identity', position = position_fill(), width = 1, col = 'black') +
theme_cowplot() +
scale_y_continuous(labels = percent_format(),
breaks = c(1, .75, .5, .25, 0),
expand = c(0,0)) +
scale_fill_manual(values = strain_colors,
guide = guide_legend(reverse = TRUE)) +
labs(y = 'Rel. Abund.') +
theme(
axis.title.x = element_blank(),
axis.line.y = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10),
legend.title=element_blank(),
legend.position = 'top')
guides(fill=guide_legend(nrow = 2))
plot.rcdi
Exporting plots
output_name = 'PostSource.Cases.rCDI.Strain'
ggsave(plot.rcdi + theme(legend.position = 'none'),
device = 'pdf', dpi = 300, width = 9, height = 3.5,
filename = paste0(results_dir, fig, output_name, '.Plot.pdf'))
Area Chart, Across Cases
rCDI - Area chart showing post-FMT rel. abund. of sets of co-occurring strains in recipients, donors, and post-FMT patients based on glm 2nd binomial model across successful rCDI cases.
We only have sparse data and single cases for later time points, limiting to <= 84 days
bb = c(2, 7, 14, 35, 84, 365)
strain_order <- c('Donor', 'Donor Sp.', 'Coexistence', 'Recipient / Initial Sample', 'Self Sp.', 'Unique to Time Point', 'Unique Sp.', 'Same Sp.')
strain_colors <- c(colors.discrete[c(6,1,4, 8,3, 7,2, 5,10)])
use_vars = list(strain_order, strain_colors)
plot <-
sstr_cases.rcdi.metrics %>%
filter(fmt_success.label == 'Resolved', Days_Since_FMT.post <= 84) %>%
filter(metric != 'n') %>%
ggplot(aes(x = Days_Since_FMT.post,
y = value,
fill = fct_relevel(source, use_vars[[1]]))
) +
stat_smooth(geom = 'area',
method = 'glm', colour = 'black', size = 0.25,
position = 'fill',
# formula = y ~ poly(x, 2),
method.args=list(family='binomial')
) +
facet_grid(scales = 'free_x', space = 'free_x',
cols = vars(''),
rows = vars(ifelse(grepl(metric, pattern = '^f'),
'Spp. Fraction',
'Rel. Abund.'))) +
theme_cowplot() +
scale_y_continuous(labels = percent_format(),
breaks = c(1, .75, .5, .25, 0),
expand = c(0,0),
) +
scale_x_continuous(trans = pseudo_log_trans(),
breaks = bb,
expand = c(0,0)) +
scale_fill_manual(values = use_vars[[2]],
guide = guide_legend(reverse = TRUE)) +
labs(x = 'Days Since FMT\n') +
theme(axis.title.y = element_blank(),
plot.margin = unit(c(0, 0, 0, 0), "cm"),
strip.background = element_blank(),
strip.text = element_text(size = 14),
panel.spacing.y = unit(.5, "cm"),
# panel.spacing.x = unit(.35, "cm"),
legend.title=element_blank(),
legend.position = 'top',
legend.margin=ggplot2::margin(l = 1.5, unit='cm')) +
guides(fill=guide_legend(ncol = 2))
plot + theme(legend.position = 'none')
Histogram of available samples for modelling
plot.histogram <-
sstr_cases.rcdi.metrics %>%
filter(fmt_success.label == 'Resolved', Days_Since_FMT.post <= 84) %>%
filter(metric == 'f') %>%
group_by(Study_Type, Case_Name, Days_Since_FMT.post) %>%
summarize(value = sum(value, na.rm = T)) %>%
ggplot(aes(Days_Since_FMT.post + pseudo)) +
geom_histogram(fill = 'black', bins = 10) +
facet_grid(scales = 'free_x', space = 'free_x',
cols = vars(fct_relevel(Study_Type, 'rCDI', 'Control')),
rows = vars('n')) +
scale_x_continuous(trans = pseudo_log_trans(0.5, 10),
breaks = c(2, 7, 14, 35, 84, 365),
expand = c(0,0)) +
scale_y_continuous(#trans = 'log10',
breaks = c(1, 5, 10, 100),
expand = c(0,0),
) +
theme_cowplot() +
theme(strip.background = element_blank(),
strip.text.x = element_blank(),
axis.title = element_blank(),
axis.line.x = element_line(size = 1),
axis.line.y = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
legend.title=element_blank())
plot.histogram
legend = cowplot::get_legend(plot)
grid.newpage()
grid.draw(legend)
Exporting plot
output_name = 'PostSource.AcrossCases.rCDI.Strain'
ggsave(plot + theme(legend.position = 'none'),
device = 'pdf', dpi = 300, width = 3, height = 5,
filename = paste0(results_dir, fig, output_name, '.Plot.pdf'))
ggsave(plot.histogram + theme(legend.position = 'none'),
device = 'pdf', dpi = 300, width = 3, height = .6,
filename = paste0(results_dir, fig, output_name, '.Histogram','.Plot.pdf'))
ggsave(legend,
device = 'pdf', dpi = 300, width = 5, height = 2,
filename = paste0(results_dir, fig, output_name, '.Legend.pdf'))
Persistence / Engraftment within time periods:
sstr_cases.rcdi.metrics %>%
filter(fmt_success.label == 'Resolved', metric == 'rel_abund') %>%
filter(Days_Since_FMT.post > 0, Days_Since_FMT.post <= 7) %>%
filter(source %in% c('Donor', 'Recipient / Initial Sample')) %>%
group_by(source) %>%
summarize(mean = mean(value) * 100,
sd = sd(value) * 100,
.groups = 'drop')
sstr_cases.rcdi.metrics %>%
filter(fmt_success.label == 'Resolved', metric == 'rel_abund') %>%
filter(Days_Since_FMT.post > 70, Days_Since_FMT.post <= 84) %>%
filter(source %in% c('Donor', 'Recipient / Initial Sample')) %>%
group_by(source) %>%
summarize(mean = mean(value) * 100,
sd = sd(value) * 100,
.groups = 'drop')
Donor -> 42.5 ± 30.3 to 26.5 ± 21.9
Recipient -> 18.9 ± 22.3 to 4.9 ± 9.0
Grouped by Case
sstr_cases.rcdi.metrics.format <-
sstr_cases.rcdi.metrics %>%
mutate(fmt_success.tag = ifelse(fmt_success.label == 'Resolved', 'S', 'F')) %>%
filter(metric == 'rel_abund') %>%
mutate(ALM = convert_to_alm_cases(Case_Name)) %>%
mutate(Study = str_split_fixed(Case_Name, pattern = '_', 2)[,1]) %>%
mutate(Case_Number = ifelse(Study == 'ALM', ALM, str_split_fixed(Case_Name, pattern = 'Case_', 2)[,2]))
sstr_cases.rcdi.metrics.format %>%
ggplot(aes(x = fct_reorder(paste0(Days_Since_FMT.post), Days_Since_FMT.post, .desc = F),
y = value,
fill = fct_relevel(source, use_vars[[1]]))
) +
geom_bar(stat = 'identity', show.legend = F, width = 1, col = 'black', position = position_fill()) +
scale_x_discrete(expand = c(0,0)) +
scale_y_continuous(labels = percent_format_signif, expand = c(0,0)) +
scale_fill_manual(values = strain_colors) +
theme_cowplot() +
theme(axis.title.y = element_blank(),
axis.title.x = element_blank(),
legend.title = element_blank()) +
facet_grid(cols = vars(fmt_success.label, Study, Case_Number),
scales = 'free', space = 'free')
sstr_cases.rcdi.metrics.out <-
sstr_cases.rcdi.metrics.format %>%
mutate(source = ifelse(source == 'Recipient / Initial Sample', 'Self',
str_split_fixed(source, pattern = ' ', 2)[,1])) %>%
dplyr::select(-metric, -ALM, -Study_Type, -fmt_success.tag) %>%
ungroup() %>%
pivot_wider(names_from = 'source', values_from = 'value', values_fn = list(value = sum)) %>%
mutate(Total = Donor + Same + Self + Unclassified + Unique + Coexistence,
DonorRecipientRatio = log10((Donor + 1e-6) / (Self + 1e-6)))
write_tsv(sstr_cases.rcdi.metrics.out, paste0(results_dir, fig, 'FMT_Events.tsv'))
Shared Strains per Taxa
Format
Summarize per genus
sstr_cases.study <-
sstr_cases %>%
filter(Days_Since_FMT.post <= 373) %>% # same time frame rCDI / control
filter(Study_Type == 'Control' | fmt_success) %>%
filter(kingdom == 'Bacteria', !grepl(species, pattern = 'unclassified')) %>%
mutate(source = case_when(
analysis_level == 'species' & source == 'same_species' & Study_Type == 'Control' ~ 'self',
analysis_level == 'species' & source == 'self' ~ 'Recipient / Initial Sample Sp.',
analysis_level == 'species' & source == 'donor' ~ 'Donor Sp.',
analysis_level == 'species' & source == 'unique' ~ 'Unique to Time Point Sp.',
T ~ source
)) %>%
mutate(source = case_when(source == 'Unclassified' ~ 'Unclassified Sp.',
source == 'same_species' ~ 'Same Sp.',
source == 'unique' ~ 'Unique to Time Point',
source == 'self' ~ 'Recipient / Initial Sample',
source == 'donor' ~ 'Donor',
source == 'both' ~ 'Coexistence',
T ~ source)) %>%
filter(!source %in% c('Same Sp.') | Study_Type == 'Control') %>%
mutate(genus = case_when(
genus == 'Clostridiaceae_noname' ~ 'Clostridiales_noname',
T ~ genus))
sstr_cases.species <-
sstr_cases.study %>%
mutate(n = 1) %>%
group_by(Study_Type, source, species) %>%
summarize(n = sum(n, na.rm = T), .groups = 'drop') %>%
pivot_wider(names_from = 'Study_Type', values_from = 'n') %>%
mutate_at(.vars = vars(rCDI, Control),
.funs = funs(replace_na(., 0))) %>%
mutate(source.simple = gsub(source, pattern = ' Sp.', replacement = ''))
sstr_cases.genus <-
sstr_cases.study %>%
mutate(n = 1) %>%
group_by(Study_Type, source, genus) %>%
summarize(n = sum(n, na.rm = T), .groups = 'drop') %>%
pivot_wider(names_from = 'Study_Type', values_from = 'n') %>%
mutate_at(.vars = vars(rCDI, Control),
.funs = funs(replace_na(., 0))) %>%
mutate(source.simple = gsub(source, pattern = ' Sp.', replacement = ''))
Feature rates
Fraction of Taxa from shown post-FMT samples in which species have a feature
genus_feature_rates <-
mp_species.rcdi.long %>%
filter(kingdom == 'Bacteria', !grepl(species, pattern = 'unclassified')) %>%
filter(Sample_Type %in% c('post'), fmt_success) %>%
left_join(., microbe.metadata) %>%
group_by(genus) %>%
summarize_at(.vars = vars(habit.oral, oxygen.tolerant, gram.bool, spores.bool),
.funs = funs(replace_na(
sum(. == T, na.rm = T) / sum(!is.na(.)),
0)))
species_feature_rates <-
mp_species.rcdi.long %>%
filter(kingdom == 'Bacteria', !grepl(species, pattern = 'unclassified')) %>%
filter(Sample_Type %in% c('post')) %>%
left_join(., microbe.metadata) %>%
group_by(species) %>%
summarize_at(.vars = vars(habit.oral, oxygen.tolerant, gram.bool, spores.bool),
.funs = funs(replace_na(
sum(. == T, na.rm = T) / sum(!is.na(.)),
0)))
Taxa importance
Measured by observed events in each genus
genus_importance <-
sstr_cases.genus %>%
group_by(genus) %>%
summarize_at(.vars = vars(rCDI, Control),
.funs = funs(sum(., na.rm = T))) %>%
ungroup() %>%
mutate(total = rCDI + Control) %>%
filter(rCDI >= 10, Control >= 5) %>%
arrange(desc(rCDI))
species_importance <-
sstr_cases.species %>%
group_by(species) %>%
summarize_at(.vars = vars(rCDI, Control),
.funs = funs(sum(., na.rm = T))) %>%
ungroup() %>%
mutate(total = rCDI + Control) %>%
filter(rCDI >= 10, Control >= 5) %>%
arrange(desc(rCDI))
Transmission Rates
Control persistence and rCDI Engraftment rates for genera and species
pseudo = 1e-10
genus_transmission_rates <-
sstr_cases.genus %>%
filter(genus %in% genus_importance$genus) %>%
pivot_longer(names_to = 'Study_Type', values_to = 'n', cols = c('Control','rCDI')) %>%
# summarize engraftment (strain + species)
group_by(Study_Type, source.simple, genus) %>%
summarize(n = sum(n, na.rm = T)) %>%
group_by(Study_Type, genus) %>%
mutate(f = n / sum(n, na.rm = T)) %>%
# filter control-persistence and rcdi-engraftment
mutate(label = case_when(
source.simple == 'Recipient / Initial Sample' & Study_Type == 'Control' ~ 'Control.Persistence',
source.simple == 'Recipient / Initial Sample' & Study_Type == 'rCDI' ~ 'rCDI.Persistence',
source.simple == 'Donor' & Study_Type == 'rCDI' ~ 'rCDI.Engraftment',
source.simple == 'Unique to Time Point' & Study_Type == 'Control' ~ 'Control.New')) %>%
filter(!is.na(label)) %>%
ungroup() %>%
# pivot
dplyr::select(genus, f, n, label) %>%
pivot_wider(names_from = 'label',
values_from = c('n','f'),
values_fill = list(n = 0, f = 0),
names_sep = '.') %>%
# calculate percent ranks (pseudo), rescale to 0-1
mutate(Control.Persistence.rank = rescale(dense_rank(f.Control.Persistence), to = c(pseudo, 1 - pseudo)),
rCDI.Engraftment.rank = rescale(dense_rank(f.rCDI.Engraftment), to = c(pseudo, 1 - pseudo)),
rCDI.Persistence.rank = rescale(dense_rank(f.rCDI.Persistence), to = c(pseudo, 1 - pseudo)),
Control.New.rank = rescale(dense_rank(f.Control.New), to = c(pseudo, 1 - pseudo)))
species_transmission_rates <-
sstr_cases.species %>%
filter(species %in% species_importance$species) %>%
pivot_longer(names_to = 'Study_Type', values_to = 'n', cols = c('Control','rCDI')) %>%
# summarize engraftment (strain + species)
group_by(Study_Type, source.simple, species) %>%
summarize(n = sum(n, na.rm = T)) %>%
group_by(Study_Type, species) %>%
mutate(f = n / sum(n, na.rm = T)) %>%
# filter only control-persistence and rcdi-engraftment
mutate(label = case_when(
source.simple == 'Recipient / Initial Sample' & Study_Type == 'Control' ~ 'Control.Persistence',
source.simple == 'Recipient / Initial Sample' & Study_Type == 'rCDI' ~ 'rCDI.Persistence',
source.simple == 'Donor' & Study_Type == 'rCDI' ~ 'rCDI.Engraftment')) %>%
filter(!is.na(label)) %>%
ungroup() %>%
# pivot
dplyr::select(species, f, n, label) %>%
pivot_wider(names_from = 'label', values_from = c('n','f'), names_sep = '.') %>%
mutate_at(.vars = vars(everything(), -species),
.funs = funs(replace_na(., 0))) %>%
# calculate percent ranks (pseudo), rescale to 0-1
mutate(Control.Persistence.rank = rescale(dense_rank(f.Control.Persistence), to = c(pseudo, 1 - pseudo)),
rCDI.Engraftment.rank = rescale(dense_rank(f.rCDI.Engraftment), to = c(pseudo, 1 - pseudo)))
output_name = 'EngraftmentPersistence.Species'
species_transmission_rates %>%
rename_at(.vars = vars(starts_with('n.')),
.funs = funs(paste0(gsub(., pattern = '^n.', replacement = ''), '.count'))) %>%
rename_at(.vars = vars(starts_with('f.')),
.funs = funs(paste0(gsub(., pattern = '^f.', replacement = ''), '.fraction'))) %>%
dplyr::select(species, starts_with('Control'), starts_with('rCDI')) %>%
write_tsv(., paste0(results_dir, fig, output_name, '.Table.tsv'))
output_name = 'EngraftmentPersistence.Genus'
genus_transmission_rates %>%
rename_at(.vars = vars(starts_with('n.')),
.funs = funs(paste0(gsub(., pattern = '^n.', replacement = ''), '.count'))) %>%
rename_at(.vars = vars(starts_with('f.')),
.funs = funs(paste0(gsub(., pattern = '^f.', replacement = ''), '.fraction'))) %>%
dplyr::select(genus, starts_with('Control'), starts_with('rCDI')) %>%
write_tsv(., paste0(results_dir, fig, output_name, '.Table.tsv'))
Per Genus
Format
sstr_cases.genus.fraction <-
sstr_cases.genus %>%
## filter genera
filter(genus %in% genus_importance$genus) %>%
pivot_longer(names_to = 'Study_Type', values_to = 'n', cols = c('Control','rCDI')) %>%
## calculate fraction of events
group_by(Study_Type, genus, source.simple) %>%
mutate(source.n = sum(n, na.rm = T)) %>%
group_by(Study_Type, genus) %>%
mutate(total = sum(n, na.rm = T),
f = n / total,
source.f = source.n / total) %>%
## get genus-level donor-derived for ordering
group_by(genus) %>%
mutate(donor_derived.genus = max(ifelse(Study_Type == 'rCDI' &
source.simple == 'Donor',
source.f, 0), na.rm = T)) %>%
ungroup()
sstr_cases.genus.fraction.annotated <-
sstr_cases.genus.fraction %>%
# add genus metrics for present species
left_join(., genus_feature_rates) %>%
left_join(., genus_transmission_rates) %>%
pivot_longer(names_to = 'feature', values_to = 'value',
cols = c(gram.bool, habit.oral,
oxygen.tolerant, spores.bool,
Control.Persistence.rank, rCDI.Engraftment.rank,
)) %>%
mutate(value_cut = Hmisc::cut2(round(value, 2), cuts = seq(0, 1, 0.2), minmax = T)) %>%
mutate(facet_title = 'Sp. Features')
Bar Plots
Bar plots of per-taxa transmission rates for Control and rCDI data
strain_colors <- c(6, 1, 4, 8, 3, 7, 2)
species_colors <- c(4, 1, 3, 2)
strain_colors_reduced <- c(9, 6, 8, 7, 7 + 5)
p1 <-
sstr_cases.genus.fraction %>%
mutate(source = fct_relevel(source,
'Donor', 'Donor Sp.',
'Coexistence',
'Recipient / Initial Sample','Recipient / Initial Sample Sp.',
'Unique to Time Point','Unique to Time Point Sp.',
)) %>%
mutate(genus = gsub(genus, pattern = '_noname', replacement = '')) %>%
ggplot(aes(x = fct_reorder(genus, donor_derived.genus, .desc = F), y = f, fill = source)) +
geom_bar(stat = 'identity', colour = 'black', size = 0.05) +
coord_flip() +
guides(fill = guide_legend(ncol = 1)) +
facet_grid(cols = vars(Study_Type),
# rows = vars(control_cluster, rcdi_cluster),
space = 'free_y',
scales = 'free') +
scale_fill_manual(values = colors.discrete[strain_colors]) +
tidytext::scale_x_reordered() +
theme_cowplot(9) +
scale_x_discrete(expand = c(0,0)) +
scale_y_continuous(expand =c(0,0),
labels = scales::percent_format(), breaks = c(0, 0.5, 1)) +
theme(strip.background = element_blank(),
strip.text = element_text(size = 9),
axis.title = element_blank(),
axis.text.x = element_text(hjust = 0.5),
legend.title=element_blank(),
legend.position = 'none',
panel.spacing.x = unit(0.25, "cm")
# plot.margin = unit(c(0, 0, .9, 0), "cm")
)
p1
Metadata for barplots showing the fraction of species that were found in post-FMT samples within each genus which hold a feature compared to the ones that do not hold a feature. Color coded from white to black (has feature) in 6 steps.
p2 <-
sstr_cases.genus.fraction.annotated %>%
mutate(genus = gsub(genus, pattern = '_noname', replacement = '')) %>%
mutate(feature = case_when(
feature == 'gram.bool' ~ 'Gram (+)',
feature == 'habit.oral' ~ 'Oral Spp.',
feature == 'pathogen.human' ~ 'Pathogen',
feature == 'oxygen.tolerant' ~ 'Oxy. Tolerant',
feature == 'spores.bool' ~ 'Spore Forming',
feature == 'multistrain' ~ 'Multiple Strains',
T ~ feature)) %>%
filter(feature %in% c('Gram (+)','Oral Spp.','Oxy. Tolerant','Spore Forming')) %>%
ggplot(aes(y = fct_reorder(genus, donor_derived.genus),
x = fct_relevel(feature, 'Gram (+)', 'Spore Forming', 'Oral Spp.', 'Oxy. Tolerant',
), fill = value_cut)) +
geom_tile() +
coord_equal() +
facet_grid(cols = vars('')) +
scale_y_discrete(position = 'right', expand = c(0,0)) +
scale_fill_manual(values = c('white', rev(gray.colors(4)), 'black')) +
theme_cowplot(9) +
theme(strip.background = element_blank(),
strip.text = element_text(size = 10),
axis.text.y = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), #
axis.title = element_blank(),
axis.ticks.y = element_blank(),
legend.position = 'none',
axis.line.y = element_blank())
p2
output_name = 'PostSource.Taxa.Genus'
ggsave(p1, device = 'pdf',
filename = paste0(results_dir, fig, output_name, '.Data.pdf'),
dpi = 300, width = 3.5, height = 6)
ggsave(p2, device = 'pdf',
filename = paste0(results_dir, fig, output_name, '.Features.pdf'),
dpi = 300, width = 3, height = 6)
Engraftment/Persistence
Engrafted genera in rCDI cases are correlated with persisted taxa in control cases
mp_species.rcdi.post.long <-
mp_species.rcdi.long %>%
filter(kingdom == 'Bacteria', !grepl(species, pattern = 'unclassified')) %>%
filter(Sample_Type %in% c('post'), fmt_success) %>%
group_by(kingdom, phylum, class, order, family, genus, species,
Study, Unique_ID, Days_Since_FMT, Case_Name, Donor.Name, Donor.Unique_ID, Donor.Subject) %>%
summarize(rel_abund = sum(rel_abund/100, na.rm = T), .groups = 'drop')
Annotate transmission rates with mean post-FMT rel. abund.
genus_transmission_rates.annotated <-
mp_species.rcdi.long %>%
filter(kingdom == 'Bacteria', !grepl(species, pattern = 'unclassified')) %>%
filter(Sample_Type %in% c('post'), fmt_success) %>%
group_by(kingdom, phylum, class, order, family, genus,
Study, Unique_ID, Days_Since_FMT, Case_Name, Donor.Name, Donor.Unique_ID, Donor.Subject) %>%
summarize(rel_abund = sum(rel_abund/100, na.rm = T), .groups = 'drop') %>%
group_by(kingdom, phylum, class, order, family, genus) %>%
summarize(rel_abund.mean = mean(rel_abund, na.rm = T), .groups = 'drop') %>%
right_join(., genus_transmission_rates)
# correlation metrics
dd <- genus_transmission_rates.annotated %>%
filter(!is.na(rel_abund.mean))
dd.cor <- tidy(cor.test(dd$rCDI.Engraftment.rank, dd$Control.Persistence.rank, exact = F, method = 'spearman'))
dd.cor.est <- round(dd.cor$estimate, 2)
dd.cor.sig <- cut(dd.cor$p.value, breaks = c(-Inf, 0.001, 0.01, 0.05, Inf),
labels = c("***", "**", "*", ""))
plot <-
genus_transmission_rates.annotated %>%
filter(!is.na(rel_abund.mean)) %>%
ggplot(aes(rCDI.Engraftment.rank, Control.Persistence.rank, fill = rel_abund.mean)) +
geom_point(size = 4, shape = 21) +
ggrepel::geom_text_repel(aes(label = genus), nudge_x = 0.05,nudge_y = 0.05,
data = genus_transmission_rates.annotated %>%
filter((rCDI.Engraftment.rank > 0.01 & n.rCDI.Engraftment > 10 |
Control.Persistence.rank > 0.01 & n.Control.Persistence > 10) & rel_abund.mean > 0.05)
) +
coord_fixed() +
scale_fill_gradient2(name = 'Post-FMT Mean Rel. Abund.',
low = colors.discrete[2], mid = colors.discrete[10], high = 'black',
midpoint = pseudo_log_trans(1e-6, 10)$transform(0.003),
trans = pseudo_log_trans(1e-6, 10),
breaks = c(0.001, 0.01, 0.1),
labels = percent_format(),
guide = guide_colorbar(title.position = "top" ,title.hjust = 0.5)) +
scale_x_continuous(labels = percent_format_signif) +
scale_y_continuous(labels = percent_format_signif) +
theme_cowplot() +
theme(legend.position = 'top',
legend.key.width=unit(1,"cm"),
legend.key.size = unit(0.35, "cm"),
legend.text = element_text(size = 8),
legend.title = element_text(size = 10)) +
labs(x = 'rCDI - Engraftment rank',
y = 'Control - Persistence rank') +
annotate('text', label = paste0('r=', dd.cor.est,' ', dd.cor.sig), x = .8, y = .05, size = 5)
plot + theme(legend.position = 'none')
legend = cowplot::get_legend(plot)
grid.newpage()
grid.draw(legend)
Exporting plot
output_name = 'EngraftmentPersistence.Genus'
ggsave(plot + theme(legend.position = 'none'),
device = 'pdf', dpi = 300, width = 4, height = 4,
filename = paste0(results_dir, fig, output_name, '.Plot.pdf'))
ggsave(legend,
device = 'pdf', dpi = 300, width = 2.5, height = 1,
filename = paste0(results_dir, fig, output_name, '.Legend.pdf'))
Per Species
Format
sstr_cases.species.fraction <-
sstr_cases.species %>%
## filter genera
filter(species %in% species_importance$species) %>%
pivot_longer(names_to = 'Study_Type', values_to = 'n', cols = c('Control','rCDI')) %>%
## calculate fraction of events
group_by(Study_Type, species, source.simple) %>%
mutate(source.n = sum(n, na.rm = T)) %>%
group_by(Study_Type, species) %>%
mutate(total = sum(n, na.rm = T),
f = n / total,
source.f = source.n / total) %>%
## get species-level donor-derived for ordering
group_by(species) %>%
mutate(donor_derived.species = max(ifelse(Study_Type == 'rCDI' &
source.simple == 'Donor',
source.f, 0), na.rm = T)) %>%
ungroup()
sstr_cases.species.fraction.annotated <-
sstr_cases.species.fraction %>%
# add species metrics for present species
left_join(., species_feature_rates) %>%
left_join(., species_transmission_rates) %>%
pivot_longer(names_to = 'feature', values_to = 'value',
cols = c(gram.bool, habit.oral,
oxygen.tolerant, spores.bool,
Control.Persistence.rank, rCDI.Engraftment.rank,
)) %>%
mutate(value_cut = Hmisc::cut2(round(value, 2), cuts = seq(0, 1, 0.2), minmax = T)) %>%
mutate(facet_title = 'Sp. Features')
Bar Plots
Bar plots of per-taxa transmission rates for Control and rCDI data
strain_colors <- c(6, 1, 4, 8, 3, 7, 2)
species_colors <- c(4, 1, 3, 2)
strain_colors_reduced <- c(9, 6, 8, 7, 7 + 5)
p1 <-
sstr_cases.species.fraction %>%
mutate(source = fct_relevel(source,
'Donor', 'Donor Sp.',
'Coexistence',
'Recipient / Initial Sample','Recipient / Initial Sample Sp.',
'Unique to Time Point','Unique to Time Point Sp.',
)) %>%
mutate(species = gsub(species, pattern = '_noname', replacement = '')) %>%
ggplot(aes(x = fct_reorder(species, donor_derived.species, .desc = F), y = f, fill = source)) +
geom_bar(stat = 'identity', colour = 'black', size = 0.05) +
coord_flip() +
guides(fill = guide_legend(ncol = 1)) +
facet_grid(cols = vars(Study_Type),
# rows = vars(control_cluster, rcdi_cluster),
space = 'free_y',
scales = 'free') +
scale_fill_manual(values = colors.discrete[strain_colors]) +
tidytext::scale_x_reordered() +
theme_cowplot(9) +
scale_x_discrete(expand = c(0,0)) +
scale_y_continuous(expand =c(0,0),
labels = scales::percent_format(), breaks = c(0, 0.5, 1)) +
theme(strip.background = element_blank(),
strip.text = element_text(size = 9),
axis.title = element_blank(),
axis.text.x = element_text(hjust = 0.5),
legend.title=element_blank(),
legend.position = 'none',
panel.spacing.x = unit(0.25, "cm")
# plot.margin = unit(c(0, 0, .9, 0), "cm")
)
p1
Metadata for barplots showing the fraction of species that were found in post-FMT samples within each species which hold a feature compared to the ones that do not hold a feature. Color coded from white to black (has feature) in 6 steps.
p2 <-
sstr_cases.species.fraction.annotated %>%
mutate(species = gsub(species, pattern = '_noname', replacement = '')) %>%
mutate(feature = case_when(
feature == 'gram.bool' ~ 'Gram (+)',
feature == 'habit.oral' ~ 'Oral Spp.',
feature == 'pathogen.human' ~ 'Pathogen',
feature == 'oxygen.tolerant' ~ 'Oxy. Tolerant',
feature == 'spores.bool' ~ 'Spore Forming',
feature == 'multistrain' ~ 'Multiple Strains',
T ~ feature)) %>%
filter(feature %in% c('Gram (+)','Oral Spp.','Oxy. Tolerant','Spore Forming')) %>%
ggplot(aes(y = fct_reorder(species, donor_derived.species),
x = fct_relevel(feature, 'Gram (+)', 'Spore Forming', 'Oral Spp.', 'Oxy. Tolerant',
), fill = value_cut)) +
geom_tile() +
coord_equal() +
facet_grid(cols = vars('')) +
scale_y_discrete(position = 'right', expand = c(0,0)) +
scale_fill_manual(values = c('white', 'black')) +
theme_cowplot(9) +
theme(strip.background = element_blank(),
strip.text = element_text(size = 10),
axis.text.y = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), #
axis.title = element_blank(),
axis.ticks.y = element_blank(),
legend.position = 'none',
axis.line.y = element_blank())
p2
output_name = 'PostSource.Taxa.Species'
ggsave(p1, device = 'pdf',
filename = paste0(results_dir, fig, output_name, '.Data.pdf'),
dpi = 300, width = 5, height = 15)
ggsave(p2, device = 'pdf',
filename = paste0(results_dir, fig, output_name, '.Features.pdf'),
dpi = 300, width = 3, height = 15)
Engraftment/Persistence
Engrafted species in rCDI cases are correlated with persisted taxa in control cases
mp_species.rcdi.post.long <-
mp_species.rcdi.long %>%
filter(kingdom == 'Bacteria', !grepl(species, pattern = 'unclassified')) %>%
filter(Sample_Type %in% c('post'), fmt_success) %>%
group_by(kingdom, phylum, class, order, family, genus, species,
Study, Unique_ID, Days_Since_FMT, Case_Name, Donor.Name, Donor.Unique_ID, Donor.Subject) %>%
summarize(rel_abund = sum(rel_abund/100, na.rm = T), .groups = 'drop')
Annotate transmission rates with mean post-FMT rel. abund.
species_transmission_rates.annotated <-
mp_species.rcdi.long %>%
filter(kingdom == 'Bacteria', !grepl(species, pattern = 'unclassified')) %>%
filter(Sample_Type %in% c('post'), fmt_success) %>%
group_by(kingdom, phylum, class, order, family, genus, species) %>%
summarize(rel_abund.mean = mean(rel_abund/100, na.rm = T), .groups = 'drop') %>%
right_join(., species_transmission_rates)
# correlation metrics
dd <- species_transmission_rates.annotated %>%
filter(!is.na(rel_abund.mean))
dd.cor <- tidy(cor.test(dd$rCDI.Engraftment.rank, dd$Control.Persistence.rank, exact = F, method = 'spearman'))
dd.cor.est <- round(dd.cor$estimate, 2)
dd.cor.sig <- cut(dd.cor$p.value, breaks = c(-Inf, 0.001, 0.01, 0.05, Inf),
labels = c("***", "**", "*", ""))
plot <-
species_transmission_rates.annotated %>%
filter(!is.na(rel_abund.mean)) %>%
ggplot(aes(rCDI.Engraftment.rank, Control.Persistence.rank, fill = rel_abund.mean)) +
geom_point(size = 4, shape = 21) +
ggrepel::geom_text_repel(aes(label = species), data = species_transmission_rates.annotated %>%
filter((rCDI.Engraftment.rank > 0.01 & n.rCDI.Engraftment >= 10 |
Control.Persistence.rank > 0.01 & n.Control.Persistence >= 10) & rel_abund.mean > 0.03)
) +
coord_fixed() +
scale_fill_gradient2(name = 'Post-FMT Mean Rel. Abund.',
low = colors.discrete[2], mid = colors.discrete[10], high = 'black',
midpoint = pseudo_log_trans(1e-6, 10)$transform(0.001),
trans = pseudo_log_trans(1e-6, 10),
breaks = c(0.0001, 0.001, 0.01),
labels = percent_format(),
guide = guide_colorbar(title.position = "top" ,title.hjust = 0.5)) +
scale_x_continuous(labels = percent_format_signif) +
scale_y_continuous(labels = percent_format_signif) +
theme_cowplot() +
theme(legend.position = 'top',
legend.key.width=unit(1,"cm"),
legend.key.size = unit(0.35, "cm"),
legend.text = element_text(size = 8),
legend.title = element_text(size = 10)) +
labs(x = 'rCDI - Engraftment rank',
y = 'Control - Persistence rank') +
annotate('text', label = paste0('r=', dd.cor.est,' ', dd.cor.sig), x = 0, y = 1, hjust = 0, size = 5)
plot + theme(legend.position = 'none')
legend = cowplot::get_legend(plot)
grid.newpage()
grid.draw(legend)
Exporting plot
output_name = 'EngraftmentPersistence.Species'
ggsave(plot + theme(legend.position = 'none'),
device = 'pdf', dpi = 300, width = 4, height = 4,
filename = paste0(results_dir, fig, output_name, '.Plot.pdf'))
ggsave(legend,
device = 'pdf', dpi = 300, width = 2.5, height = 1,
filename = paste0(results_dir, fig, output_name, '.Legend.pdf'))
Post-FMT Species Rel. Abund.
Rel. abund. of post-FMT species is dependent on which strain survived the FMT treatment.
Format
competing_cases <-
sstr_cases %>%
filter(Study_Type == 'rCDI') %>%
filter(replace_na(rel_abund.recipient, 0) > 0 & replace_na(rel_abund.donor, 0) > 0) %>%
dplyr::select(taxonomy.levels, event, source, analysis_level, combined_resolution,
starts_with('existed'),
Study, Case_Name, Days_Since_FMT.post, Study_Type, fmt_success,
rel_abund.recipient, rel_abund.post, rel_abund.donor,
ends_with('.mvs'), ends_with('.overlap'),
starts_with('f_poly.'), starts_with('n_covered.')) %>%
left_join(., microbe.metadata)
Stats
competing_cases %>%
filter(analysis_level != 'species') %>%
group_by(source) %>%
tally() %>%
ungroup() %>%
mutate(f = n / sum(n, na.rm = T)) %>%
column_to_rownames('source') %>%
t() %>%
as.data.frame() %>%
rownames_to_column('metric')
Replacement = 207 (50.7%)
Resistence = 119 (29.2%)
New/Unique = 57 (13.9%)
Co-Existence = 25 (6.1%)
Total = 207 + 119 + 57 + 25 = 408
Export Table
output_name = 'Competing_Cases'
competing_cases %>%
filter(analysis_level != 'species') %>%
dplyr::select(Study, Case_Name, Days_Since_FMT.post, fmt_success,
species, source, starts_with('rel_abund'),
ends_with('mvs'), ends_with('overlap'), starts_with('n_covered')) %>%
write_tsv(., paste0(results_dir, fig, output_name, '.tsv'))
Strain-only
Events with all three R/D/P comparisons at strain-level
bb = c(1e-5, 1e-3, 1e-1)
competing_cases.strain <-
competing_cases %>%
filter(replace_na(rel_abund.post, 0) > 0) %>%
filter(n_covered.donor > min_overlap &
n_covered.recipient > min_overlap &
n_covered.post > min_overlap) %>%
filter(source %in% c('donor', 'self', 'both'), source != 'same_species') %>%
pivot_longer(names_to = 'rel_abund.source', values_to = 'rel_abund',
cols = c('rel_abund.recipient', 'rel_abund.donor')) %>%
mutate(rel_abund.source = case_when(
rel_abund.source == 'rel_abund.recipient' ~ 'Recipient',
rel_abund.source == 'rel_abund.donor' ~ 'Donor',
T ~ rel_abund.source)) %>%
mutate(rel_abund.source = fct_relevel(as.factor(rel_abund.source), 'Recipient')) %>%
mutate(source = case_when(
source == 'both' ~ 'Coexistence',
source == 'self' ~ 'Recipient-Specific',
source == 'donor' ~ 'Donor-Specific',
T ~ source
),
source = fct_relevel(source, 'Recipient-Specific', 'Donor-Specific'))
min_cutoff = min(c(min(competing_cases.strain$rel_abund, na.rm = T),
min(competing_cases.strain$rel_abund.post, na.rm = T))) / 100
The relative abundance of the post-FMT species correlates with the relative abundance of the recipient but not donor species, if the recipient species persisted and vice versa.
plot <-
competing_cases.strain %>%
ggplot(aes((rel_abund.post / 100), (rel_abund / 100))) +
geom_point(aes(fill = rel_abund.source), shape = 21, size = 3, col = 'black', alpha = 1) +
scale_x_log10(label = percent_format_signif,
breaks = bb,
expand = c(0.1,0),
limits = c(min_cutoff, 1)
) +
scale_y_log10(label = percent_format_signif,
breaks = bb,
expand = c(0.1,0),
limits = c(min_cutoff, 200) # space for stat_cor
) +
theme_cowplot() +
stat_cor(method = 'spearman', cor.coef.name = 'r',
aes(col = rel_abund.source,
label = paste0(str_replace(str_remove_all(str_to_lower(..r.label..),
pattern = ' '),
pattern = ',', replacement = ', '))),
output.type="text", show.legend = F) +
geom_smooth(method = 'lm', se =F, aes(col = rel_abund.source), show.legend = F) +
scale_color_manual(values = colors.discrete[c(3, 1)]) +
scale_fill_manual(values = colors.discrete[c(3, 1)],
guide = guide_legend(direction = "horizontal",
title.position = "top",
title = 'Rel. Abund.',
title.theme = element_text(angle = 270, size = 14),
label.position="bottom",
label.hjust = 0,
label.vjust = 0.5,
label.theme = element_text(angle = 270)
)
) +
labs(x = 'Post-FMT') +
theme(aspect.ratio = 1,
legend.position = 'left',
strip.background = element_blank(),
axis.text.x = element_text(hjust = .65),
axis.title.y = element_blank()) +
facet_wrap(~source, ncol = 1) +
geom_abline(intercept = 0, slope = 1, linetype= 'dashed')
plot + theme(legend.position = 'none')
legend = cowplot::get_legend(plot)
grid.newpage()
grid.draw(legend)
Combined, 2/3
Events with 2/3 R/D/P comparisons at strain-level
bb = c(1e-5, 1e-3, 1e-1)
competing_cases.combined <-
competing_cases %>%
filter(replace_na(rel_abund.post, 0) > 0) %>%
filter(source %in% c('donor', 'self', 'both'), source != 'same_species') %>%
pivot_longer(names_to = 'rel_abund.source',
values_to = 'rel_abund',
cols = c('rel_abund.recipient', 'rel_abund.donor')) %>%
mutate(rel_abund.source = case_when(
rel_abund.source == 'rel_abund.recipient' ~ 'Recipient',
rel_abund.source == 'rel_abund.donor' ~ 'Donor',
T ~ rel_abund.source)) %>%
mutate(rel_abund.source = fct_relevel(as.factor(rel_abund.source), 'Recipient')) %>%
mutate(source = case_when(
source == 'both' ~ 'Coexistence',
source == 'self' ~ 'Recipient-Derived',
source == 'donor' ~ 'Donor-Derived',
T ~ source
),
source = fct_relevel(source, 'Recipient-Derived', 'Donor-Derived'))
min_cutoff = min(c(min(competing_cases.combined$rel_abund, na.rm = T),
min(competing_cases.combined$rel_abund.post, na.rm = T))) / 100
plot <-
competing_cases.combined %>%
ggplot(aes((rel_abund.post / 100), (rel_abund / 100))) +
geom_point(aes(fill = rel_abund.source), shape = 21, size = 3, col = 'black', alpha = 1) +
scale_x_log10(label = percent_format_signif,
breaks = bb,
expand = c(0.1,0),
limits = c(min_cutoff, 1)
) +
scale_y_log10(label = percent_format_signif,
breaks = bb,
expand = c(0.1,0),
limits = c(min_cutoff, 200) # space for stat_cor
) +
theme_cowplot() +
stat_cor(method = 'spearman',
aes(col = rel_abund.source,
label = paste0(str_replace(str_remove_all(str_to_lower(..r.label..),
pattern = ' '),
pattern = ',', replacement = ', '))),
output.type="text", show.legend = F) +
geom_smooth(method = 'lm', se =F, aes(col = rel_abund.source), show.legend = F) +
scale_color_manual(values = colors.discrete[c(3, 1)]) +
scale_fill_manual(values = colors.discrete[c(3, 1)],
guide = guide_legend(direction = "horizontal",
title.position = "top",
title = 'Rel. Abund.',
title.theme = element_text(angle = 270, size = 14),
label.position="bottom",
label.hjust = 0,
label.vjust = 0.5,
label.theme = element_text(angle = 270)
)
) +
labs(x = 'Post-FMT') +
theme(aspect.ratio = 1,
legend.position = 'left',
strip.background = element_blank(),
axis.text.x = element_text(hjust = .65),
axis.title.y = element_blank()) +
facet_wrap(~source, ncol = 1) +
geom_abline(intercept = 0, slope = 1, linetype= 'dashed')
plot + theme(legend.position = 'none')
legend = cowplot::get_legend(plot)
grid.newpage()
grid.draw(legend)
Exporting plot
output_name = 'StrainSource.RelAbund.rCDI'
ggsave(plot + theme(legend.position = 'none'),
device = 'pdf', dpi = 300, width = 3.5, height = 7,
filename = paste0(results_dir, fig, output_name, '.Plot.pdf'))
ggsave(legend,
device = 'pdf', dpi = 300, width = 1, height = 3,
filename = paste0(results_dir, fig, output_name, '.Legend.pdf'))
Post-FMT Presence/Absence (Strain)
Using a random forest classifier to predict whether strains found in the donor will be found post-FMT in the patient
Format
scale_gelman <- function(x) {
return((x - mean(x, na.rm = T)) / (2 * sd(x, na.rm = T)))
}
sstr_cases.format <-
sstr_cases %>%
filter(Study_Type %in% c('rCDI') ) %>%
filter(kingdom == 'Bacteria') %>%
mutate(source = ifelse(grepl(species, pattern = 'unclassified'), 'Unclassified', source)) %>%
mutate(source = case_when(
analysis_level == 'species' & source == 'self' ~ 'Self Sp.',
analysis_level == 'species' & source == 'donor' ~ 'Donor Sp.',
analysis_level == 'species' & source == 'unique' ~ 'Unique Sp.',
T ~ source
)) %>%
left_join(., microbe.metadata) %>%
mutate_at(.vars = vars(rel_abund.donor, rel_abund.post, rel_abund.recipient),
.funs = funs(. / 100)) %>%
mutate(Donor = replace_na(rel_abund.donor > 0, F),
PostTreatment = replace_na(rel_abund.post > 0, F),
Pre = replace_na(rel_abund.recipient > 0, F),
Both = Donor & Pre,
Any = Donor | Pre) %>%
# filter only species that existed in donor or both
filter(!grepl(species, pattern = 'unclassified'), species %in% microbe.metadata$species) %>%
filter(Donor | Both) %>% # n_covered.donor > min_overlap
mutate(Engrafted = case_when(
# shared_strain
`Donor/Post-FMT.mvs` > min_similarity &
`Donor/Post-FMT.overlap` > min_overlap ~ T,
# other_strain
`Donor/Post-FMT.mvs` < min_similarity &
`Donor/Post-FMT.overlap` > min_overlap ~ F,
# unique_species
replace_na(`Donor/Post-FMT.overlap`, 0) < min_overlap &
!PostTreatment ~ F
)) %>%
# filter only events for which we know outcome
filter(!is.na(Engrafted)) %>%
rename(OxyTol = 'oxygen.tolerant',
OxyClass = 'oxygen.class',
DaysSinceFMT = 'Days_Since_FMT.post',
Case = 'Case_Name',
AbundanceDonor = 'rel_abund.donor',
AbundanceRecipient = 'rel_abund.recipient',
AbundancePost = 'rel_abund.post') %>%
mutate(Habitat = ifelse(habit.oral, 'Oral', 'Not-Oral'),
OxyTol = ifelse(OxyTol, 'Tolerant', 'Not-Tolerant')) %>%
mutate(Detected = case_when(
Pre & !Donor ~ 'Recipient',
!Pre & Donor ~ 'Donor',
Pre & Donor ~ 'Both')) %>%
mutate(Specificity = ifelse(Detected == 'Both', F, T)) %>%
mutate_at(.vars = vars(Case,
Specificity, Study,
Detected, Engrafted,
phylum, class, order, family,
OxyTol, OxyClass, Habitat, PostTreatment,
habit.site, gram_stain, spore_forming
),
.funs = funs(as.factor(.))) %>%
mutate(
Detected = fct_relevel(Detected, 'Recipient'),
OxyClass = fct_relevel(OxyClass, 'aerobe'),
Habitat = fct_relevel(Habitat, 'Not-Oral'),
OxyTol = fct_relevel(OxyTol, 'Not-Tolerant'),
Phylum = fct_relevel(phylum, 'Firmicutes')
) %>%
mutate(OxyTol = as.factor(replace_na(as.character(OxyTol), 'Unknown')),
OxyClass = as.factor(replace_na(as.character(OxyClass), 'unknown'))) %>%
dplyr::select(phylum, class, order, family, genus, species, Study, Engrafted, PostTreatment, AbundancePost, Detected, AbundanceDonor, AbundanceRecipient, Habitat, OxyTol, DaysSinceFMT, Case, OxyClass, habit.site, gram_stain, spore_forming)
Focussing on donor-derived taxa
d <-
sstr_cases.format %>%
mutate_at(.vars = vars(AbundanceDonor, AbundanceRecipient),
.funs = funs(pseudo_log_trans(1e-6, 10)$transform(.))) %>%
mutate(AbundanceRatio = AbundanceDonor - AbundanceRecipient, # diff is ratio at log
DaysSinceFMT = log10(DaysSinceFMT)
) %>%
mutate_at(.vars = vars(starts_with('Abundance'), DaysSinceFMT),
.funs = funs(scale_gelman(.))) %>%
dplyr::select(-AbundancePost,
-PostTreatment, -Case,
-phylum,
-class,
-genus,
-species,
-habit.site
)
Random Forest Model
Randomly sample 80/20 split, use RF to model engraftment, test on 20% hold-out data
seed = 30
set.seed(seed)
trainIndex <- createDataPartition(d$Engrafted, p = .8,
list = FALSE,
times = 1)[,1]
train <- d[ trainIndex,]
test <- d[-trainIndex,]
unlist(lapply(list(summary(train$Engrafted)), function(x) x / sum(x)))
nrow(train)
unlist(lapply(list(summary(test$Engrafted)), function(x) x / sum(x)))
nrow(test)
Model
set.seed(seed)
rf <- randomForest(
Engrafted ~ .,
na.action = na.roughfix,
data = train,
ntree = 500, nodesize = 1,
mtry = 3, importance = T
)
# imp based on validation data
imp <-
randomForest::importance(rf, type = 1, scale = F, )
Performance
options(yardstick.event_first = FALSE)
test$rf <- predict(rf, test, type = 'prob')[,2]
roc <-
test %>%
ggplot(aes(m = rf, d = Engrafted)) +
geom_roc(n.cuts = 0, labels = F)
auPR <-
test %>%
pr_auc(data = ., truth = Engrafted, rf, na_rm = T) %>%
pull(.estimate) %>% round(., digits = 3)
test %>%
dplyr::mutate(Engrafted = factor(Engrafted)) %>%
pr_curve(data = ., truth = Engrafted, rf) %>%
ggplot(aes(y = precision, x = recall)) +
geom_line(size = 1) +
coord_fixed() +
theme_cowplot() +
scale_color_manual(values = colors.discrete) +
scale_x_continuous(labels = percent_format(), limits = c(0, 1)) +
scale_y_continuous(labels = percent_format(), limits = c(0, 1), sec.axis = dup_axis()) +
labs(x = 'Recall', y = 'Precision') +
geom_abline(intercept = 1, slope = -1, linetype = 'dashed') +
theme(legend.position = 'none',
axis.line.y.right = element_blank(),
axis.ticks.y.right = element_blank(),
axis.text.y.right = element_blank(),
axis.title.y.left = element_blank())
auROC <- calc_auc(roc)$AUC %>% round(., digits = 3)
auROC
auPR
plot <-
roc +
annotate('text', label = paste0('auROC: ', auROC), x = 0.4, y = 0.15, hjust = 0, size = 5) +
annotate('text', label = paste0('auPR: ', auPR), x = 0.4, y = 0.05, hjust = 0, size = 5) +
theme_cowplot() +
coord_fixed() +
scale_x_continuous(labels = percent_format(), limits = c(0, 1)) +
scale_y_continuous(labels = percent_format(), limits = c(0, 1)) +
geom_abline(slope = 1, linetype = 'dashed') +
labs(x = 'False Positive', y = 'True Positive') +
theme(aspect.ratio = 1)
plot
Exporting plot
output_name = 'PostPresence.Strain.RF.ROC'
ggsave(plot + theme(legend.position = 'none'),
device = 'pdf', dpi = 300, width = 4, height = 4,
filename = paste0(results_dir, fig, output_name, '.Plot.pdf'))
Variable Importance
Permuted vip
rf.vip_acc <- as.data.frame(rf$importance) %>%
rownames_to_column('feature') %>%
dplyr::select(feature, MeanDecreaseAccuracy)
rf.vip_sd <- as.data.frame(rf$importanceSD) %>%
rownames_to_column('feature') %>%
dplyr::select(feature, MeanDecreaseAccuracy) %>%
mutate(sd = MeanDecreaseAccuracy) %>%
dplyr::select(feature, sd)
rf.vip <- full_join(rf.vip_acc, rf.vip_sd, by = 'feature')
feat_order <- c('Taxonomy', 'Species\nAbundance', 'Clinical', 'Physiology')
plot <- rf.vip %>%
rename(importance = 'MeanDecreaseAccuracy') %>%
mutate(feature = as.character(feature)) %>%
mutate(category = case_when(
feature %in% c('kingdom', 'phylum', 'class', 'order', 'family') ~ 'Taxonomy',
feature %in% c('DaysSinceFMT', 'Study') ~ 'Clinical',
feature %in% c('AbundanceDonor', 'AbundanceRecipient', 'AbundanceRatio', 'Detected') ~ 'Species\nAbundance',
feature %in% c('Habitat', 'OxyClass', 'OxyTol', 'gram_stain', 'spore_forming') ~ 'Physiology',
T ~ 'other'
)) %>%
mutate(feature = case_when(
grepl(feature, pattern = '^Abundance') ~ str_split_fixed(feature, 'Abundance', 2)[,2],
feature == 'DaysSinceFMT' ~ 'Days Since FMT',
feature == 'Detected' ~ 'Donor-Specificity',
feature == 'GCcontent' ~ '%GC-Content',
feature == 'OxyTol' ~ 'Oxygen Tolerance',
feature == 'OxyClass' ~ 'Oxygen Usage',
feature == 'Habitat' ~ 'Oral Habitat',
feature == 'family' ~ 'Family',
feature == 'order' ~ 'Order',
feature == 'gram_stain' ~ 'Gram Staining',
feature == 'spore_forming' ~ 'Spore Formation',
T ~ feature
)) %>%
ggplot(aes(fct_reorder(feature, importance), importance)) +
geom_bar(stat = 'identity', fill = 'black', width = 0.1) +
geom_point(size = 3) +
coord_flip() +
scale_y_continuous(labels = percent_format_signif) +
facet_grid(rows = vars(fct_relevel(category, feat_order)), scales = 'free_y', space = 'free_y') +
labs(x= '', y = 'Mean Decrease in Accuracy') +
theme_cowplot() +
theme(strip.background = element_blank(),
strip.text.y = element_text(angle = 0),
# strip.text.y = element_blank()
)
plot
Exporting plot
output_name = 'PostPresence.Strain.RF.VarImp'
ggsave(plot + theme(legend.position = 'none'),
device = 'pdf', dpi = 300, width = 5, height = 4,
filename = paste0(results_dir, fig, output_name, '.Plot.pdf'))
Abundance in relation to Engraftment Frequency
Format
sstr_cases.select <-
sstr_cases.format %>%
dplyr::select(species, Case, AbundanceDonor, AbundanceRecipient, Engrafted) %>%
mutate(AbundanceRatio =
pseudo_log_trans(1e-6, 10)$transform(AbundanceDonor) -
pseudo_log_trans(1e-6, 10)$transform(AbundanceRecipient)) %>%
dplyr::select(-species, -Case)
What fraction of events are engrafted, given that the relative abundance of the donor strain is >=x%?
sstr_cases.select.AbundanceDonor <-
sstr_cases.select %>% arrange(desc(AbundanceDonor)) %>%
dplyr::select(AbundanceDonor, Engrafted)
get_donor_fraction_by_abundance <- function(min_abund) {
x <- sstr_cases.select.AbundanceDonor %>%
filter(AbundanceDonor >= min_abund)
return(sum(x$Engrafted == T) / nrow(x))
}
r.abund <- c()
abundance_steps <- 10^(seq(
log10(min(sstr_cases.select.AbundanceDonor$AbundanceDonor)),
log10(sstr_cases.select.AbundanceDonor$AbundanceDonor[15]), # at least 15 observations
length.out = 100))
for (as in abundance_steps) {r.abund <- c(r.abund, get_donor_fraction_by_abundance(as))}
r.abund.df <-
data.frame(min_rel_abund = abundance_steps,
engrafted_fraction = r.abund)
plot.donor <-
r.abund.df %>%
ggplot(aes(x = min_rel_abund,
y = engrafted_fraction)) +
geom_line() +
scale_x_continuous(trans = 'log10',
labels = c('', '.001%', '', '.1%', '', '10%', ''),
breaks = c(1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1)
) +
scale_y_continuous(labels = percent_format_signif) +
theme_cowplot() +
theme(aspect.ratio = 1) +
labs(x = 'Rel. Abund. in Donor', y = 'Cumulative\nEngraftment Frequency')
plot.donor
What fraction of events are engrafted, given that the ratio of relative abundances of the donor to recipient is >=x?
sstr_cases.select.AbundanceRatio <-
sstr_cases.select %>% arrange(desc(AbundanceRatio)) %>%
filter(AbundanceRecipient > 0) %>%
dplyr::select(AbundanceRatio, Engrafted)
get_donor_fraction_by_ratio <- function(min_ratio) {
x <- sstr_cases.select.AbundanceRatio %>% filter(AbundanceRatio >= min_ratio)
return(sum(x$Engrafted == T) / nrow(x))
}
r.ratio <- c()
abundance_steps <- seq(
min(sstr_cases.select.AbundanceRatio$AbundanceRatio),
sstr_cases.select.AbundanceRatio$AbundanceRatio[15], # at least 15 observations
length.out = 100)
for (as in abundance_steps) {r.ratio <- c(r.ratio, get_donor_fraction_by_ratio(as))}
r.ratio.df <-
data.frame(min_ratio = abundance_steps,
engrafted_fraction = r.ratio)
plot.ratio <-
r.ratio.df %>%
ggplot(aes(x = min_ratio,
y = engrafted_fraction)) +
geom_line() +
scale_y_continuous(labels = percent_format_signif) +
theme_cowplot() +
theme(aspect.ratio = 1) +
labs(x = 'Donor/Recipient (Log-Ratio)', y = 'Cumulative\nEngraftment Frequency')
plot.ratio
Exporting plots
output_name = 'EngraftmentFrequency'
ggsave(plot.donor,
device = 'pdf', dpi = 300, width = 4, height = 4,
filename = paste0(results_dir, fig, output_name, '.Donor.Plot.pdf'))
ggsave(plot.ratio,
device = 'pdf', dpi = 300, width = 4, height = 4,
filename = paste0(results_dir, fig, output_name, '.Ratio.Plot.pdf'))