We here define historical growth effect as the growth effect calculated only from posterior samples which yield age at onset estimates which are larger than 0.
pop_size <- 0.5e5
source("Scripts/vaf_dynamics_functions.R")
##
## Attaching package: 'greta'
## The following objects are masked from 'package:stats':
##
## binomial, cov2cor, poisson
## The following objects are masked from 'package:base':
##
## %*%, apply, backsolve, beta, chol2inv, colMeans, colSums, diag,
## eigen, forwardsolve, gamma, identity, rowMeans, rowSums, sweep,
## tapply
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ tibble 3.0.4 ✔ dplyr 1.0.2
## ✔ tidyr 1.1.2 ✔ stringr 1.4.0
## ✔ readr 1.4.0 ✔ forcats 0.5.0
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::slice() masks greta::slice()
## This is bayesplot version 1.7.2
## - Online documentation and vignettes at mc-stan.org/bayesplot
## - bayesplot theme set to bayesplot::theme_default()
## * Does _not_ affect other ggplot2 plots
## * See ?bayesplot_theme_set for details on theme setting
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggpubr':
##
## get_legend
##
## Attaching package: 'extraDistr'
## The following objects are masked from 'package:gtools':
##
## ddirichlet, rdirichlet
## The following object is masked from 'package:purrr':
##
## rdunif
##
## ---------------------
## Welcome to dendextend version 1.14.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:ggpubr':
##
## rotate
## The following object is masked from 'package:stats':
##
## cutree
##
## Attaching package: 'ape'
## The following objects are masked from 'package:dendextend':
##
## ladderize, rotate
## The following object is masked from 'package:ggpubr':
##
## rotate
## Registered S3 method overwritten by 'treeio':
## method from
## root.phylo ape
## ggtree v2.0.4 For help: https://yulab-smu.github.io/treedata-book/
##
## If you use ggtree in published research, please cite the most appropriate paper(s):
##
## [36m-[39m Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution 2018, 35(12):3041-3043. doi: 10.1093/molbev/msy194
## [36m-[39m Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36, doi:10.1111/2041-210X.12628
##
## Attaching package: 'ggtree'
## The following object is masked from 'package:ape':
##
## rotate
## The following object is masked from 'package:dendextend':
##
## rotate
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:ggpubr':
##
## rotate
##
## Attaching package: 'reghelper'
## The following object is masked from 'package:greta':
##
## beta
## The following object is masked from 'package:base':
##
## beta
source("Scripts/prepare_data.R")
model_file_name <- "models/model_ch.RDS"
dir.create("figures/model_ch/other_associations")
## Warning in dir.create("figures/model_ch/other_associations"): 'figures/model_ch/
## other_associations' already exists
set.seed(42)
model_id <- "model_ch"
load(file = sprintf('data_output/vaf_modelling_coefficients_%s.Rdata',model_id))
values_model <- readRDS(file = model_file_name)
site_samples <- values_model$draws$`11`[,grep('site',colnames(values_model$draws$`11`))]
gene_samples <- values_model$draws$`11`[,grep('gene',colnames(values_model$draws$`11`))]
clone_samples <- values_model$draws$`11`[,grep('clone',colnames(values_model$draws$`11`))]
offset_samples <- values_model$draws$`11`[,grep('u',colnames(values_model$draws$`11`))]
beta_samples <- values_model$draws$`11`[,grep('beta',colnames(values_model$draws$`11`))] %>%
unlist()
sampling_idxs <- r_values_full %>%
subset(b_site_095 + b_gene_095 + b_clone_095 > 0) %>%
group_by(
individual,site,
site_numeric,
gene_numeric,
clone_numeric
) %>%
summarise(
minAge = min(age[which(round(true/coverage,4) >= 0.005)]),
maxAge = max(age),
maxVAF = max(true/coverage)
) %>%
arrange(site)
c_names <- colnames(values_model$draws$`11`)
good_sites_individual <- statistics_data %>%
ungroup %>%
select(site,individual,sum_stat) %>%
filter(sum_stat) %>%
select(-sum_stat)
historical_estimates <- sampling_idxs %>%
apply(1,function(x) {
individual <- x[1]
site_numeric <- as.numeric(x[3])
gene_numeric <- as.numeric(x[4])
clone_numeric <- as.numeric(x[5])
minAge <- as.numeric(x[6])
if (is.na(site_numeric)) {
b_site <- 0} else {
b_site <- site_samples[,site_numeric]
}
b_gene <- gene_samples[,gene_numeric]
b_clone <- clone_samples[,clone_numeric]
b <- b_gene + b_clone + b_site
u_values <- offset_samples[,clone_numeric]
ages_at_onset <- t0_adjusted(u_values,b,g=2,n=pop_size) + values_model$min_age
ages_at_onset_corrected <- ifelse(ages_at_onset < -1,-1,ages_at_onset)
ages_at_onset_corrected <- ifelse(ages_at_onset > minAge,minAge,ages_at_onset)
biologically_feasible <- ages_at_onset > -1 & ages_at_onset < minAge
historical_b <- b[biologically_feasible]
Q <- quantile(historical_b,c(0.05,0.5,0.95),na.rm = T)
Q_observed <- quantile(b,c(0.05,0.5,0.95),na.rm = T)
names(Q) <- c("Q05","Q50","Q95")
return(data.frame(
ages_Q50 = quantile(ages_at_onset_corrected,0.5,na.rm = T)[1],
observed_growth_q05=Q_observed[1],
observed_growth_q50=Q_observed[2],
observed_growth_q95=Q_observed[3],
observed_growth_mean=mean(b,na.rm=T),
historical_growth_q05=Q[1],
historical_growth_q50=Q[2],
historical_growth_q95=Q[3],
historical_growth_mean=mean(historical_b,na.rm=T)))
}) %>%
do.call(what=rbind)
historical_estimates <- cbind(
as.data.frame(sampling_idxs),as.data.frame(historical_estimates))
historical_estimates$gene <- str_match(historical_estimates$site,"[A-Z0-9]+")
historical_estimates <- historical_estimates %>%
filter(
paste(individual,as.character(site)) %in% paste(
good_sites_individual$individual,good_sites_individual$site))
historical_estimates %>%
select(SardID = individual,
Site = site,
Gene = gene,
ObservedGrowthQ05 = observed_growth_q05,
ObservedGrowthQ50 = observed_growth_q50,
ObservedGrowthQ95 = observed_growth_q95,
ObservedGrowthMean = observed_growth_mean,
HistoricalGrowthQ05 = historical_growth_q05,
HistoricalGrowthQ50 = historical_growth_q50,
HistoricalGrowthQ95 = historical_growth_q95,
HistoricalGrowthMean = historical_growth_mean) %>%
write.csv("data_output/historical_growth.csv")
idxs <- sampling_idxs %>%
subset(individual == 4088 & site == "DNMT3At-E578X")
tmp_data <- full_data %>%
subset(SardID == idxs$individual & amino_acid_change == idxs$site)
#idxs <- sampling_idxs[1,]
S <- data.frame(
u = offset_samples[,idxs$clone_numeric],
b_gene = gene_samples[,idxs$gene_numeric],
b_site = site_samples[,idxs$site_numeric],
b_clone = clone_samples[,idxs$clone_numeric],
idx = seq(1,nrow(clone_samples)),
maxAge = idxs$maxAge) %>%
apply(1,function(x) {
u <- as.numeric(x[1])
b_gene <- as.numeric(x[2])
b_site <- as.numeric(x[3])
b_site <- ifelse(is.na(b_site),0,b_site)
b_clone <- as.numeric(x[4])
max_age <- as.numeric(x[6]) + 25
b <- b_gene+b_site+b_clone
traj <- suppressWarnings(trajectory_from_parameters_2(
b = b,u = u,g = 2,N = pop_size,x = seq(-20,max_age,by = 0.5) - values_model$min_age))
age_at_onset <- t0_adjusted(u,b,g = 2,n = pop_size)
return(data.frame(
age = traj$x + values_model$min_age,
VAF = traj$y,
t0 = age_at_onset + values_model$min_age,
ID = as.numeric(x[5])
))
}) %>%
do.call(what = rbind) %>%
mutate(plausible = t0 >= 0) %>%
mutate(VAF = ifelse(is.na(VAF),0,VAF))
x <- rbind(
S %>%
subset(plausible == T) %>%
group_by(age,plausible) %>%
summarise(q05 = quantile(VAF,0.05,na.rm = T),
q50 = quantile(VAF,0.5,na.rm = T),
q95 = quantile(VAF,0.95,na.rm = T)) %>%
as.data.frame,
S %>%
group_by(age) %>%
summarise(plausible = as.logical(0),
q05 = quantile(VAF,0.05,na.rm = T),
q50 = quantile(VAF,0.5,na.rm = T),
q95 = quantile(VAF,0.95,na.rm = T)) %>%
as.data.frame) %>%
subset(q95 > 0)
plot_good_bad <- x %>%
group_by(age) %>%
subset(q95 > 0) %>%
mutate(q05 = ifelse(
all(c(T,F) %in% plausible),
ifelse(plausible == F,min(q95),q05),
q05)) %>%
ggplot(aes(x = age,ymin = q05,ymax = q95,fill = plausible)) +
geom_ribbon(alpha = 0.3) +
geom_line(aes(y = q50,colour = plausible),size = 0.25) +
geom_vline(xintercept = 0,linetype = 3,
size = 0.5) +
theme_gerstung(base_size = 6) +
xlab("Age") +
ylab("VAF") +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.x = element_blank()) +
scale_fill_manual(values = colorspace::lighten(c("red4","goldenrod"),amount = 0.2),
guide = F) +
scale_colour_manual(values = colorspace::darken(c("red4","goldenrod"),amount = 0.2),
guide = F) +
scale_y_continuous(trans = 'log10') +
scale_x_continuous(breaks = 0)
plot_plausible <- x %>%
ggplot(aes(x = age,y = q50,group = plausible,colour = plausible)) +
geom_line(size = 0.25) +
geom_point(data = tmp_data,aes(x = Age,y = VAF),inherit.aes = F,size = 0.25) +
geom_vline(xintercept = 0,linetype = 3,
size = 0.5) +
theme_gerstung(base_size = 6) +
xlab("Age") +
ylab("VAF") +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.x = element_blank()) +
scale_colour_manual(values = c("red4","goldenrod"),guide = F) +
scale_y_continuous(trans = 'log10') +
scale_x_continuous(breaks = 0,limits = c(min(x$age),max(x$age)))
plot_grid(plot_good_bad,plot_plausible) +
ggsave(filename = sprintf("figures/%s/dynamic_coefficients/good_bad_traj.pdf",model_id),
width = 1.4,height = 0.55,
useDingbats = F)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
plot_good_bad +
geom_point(data = tmp_data,aes(x = Age,y = VAF),inherit.aes = F,size = 0.25) +
ggsave(filename = sprintf("figures/%s/dynamic_coefficients/good_bad_traj_combo.pdf",model_id),
width = 0.8,height = 0.8,
useDingbats = F)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
idxs <- sampling_idxs %>%
subset((individual == 4088 & site == "DNMT3At-E578X") | (individual == 3877 & site == "U2AF1-Q157R"))
tmp_data <- full_data %>%
subset(SardID %in% idxs$individual & amino_acid_change %in% idxs$site)
#idxs <- sampling_idxs[1,]
S <- data.frame(
u = c(offset_samples[,idxs$clone_numeric]),
b_gene = c(gene_samples[,idxs$gene_numeric]),
b_site = c(site_samples[,idxs$site_numeric]),
b_clone = c(clone_samples[,idxs$clone_numeric]),
idx = c(replicate(2,seq(1,nrow(clone_samples)))),
maxAge = c(sapply(idxs$maxAge, function(x) rep(x,nrow(offset_samples)))),
site = c(sapply(idxs$site, function(x) rep(x,nrow(offset_samples)))),
id = c(sapply(idxs$individual, function(x) rep(x,nrow(offset_samples))))) %>%
apply(1,function(x) {
u <- as.numeric(x[1])
b_gene <- as.numeric(x[2])
b_site <- as.numeric(x[3])
b_site <- ifelse(is.na(b_site),0,b_site)
b_clone <- as.numeric(x[4])
max_age <- as.numeric(x[6]) + 10
b <- b_gene+b_site+b_clone
traj <- suppressWarnings(trajectory_from_parameters_2(
b = b,u = u,g = 2,N = pop_size,x = seq(-20,max_age,by = 0.5) - values_model$min_age))
age_at_onset <- t0_adjusted(u,b,g = 2,n = pop_size)
return(suppressWarnings(data.frame(
age = traj$x + values_model$min_age,
VAF = traj$y,
t0 = age_at_onset + values_model$min_age,
ID = as.numeric(x[5]),
site = x[7],
individual = x[8]
)))
}) %>%
do.call(what = rbind) %>%
mutate(plausible = t0 >= 0) %>%
mutate(VAF = ifelse(is.na(VAF),0,VAF))
x <- S %>%
group_by(age,site,individual) %>%
summarise(plausible = as.logical(0),
q05 = quantile(VAF,0.05,na.rm = T),
q50 = quantile(VAF,0.5,na.rm = T),
q95 = quantile(VAF,0.95,na.rm = T))
plot_slow_fast <- x %>%
ggplot(aes(x = age,y = q50,colour = site)) +
geom_ribbon(aes(ymin = q05,ymax = q95,fill = site),
colour = NA,alpha = 0.5) +
geom_line(size = 0.25) +
#geom_point(data = tmp_data,aes(x = Age,y = VAF),inherit.aes = F,size = 0.25) +
geom_vline(xintercept = 0,linetype = 3,
size = 0.5) +
theme_gerstung(base_size = 6) +
xlab("Age") +
ylab("VAF") +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
scale_colour_manual(values = c("goldenrod","lightblue"),guide = F) +
scale_fill_manual(values = c("goldenrod","lightblue"),guide = F) +
scale_y_continuous(trans = 'log10') +
scale_x_continuous(breaks = 0,limits = c(min(x$age),max(x$age))) +
theme(axis.title.x = element_blank())
plot_slow_fast +
ggsave(filename = sprintf("figures/%s/dynamic_coefficients/slow_fast_traj.pdf",model_id),
width = 0.6,height = 0.5,
useDingbats = F)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
idxs <- sampling_idxs %>%
subset(grepl("TP53",site))# %>% subset(individual == 4088 & site == "DNMT3At-E578X")
idxs <- idxs[1,]
tmp_data <- full_data %>%
subset(SardID == idxs$individual & amino_acid_change == idxs$site)
S <- data.frame(
u = offset_samples[,idxs$clone_numeric],
b_gene = gene_samples[,idxs$gene_numeric],
b_site = site_samples[,idxs$site_numeric],
b_clone = clone_samples[,idxs$clone_numeric],
idx = seq(1,nrow(clone_samples)),
maxAge = idxs$maxAge) %>%
apply(1,function(x) {
u <- as.numeric(x[1])
b_gene <- as.numeric(x[2])
b_site <- as.numeric(x[3])
b_site <- ifelse(is.na(b_site),0,b_site)
b_clone <- as.numeric(x[4])
max_age <- as.numeric(x[6]) + 25
b <- b_gene+b_site+b_clone
traj <- suppressWarnings(trajectory_from_parameters_2(
b = b,u = u,g = 2,N = pop_size,x = seq(-20,max_age,by = 0.5) - values_model$min_age))
age_at_onset <- t0_adjusted(u,b,g = 2,n = pop_size)
return(data.frame(
age = traj$x + values_model$min_age,
VAF = traj$y,
t0 = age_at_onset + values_model$min_age,
ID = as.numeric(x[5])
))
}) %>%
do.call(what = rbind) %>%
mutate(plausible = t0 >= 0) %>%
mutate(VAF = ifelse(is.na(VAF),0,VAF))
x <- rbind(
S %>%
subset(plausible == T) %>%
group_by(age,plausible) %>%
summarise(q05 = quantile(VAF,0.05,na.rm = T),
q50 = quantile(VAF,0.5,na.rm = T),
q95 = quantile(VAF,0.95,na.rm = T)) %>%
as.data.frame,
S %>%
group_by(age) %>%
summarise(plausible = as.logical(0),
q05 = quantile(VAF,0.05,na.rm = T),
q50 = quantile(VAF,0.5,na.rm = T),
q95 = quantile(VAF,0.95,na.rm = T)) %>%
as.data.frame) %>%
subset(q95 > 0)
plot_good_bad <- x %>%
group_by(age) %>%
subset(q95 > 0) %>%
mutate(q05 = ifelse(
all(c(T,F) %in% plausible),
ifelse(plausible == F,min(q95),q05),
q05)) %>%
ggplot(aes(x = age,ymin = q05,ymax = q95,fill = plausible)) +
geom_ribbon() +
geom_vline(xintercept = 0,linetype = 3,
size = 0.5) +
theme_gerstung(base_size = 6) +
xlab("Age") +
ylab("VAF") +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
scale_fill_manual(values = colorspace::lighten(c("red4","goldenrod"),amount = 0.01),
guide = F) +
scale_y_continuous(trans = 'log10') +
scale_x_continuous(breaks = 0)
plot_plausible <- x %>%
ggplot(aes(x = age,y = q50,group = plausible,colour = plausible)) +
geom_line(size = 0.25) +
geom_point(data = tmp_data,aes(x = Age,y = VAF),inherit.aes = F,size = 0.25) +
geom_vline(xintercept = 0,linetype = 3,
size = 0.5) +
theme_gerstung(base_size = 6) +
xlab("Age") +
ylab("VAF") +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
scale_colour_manual(values = c("red4","goldenrod"),guide = F) +
scale_y_continuous(trans = 'log10') +
scale_x_continuous(breaks = 0,limits = c(min(x$age),max(x$age)))
plot_grid(plot_good_bad,plot_plausible) +
ggsave(filename = sprintf("figures/%s/dynamic_coefficients/good_bad_traj_2.pdf",model_id),
width = 1.4,height = 0.55,
useDingbats = F)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
We can see clearly a bias in terms of genes where the difference between historical and observed growth is concerned - particularly, the phase of clonal dynamics we are able to observe for TP53, DNMT3A and BRCC3 is far too slow to explain the observed VAF.
aging_order <- c(
"DNMT3A","BRCC3","TP53","GNB1","ASXL1","TET2","CBL",
"CTCF","PPM1D","JAK2","KRAS","SRSF2-other","SF3B1","IDH1",
"IDH2","PTPN11","U2AF1","SRSF2-P95H")
min_max <- c(
exp(min(historical_estimates$observed_growth_mean,historical_estimates$historical_growth_mean,na.rm=T))-1,
exp(max(historical_estimates$observed_growth_mean,historical_estimates$historical_growth_mean,na.rm=T))-1
)
growth_per_year_plot <- historical_estimates %>%
ggplot(aes(x = exp(observed_growth_mean)-1,
y = exp(historical_growth_mean)-1,
colour = gene)) +
geom_abline(slope = 1,
size = 0.2,
linetype=2) +
geom_point(size = 0.2) +
scale_y_continuous(limits = min_max,
breaks = seq(-0.2,1,by = 0.2),
expand = c(0.02,0),
labels = sprintf("%s%%",seq(-0.2,1,by = 0.2)*100)) +
scale_x_continuous(limits = min_max,
breaks = seq(-0.2,1,by = 0.2),
expand = c(0.02,0),
labels = sprintf("%s%%",seq(-0.2,1,by = 0.2)*100)) +
theme_gerstung(base_size = 6) +
scale_colour_manual(values = gene_colours,guide = F) +
xlab("Observed growth per year") +
ylab("Historical growth per year")
growth_per_year_gene_plot_data <- historical_estimates %>%
mutate(gene = ifelse(gene == "SRSF2",
ifelse(site == "SRSF2-P95H",
"SRSF2-P95H",
"SRSF2-other"),
gene)) %>%
group_by(gene) %>%
summarise(average_historical_growth = mean(exp(historical_growth_mean)-1,na.rm=T),
average_observed_growth = mean(exp(observed_growth_mean)-1,na.rm=T))
growth_per_year_gene_plot <- growth_per_year_gene_plot_data %>%
ggplot(aes(x = factor(gene,levels = aging_order),
y = (average_observed_growth + average_historical_growth)/2,
colour = str_match(gene,"[A-Z0-9]+"))) +
geom_linerange(aes(ymin = average_observed_growth,ymax = average_historical_growth),
colour = "black") +
geom_text(aes(y = average_historical_growth,
label = sprintf("%s%%",100*round(average_observed_growth/average_historical_growth,3))),
colour = "black",
size = 2.7,
hjust = -0.2) +
geom_point(aes(y = average_observed_growth),size = 1) +
geom_point(aes(y = average_historical_growth),size = 1) +
geom_point(aes(y = average_historical_growth),size = 0.5,colour = 'white') +
theme_gerstung(base_size = 6) +
xlab("") +
ylab("Growth per year") +
scale_y_continuous(breaks = seq(-0.2,1,by = 0.1),
labels = sprintf("%s%%",seq(-0.2,1,by = 0.1)*100),
expand = c(0,0,0.05,0),limits = c(0,NA)) +
scale_colour_manual(values = gene_colours,guide = F) +
rotate_x_text(face = "italic")
plot_grid(growth_per_year_plot,growth_per_year_gene_plot,nrow=1,align = "h") +
ggsave(filename = sprintf("figures/%s/dynamic_coefficients/historical_growth_figure.pdf",model_id),
height = 2,
width = 3.5,
useDingbats = F)
## Warning: Removed 12 rows containing missing values (geom_point).
growth_per_year_gene_plot +
theme(axis.text.x = element_text(face = "plain",angle = 0,hjust = 0.5),
axis.text.y = element_text(face = "italic")) +
coord_flip() +
ggsave(filename = sprintf("figures/%s/dynamic_coefficients/historical_growth_figure_larger.pdf",model_id),
width = 1.7,height = 2,useDingbats = F)
We can estimate the annual growth that one would expect and the growth one would be able to observe in the later years of a clone from phylogenetic trees alone. This allows us to draw a comparison between the EPS trajectory defined by a tree and the ratio between observed and plausible - or historical - growth. In this figure we present the combined evidence from simulations and real data (both longitudinal and phylogenetic) that clonal deceleration is observable across most clones, with a few genes being characterised by their larger deceleration - we estimate that DNMT3A, one of the most prevalent genes in CH, loses in average at least half of its annual growth rate.
mitchell_coefficients <- read.csv("data_output/mitchell_bnpr_coefficients.csv") %>%
subset(w_sum < 10) %>%
group_by(id,clade) %>%
select(id,clade,
early = cp_1,late = cp_2,expected = nonlinear_vaf) %>%
mutate(plotting_id = "Real data") %>%
mutate(source = "Mitchell et al. 2021") %>%
mutate(colour = ifelse(grepl("DNMT3A",clade),"DNMT3A",
ifelse(grepl("Clade",clade),"No known driver","Other driver")))
tree_coefficients <- read.csv("data_output/tree_bnpr_coefficients.csv") %>%
transmute(id = individual,clade = plot_label,
early = b_early,late = b_early + b_late,expected = expected_late) %>%
mutate(plotting_id = "Real data") %>%
mutate(source = "Fabre et al. 2021") %>%
mutate(colour = str_match(clade,"[A-Z0-9]+")) %>%
mutate(colour = ifelse(grepl("UD",colour),"No known driver",colour))
simulation_coefficients_full <- read.csv("data_output/simulated_bnpr_coefficients.csv") %>%
subset(w_sum < 5)
simulation_coefficients <- simulation_coefficients_full %>%
subset(col == "Low variance") %>%
mutate(plotting_id = ifelse(late_growth/growth_rate < 0.8,
"Saturating",
"Near\nconstant")) %>%
select(id = name,clade = driver,early = cp_1,late = cp_late,expected = expected_nl,plotting_id) %>%
mutate(source = "Simulated") %>%
mutate(colour = "No known driver")
all_bnpr_coefficients <- rbind(as.data.frame(mitchell_coefficients),
as.data.frame(tree_coefficients),
as.data.frame(simulation_coefficients))
## Warning in `[<-.factor`(`*tmp*`, ri, value = c(2259L, 2259L, 500L, 500L, :
## invalid factor level, NA generated
## Warning in `[<-.factor`(`*tmp*`, ri, value = c(2259L, 2259L, 500L, 500L, :
## invalid factor level, NA generated
x_order <- c("Near\nconstant","Saturating",
"Real data","Obs./plaus.\ngrowth ratio") %>%
rev
rbind(
all_bnpr_coefficients %>%
group_by(source) %>%
summarise(N = sum((late / expected) < 0.8,na.rm = T),
Total = length(id)) %>%
mutate(Ratio = N / Total),
historical_estimates %>%
mutate(ratio = (exp(observed_growth_mean)-1) / (exp(historical_growth_mean)-1)) %>%
summarise(N = sum(ratio < 0.8,na.rm=T),Total = length(ratio)) %>%
mutate(Ratio = N / Total) %>%
mutate(source = "Fabre et al. 2021 (longitudinal)") %>%
select(source,N,Total,Ratio)) %>%
select(Source = source,`Number of saturating trajectories` = N,
`Total number of trajectories` = Total,Proportion = Ratio) %>%
as.data.frame
all_bnpr_coefficients %>%
mutate(late_expected = late / expected) %>%
mutate(late_expected = ifelse(late_expected < -1,-1,late_expected)) %>%
ggplot(aes(x = plotting_id,y = late_expected)) +
geom_point(
data = growth_per_year_gene_plot_data,inherit.aes = F,
mapping = aes(x = "Obs./plaus.\ngrowth ratio",
y = average_observed_growth/average_historical_growth,
group = reorder(gene,average_observed_growth/average_historical_growth),
colour = str_match(gene,"[A-Z0-9]+"),
shape = "Fabre et al. 2021"),
size = 0.3,
position = position_dodge(width = 0.75)) +
geom_point(aes(shape = source,colour = colour),size = 0.25,
position = position_jitter(width = 0.2,height = 0),
alpha = 0.9) +
geom_boxplot(size = 0.25,fill = NA,outlier.color = NA) +
geom_hline(yintercept = 1.0,size = 0.25) +
theme_gerstung(base_size = 6) +
scale_shape_manual(values = c("circle","triangle","cross"),name = NULL) +
scale_x_discrete(breaks = x_order,limits = x_order) +
xlab("") +
ylab("Growth ratio") +
scale_colour_manual(values = c(gene_colours,
`Other driver` = "#18A558",
`No known driver` = "grey50"),guide = F) +
coord_flip(ylim = c(NA,NA)) +
scale_y_continuous(breaks = c(-1,0,0.5,seq(1,10)),
labels = c("<-1","0","0.5",seq(1,10))) +
theme(legend.position = "top",
legend.key.size = unit(0,"cm"),legend.box.spacing = unit(0.1,"cm")) +
guides(shape = guide_legend(nrow = 2)) +
ggsave(filename = sprintf("figures/%s/dynamic_coefficients/bnpr_historical_growth.pdf",model_id),
useDingbats = F,
width = 1.9,height = 1.2)
## Warning: Removed 27 rows containing non-finite values (stat_boxplot).
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing non-finite values (stat_boxplot).
## Warning: Removed 27 rows containing missing values (geom_point).
all_bnpr_coefficients %>%
mutate(late_expected = late / expected) %>%
mutate(late_expected = ifelse(late_expected < -1,-1,late_expected)) %>%
subset(source != "Simulated") %>%
ggplot(aes(x = plotting_id,y = late_expected)) +
geom_point(
data = growth_per_year_gene_plot_data,inherit.aes = F,
mapping = aes(x = "Obs./plaus.\ngrowth ratio",
y = average_observed_growth/average_historical_growth,
group = reorder(gene,average_observed_growth/average_historical_growth),
colour = str_match(gene,"[A-Z0-9]+"),
shape = "Fabre et al. 2021"),
size = 0.3,
position = position_dodge(width = 0.75)) +
geom_point(size = 0.25,
position = position_jitter(width = 0.2,height = 0),
alpha = 0.9,
colour = "grey") +
geom_boxplot(size = 0.25,fill = NA,outlier.color = NA) +
geom_hline(yintercept = 1.0,size = 0.25) +
theme_gerstung(base_size = 6) +
scale_shape_manual(values = c("circle","triangle","cross"),guide = F) +
scale_x_discrete(breaks = x_order[1:2],limits = x_order[1:2]) +
xlab("") +
ylab("Growth ratio") +
scale_colour_manual(values = c(gene_colours,
`Other driver` = "#18A558",
`No known driver` = "grey50"),guide = F) +
scale_y_continuous(breaks = c(-1,0,0.5,1,2),
labels = c("<-1","0","0.5","1","2")) +
theme(legend.position = "none",
legend.key.size = unit(0,"cm"),legend.box.spacing = unit(0.1,"cm")) +
guides(shape = guide_legend(nrow = 2)) +
coord_flip() +
ggsave(filename = sprintf("figures/%s/dynamic_coefficients/bnpr_historical_growth_simplified.pdf",model_id),
useDingbats = F,
width = 1.9,height = 0.8)
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## Warning: Removed 3 rows containing missing values (geom_point).
mitchell_vaf <- read.csv("data_output/mitchell_bnpr_coefficients.csv") %>%
subset(w_sum < 10) %>%
group_by(id,clade) %>%
select(id,clade,
observed_vaf=vaf_from_clade,expected_vaf) %>%
mutate(plotting_id = "Real data") %>%
mutate(source = "Mitchell et al. 2021") %>%
mutate(colour = ifelse(grepl("DNMT3A",clade),"DNMT3A",
ifelse(grepl("Clade",clade),"No known driver","Other driver")))
tree_vaf <- read.csv("data_output/tree_bnpr_coefficients.csv") %>%
subset(source == "Colonies") %>%
select(id = individual,clade = plot_label,
observed_vaf,expected_vaf) %>%
mutate(plotting_id = "Real data") %>%
mutate(source = "Fabre et al. 2021") %>%
mutate(colour = str_match(clade,"[A-Z0-9]+")) %>%
mutate(colour = ifelse(grepl("UD",colour),"No known driver",colour)) %>%
mutate(id = factor(id))
rbind(as.data.frame(mitchell_vaf),as.data.frame(tree_vaf)) %>%
mutate(expected_vaf = expected_vaf) %>%
mutate(expected_vaf = ifelse(expected_vaf > 1,1,expected_vaf)) %>%
ggplot(aes(x = observed_vaf,y = expected_vaf)) +
geom_point(size = 0.5) +
geom_abline(slope = 1,size = 0.25) +
theme_gerstung(base_size = 6) +
scale_y_continuous(breaks = c(0,0.5,1.0)) +
scale_x_continuous(breaks = c(0,0.5,1.0)) +
xlab("Observed clone freq.") +
ylab("Extrapolated\nclone frequency")
rbind(as.data.frame(mitchell_vaf),as.data.frame(tree_vaf)) %>%
mutate(expected_vaf = expected_vaf) %>%
mutate(expected_vaf = ifelse(expected_vaf > 1,1,expected_vaf)) %>%
ggplot(aes(y = expected_vaf/observed_vaf,x = "")) +
geom_hline(yintercept = 1,size = 0.25) +
geom_point(position = position_jitter(height = 0.2),size = 0.25,colour = "grey") +
geom_boxplot(outlier.color = NA,alpha = 0.5,size = 0.25) +
theme_gerstung(base_size = 6) +
theme(axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank()) +
ylab("Expected/observed\nclone size fold change") +
scale_y_continuous(trans = 'log10',
breaks = c(0.1,1e-2,1,10,100),labels = c("0.1","0.01","1","10","100")) +
coord_flip() +
ggsave(filename = sprintf("figures/%s/dynamic_coefficients/expected_vs_observed_vaf.pdf",model_id),
useDingbats = F,
width = 1.1,height = 0.75)
unique_genes <- historical_estimates %>%
group_by(gene) %>%
filter(length(unique(paste(individual,site))) > 10) %>%
select(gene) %>%
distinct %>%
unlist
historical_estimates <- historical_estimates %>%
mutate(
ratio_q05 = observed_growth_q05/historical_growth_q05,
ratio_q50 = observed_growth_q50/historical_growth_q50,
ratio_q95 = observed_growth_q95/historical_growth_q95,
ratio_mean = observed_growth_mean/historical_growth_mean
)
historical_estimates %>%
group_by(gene) %>%
mutate(M = unlist(quantile(ratio_q50,c(0.5),na.rm = T))) %>%
mutate(ratio_q50 = ifelse(ratio_q50 < -0.5,-0.5,ratio_q50)) %>%
ggplot(aes(x = reorder(gene,M),y = ratio_q50)) +
geom_hline(yintercept = 1,linetype = 2,size = 0.25) +
geom_point(size = 0.5,position = position_jitter(width = 0.1)) +
geom_boxplot(outlier.size = 0,alpha = 0.7,size = 0.25) +
coord_flip() +
theme_gerstung(base_size = 6) +
scale_y_continuous(breaks = c(-0.5,0,1),labels = c("<-0.5","0","1")) +
theme(axis.text.y = element_text(face = "italic")) +
ylab("Ratio of historical and observed annual fold change") +
xlab("Gene")
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Warning: Removed 12 rows containing missing values (geom_point).
unique_site_multiple <- values_model$sub_data %>%
select(amino_acid_change,site_numeric_multiple) %>%
subset(!is.na(site_numeric_multiple)) %>%
distinct %>%
arrange(site_numeric_multiple) %>%
select(amino_acid_change) %>%
unlist %>%
as.character() %>% as.factor
tmp <- historical_estimates %>%
mutate(truncating = grepl("[A-Z0-9]+t-",site)) %>%
mutate(recurring = site %in% unique_site_multiple) %>%
mutate(f = paste(
ifelse(recurring,"Recurring","Non-recurring"),
ifelse(truncating,"truncating","non-truncating"),
sep = ' and\n'
))
anvar_test <- anova(glm(ratio_q50 ~ f,data = tmp),
test = "F")
tmp %>%
mutate(ratio_q50 = ifelse(ratio_q50 < -1,-1,ratio_q50)) %>%
ggplot(aes(x = f,y = ratio_q50)) +
geom_boxplot(outlier.size = 0.5,size = 0.25) +
ylab("Historical and observed annual fold change") +
xlab("") +
theme_gerstung(base_size = 6) +
coord_flip(ylim = c(-1,NA)) +
scale_y_continuous(breaks = c(-1,0,0.5,1),labels = c("<-1",0,0.5,1)) +
ggsave(useDingbats=FALSE,sprintf("figures/%s/other_associations/uc_tnt_rec.pdf",model_id),
height = 1.2,width = 2.0)
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
anvar_test
tmp_dnmt3a <- tmp %>%
subset(gene == "DNMT3A") %>%
mutate(f = ifelse(grepl("Recurring",f),"Hot-spot","Normal mutation"))
wilcox.test(ratio_q50 ~ f, data = tmp_dnmt3a)
##
## Wilcoxon rank sum test with continuity correction
##
## data: ratio_q50 by f
## W = 3564, p-value = 0.09323
## alternative hypothesis: true location shift is not equal to 0
tmp_dnmt3a %>%
ggplot(aes(x = f,y = ratio_q50)) +
geom_boxplot(outlier.size = 0.5,size = 0.25) +
ylab("Observed vs. historical annual fold change") +
xlab("") +
theme_gerstung(base_size = 6) +
coord_flip(ylim = c(-1,NA)) +
scale_y_continuous() +
ggsave(useDingbats=FALSE,sprintf("figures/%s/other_associations/uc_tnt_rec_dnmt3a.pdf",model_id),
height = 1.2,width = 2.5)
## Warning: Removed 7 rows containing non-finite values (stat_boxplot).
## Warning: Removed 7 rows containing non-finite values (stat_boxplot).
We tested whether age, gender and smoking could explain a greater amount of variance than age alone and found no evidence for this.
smoking_gender_data <- load_smoking_data() %>%
group_by(SardID) %>%
summarise(HasSmoked = any(PackYears>0)) %>%
merge(distinct(select(full_data,SardID,Gender))) %>%
select(individual = SardID,HasSmoked,Gender)
historical_estimates_gender_smoking <- merge(
historical_estimates,smoking_gender_data,
by = c("individual"))
tmp <- historical_estimates_gender_smoking %>%
mutate(min_age = minAge) %>%
filter(!is.na(ratio_q50) & !is.na(HasSmoked) & !is.na(Gender) & !is.na(min_age))
all_models <- list(
`All genes` = list(
m = glm(ratio_q50 ~ HasSmoked + Gender + min_age,data = tmp),
null_m = glm(ratio_q50 ~ min_age,data = tmp)))
for (g in unique_genes) {
x <- tmp %>%
subset(gene == g)
all_models[[g]] <- list(
m = glm(ratio_q50 ~ HasSmoked + Gender + min_age,data = x),
null_m = glm(ratio_q50 ~ min_age,data = x))
}
significance_df <- all_models %>%
lapply(
function(x) anova(x$null_m,x$m,test="F")[2,c(5,6)]
) %>%
do.call(what = rbind)
significance_df$gene <- names(all_models)
significance_df$SignificantAfterCorrection <- p.adjust(significance_df$`Pr(>F)`,method = "BH") < 0.05
significance_df
tmp %>%
mutate(
L = paste(ifelse(Gender == "M","Male","Female"),
ifelse(HasSmoked == T,"\nhas smoked","\nnever smoked"),
sep = '')
) %>%
mutate(ratio_q50 = ifelse(ratio_q50 < -1,-1,ratio_q50)) %>%
ggplot(aes(x = L,y = ratio_q50, colour = Gender)) +
geom_boxplot(size = 0.2,outlier.size = 0.5) +
theme_gerstung(base_size = 6) +
coord_flip(ylim = c(-1,NA)) +
xlab("") +
ylab("Ratio of historical and observed annual fold change") +
scale_y_continuous() +
scale_colour_aaas(guide = F)
all_samples <- sampling_idxs %>%
group_by(site) %>%
mutate(re = length(unique(individual)) > 1) %>%
subset(re == T) %>%
apply(1,function(x) {
individual <- x[1]
site_numeric <- as.numeric(x[3])
gene_numeric <- as.numeric(x[4])
clone_numeric <- as.numeric(x[5])
minAge <- as.numeric(x[6])
if (is.na(site_numeric)) {
b_site <- 0} else {
b_site <- site_samples[,site_numeric]
}
b_gene <- gene_samples[,gene_numeric]
b_clone <- clone_samples[,clone_numeric]
b <- b_gene + b_clone + b_site
u_values <- offset_samples[,clone_numeric]
ages_at_onset <- t0_adjusted(u_values,b,g=2,n=pop_size) + values_model$min_age
ages_at_onset_corrected <- ifelse(ages_at_onset < -1,-1,ages_at_onset)
ages_at_onset_corrected <- ifelse(ages_at_onset > minAge,minAge,ages_at_onset)
biologically_feasible <- ages_at_onset > -1 & ages_at_onset < minAge
historical_b <- b[biologically_feasible] %>%
unlist
return(tibble(
historical_samples = historical_b,
aachange = gsub("-","",str_match(x[2],"-.+")),
gene = str_match(x[2],"[A-Z0-9]+")
))
}) %>%
do.call(what = rbind) %>%
group_by(gene,aachange) %>%
summarise(mean = mean(historical_samples,na.rm = T),
s = sd(historical_samples,na.rm = T)) %>%
mutate(site = sprintf("%s-%s",gene,aachange))
dnds <- read.csv("data/dnds_aml_mds.csv") %>%
subset(qval <= 1e-1) %>%
mutate(site = paste(gene,gsub("\\*","X",aachange),sep = "-")) %>%
select(-gene)
dd <- all_samples %>%
merge(dnds,by = "site",all = F) %>%
as.data.frame
dd %>%
#subset(gene == "DNMT3A") %>%
ggplot(aes(y = dnds,x = exp(mean)-1,colour = gene,shape = condition)) +
geom_point(size = 1) +
geom_errorbarh(aes(xmin = exp(mean-s*1.96)-1,xmax = exp(mean+s*1.96)-1),size = 0.1,height = 0) +
geom_linerange(aes(ymin = CI_low,ymax = CI_high),size = 0.1) +
geom_smooth(method = "lm",colour = "black",alpha = 0.2,size = 0.5) +
scale_colour_manual(values = gene_colours,guide = F) +
xlab("Annual driver growth rate in CH") +
ylab("dN/dS") +
scale_x_continuous(labels = function(x) sprintf("%s%%",x*100)) +
scale_y_continuous(trans = 'log10') +
theme_gerstung(base_size = 6) +
#coord_cartesian(xlim = c(NA,0.5)) +
ggsave(useDingbats=FALSE,
filename = sprintf("figures/%s/dynamic_coefficients/dnds_association_hist.pdf",model_id),
width = 3.5,height = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
lm(formula = dnds ~ mean,data = dd,subset = dd$condition == "AML") %>%
summary
##
## Call:
## lm(formula = dnds ~ mean, data = dd, subset = dd$condition ==
## "AML")
##
## Residuals:
## Min 1Q Median 3Q Max
## -2432.6 -1369.8 -547.4 727.0 4769.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 673.1 1016.0 0.663 0.518
## mean 7921.2 4837.2 1.638 0.124
##
## Residual standard error: 1997 on 14 degrees of freedom
## Multiple R-squared: 0.1608, Adjusted R-squared: 0.1008
## F-statistic: 2.682 on 1 and 14 DF, p-value: 0.1238
lm(formula = dnds ~ mean,data = dd,subset = dd$condition == "MDS") %>%
summary
##
## Call:
## lm(formula = dnds ~ mean, data = dd, subset = dd$condition ==
## "MDS")
##
## Residuals:
## Min 1Q Median 3Q Max
## -2160.9 -1060.5 -373.4 893.7 2740.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 993.0 641.7 1.548 0.139
## mean 4836.1 3295.0 1.468 0.159
##
## Residual standard error: 1436 on 18 degrees of freedom
## Multiple R-squared: 0.1069, Adjusted R-squared: 0.05727
## F-statistic: 2.154 on 1 and 18 DF, p-value: 0.1594
lm(formula = dnds ~ mean,data = dd,subset = dd$condition == "AML" & dd$gene == "DNMT3A") %>%
summary
##
## Call:
## lm(formula = dnds ~ mean, data = dd, subset = dd$condition ==
## "AML" & dd$gene == "DNMT3A")
##
## Residuals:
## 2 4 6 8
## -878.4 -906.3 854.8 929.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2330 8826 -0.264 0.817
## mean 32159 80871 0.398 0.729
##
## Residual standard error: 1263 on 2 degrees of freedom
## Multiple R-squared: 0.07327, Adjusted R-squared: -0.3901
## F-statistic: 0.1581 on 1 and 2 DF, p-value: 0.7293
lm(formula = dnds ~ mean,data = dd,subset = dd$condition == "MDS" & dd$gene == "DNMT3A") %>%
summary
##
## Call:
## lm(formula = dnds ~ mean, data = dd, subset = dd$condition ==
## "MDS" & dd$gene == "DNMT3A")
##
## Residuals:
## 3 5 7
## -126.35 24.82 101.52
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -117.1 1258.2 -0.093 0.941
## mean 4547.4 11347.5 0.401 0.757
##
## Residual standard error: 164 on 1 degrees of freedom
## Multiple R-squared: 0.1384, Adjusted R-squared: -0.7233
## F-statistic: 0.1606 on 1 and 1 DF, p-value: 0.7574
lm(formula = dnds ~ mean + condition,data = dd) %>%
summary
##
## Call:
## lm(formula = dnds ~ mean + condition, data = dd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2350.1 -1219.4 -621.4 789.7 4659.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 970.7 664.5 1.461 0.1536
## mean 6294.8 2808.4 2.241 0.0318 *
## conditionMDS -223.6 567.0 -0.394 0.6959
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1686 on 33 degrees of freedom
## Multiple R-squared: 0.1391, Adjusted R-squared: 0.08692
## F-statistic: 2.666 on 2 and 33 DF, p-value: 0.08448
all_samples <- sampling_idxs %>%
apply(1,function(x) {
individual <- x[1]
site_numeric <- as.numeric(x[3])
gene_numeric <- as.numeric(x[4])
clone_numeric <- as.numeric(x[5])
minAge <- as.numeric(x[6])
if (is.na(site_numeric)) {
b_site <- 0} else {
b_site <- site_samples[,site_numeric]
}
b_gene <- gene_samples[,gene_numeric]
b_clone <- clone_samples[,clone_numeric]
b <- b_gene + b_clone + b_site
u_values <- offset_samples[,clone_numeric]
ages_at_onset <- t0_adjusted(u_values,b,g=2,n=pop_size) + values_model$min_age
ages_at_onset_corrected <- ifelse(ages_at_onset < -1,-1,ages_at_onset)
ages_at_onset_corrected <- ifelse(ages_at_onset > minAge,minAge,ages_at_onset)
biologically_feasible <- ages_at_onset > -1 & ages_at_onset < minAge
historical_b <- b[biologically_feasible] %>%
unlist
return(tibble(
historical_samples = historical_b,
gene = str_match(x[2],"[A-Z0-9]+")
))
}) %>%
do.call(what = rbind) %>%
group_by(gene) %>%
summarise(mean = mean(historical_samples,na.rm = T),
s = sd(historical_samples,na.rm = T))
dd <- all_samples %>%
merge(read.csv("data/aml-risk-coef.csv"),by = "gene",all = F) %>%
as.data.frame
dd %>%
ggplot(aes(y = coef,x = exp(mean)-1,colour = gene)) +
geom_point(size = 1) +
geom_errorbarh(aes(xmin = exp(mean-s*1.96)-1,xmax = exp(mean+s*1.96)-1),size = 0.1,height = 0) +
geom_linerange(aes(ymin = coef - sd*1.96,ymax = coef + sd*1.96),size = 0.1) +
geom_smooth(method = "lm",colour = "black",alpha = 0.2,size = 0.5) +
scale_colour_manual(values = gene_colours,guide = F) +
xlab("Annual driver growth rate in CH") +
ylab("AML risk") +
scale_x_continuous(labels = function(x) sprintf("%s%%",x*100)) +
theme_gerstung(base_size = 6) +
coord_cartesian(xlim = c(NA,0.5)) +
ggsave(useDingbats=FALSE,
filename = sprintf("figures/%s/dynamic_coefficients/aml_risk_hist.pdf",model_id),
width = 3.5,height = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
cor.test(dd$mean,dd$coef)
##
## Pearson's product-moment correlation
##
## data: dd$mean and dd$coef
## t = 3.8329, df = 11, p-value = 0.00278
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3517854 0.9227259
## sample estimates:
## cor
## 0.7561979
cor.test(dd$mean,dd$coef,method = "kendall")
##
## Kendall's rank correlation tau
##
## data: dd$mean and dd$coef
## T = 60, p-value = 0.01012
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.5384615
tmp <- statistics_data %>%
mutate(truncating = grepl("[A-Z0-9]+t-",site)) %>%
mutate(recurring = site %in% unique_site_multiple) %>%
mutate(f = paste(
ifelse(recurring,"Recurring","Non-recurring"),
ifelse(truncating,"truncating","non-truncating"),
sep = ' and\n'
))
anvar_test <- anova(glm(sum_stat ~ f,data = tmp,family = stats::binomial()),
test = "F")
## Warning in anova.glm(glm(sum_stat ~ f, data = tmp, family =
## stats::binomial()), : using F test with a 'binomial' family is inappropriate
tmp %>%
group_by(f) %>%
summarise(Proportion = sum(sum_stat)/length(sum_stat)) %>%
ggplot(aes(x = f,y = Proportion)) +
geom_bar(outlier.size = 0.5,size = 0.25,stat = "identity") +
ylab("Proportion of fits with no outliers") +
xlab("") +
theme_gerstung(base_size = 6) +
coord_flip() +
scale_y_continuous(expand = c(0,0)) +
ggsave(useDingbats=FALSE,sprintf("figures/%s/other_associations/uc_tnt_rec.pdf",model_id),
height = 1.2,width = 2.0)
## Warning: Ignoring unknown parameters: outlier.size
anvar_test
good_stat_smoking_gender <- merge(
statistics_data,smoking_gender_data,
by = c("individual"))
tmp <- good_stat_smoking_gender %>%
filter(!is.na(sum_stat) & !is.na(HasSmoked) & !is.na(Gender))
all_models <- list(
`All genes` = list(
m = glm(sum_stat ~ HasSmoked + Gender,data = tmp,
family = stats::binomial()),
null_m = glm(sum_stat ~ 1,data = tmp,
family = stats::binomial())))
for (g in unique_genes) {
x <- tmp %>%
subset(gene == g)
all_models[[g]] <- list(
m = glm(sum_stat ~ HasSmoked + Gender,data = x,
family = stats::binomial()),
null_m = glm(sum_stat ~ 1,data = x,
family = stats::binomial()))
}
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
significance_df <- all_models %>%
lapply(
function(x) anova(x$null_m,x$m,test="F")[2,c(5,6)]
) %>%
do.call(what = rbind)
## Warning: using F test with a 'binomial' family is inappropriate
## Warning: using F test with a 'binomial' family is inappropriate
## Warning: using F test with a 'binomial' family is inappropriate
## Warning: using F test with a 'binomial' family is inappropriate
## Warning: using F test with a 'binomial' family is inappropriate
## Warning: using F test with a 'binomial' family is inappropriate
## Warning: using F test with a 'binomial' family is inappropriate
## Warning: using F test with a 'binomial' family is inappropriate
## Warning: using F test with a 'binomial' family is inappropriate
## Warning: using F test with a 'binomial' family is inappropriate
significance_df$gene <- names(all_models)
significance_df$SignificantAfterCorrection <- p.adjust(significance_df$`Pr(>F)`,method = "BH") < 0.05
significance_df
tmp %>%
mutate(
L = paste(ifelse(Gender == "M","Male","Female"),
ifelse(HasSmoked == T,"\nhas smoked","\nnever smoked"),
sep = '')
) %>%
group_by(L,Gender) %>%
summarise(proportion = sum(sum_stat) / length(sum_stat)) %>%
ggplot(aes(x = L,y = proportion, fill = Gender)) +
geom_bar(stat = "identity") +
theme_gerstung(base_size = 6) +
coord_flip() +
xlab("") +
ylab("Proportion of fits with no outliers") +
scale_y_continuous(expand = c(0,0)) +
scale_fill_aaas(guide = F)
cat(system("git log -n1", intern=TRUE), sep="\n")
## commit 5bcb05932603c8c4a9756c112d35c89f3c41c92b
## Author: josegcpa <josegcpa@ebi.ac.uk>
## Date: Wed Aug 11 19:08:29 2021 +0100
##
## removed redundant parts of the analysis (regarding age at onset)
l <- ls()
data.frame(variable=l, Reduce("rbind",lapply(l, function(x) data.frame(classes=paste(class(get(x)),collapse = ','), size=format(object.size(get(x)), units="auto")))))
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server 7.4 (Maipo)
##
## Matrix products: default
## BLAS: /hps/nobackup/research/gerstung/josegcpa/software/R/3.6.3/lib64/R/lib/libRblas.so
## LAPACK: /hps/nobackup/research/gerstung/josegcpa/software/R/3.6.3/lib64/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] survminer_0.4.8 survival_3.2-7 reghelper_1.0.1 scatterpie_0.1.5
## [5] ggtree_2.0.4 ape_5.4-1 dendextend_1.14.0 default_1.0.0
## [9] extraDistr_1.9.1 ggrepel_0.8.2 ggsci_2.9 cowplot_1.1.0
## [13] gtools_3.8.2 openxlsx_4.2.2 bayesplot_1.7.2 forcats_0.5.0
## [17] stringr_1.4.0 dplyr_1.0.2 purrr_0.3.4 readr_1.4.0
## [21] tidyr_1.1.2 tibble_3.0.4 tidyverse_1.3.0 ggpubr_0.4.0
## [25] ggplot2_3.3.2 greta_0.3.1 reticulate_1.16
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 ggsignif_0.6.0 ellipsis_0.3.1
## [4] rio_0.5.16 ggridges_0.5.2 base64enc_0.1-3
## [7] fs_1.5.0 dichromat_2.0-0 rstudioapi_0.11
## [10] farver_2.0.3 listenv_0.8.0 fansi_0.4.1
## [13] lubridate_1.7.9 xml2_1.3.2 splines_3.6.3
## [16] codetools_0.2-16 knitr_1.30 polyclip_1.10-0
## [19] jsonlite_1.7.1 km.ci_0.5-2 broom_0.7.1
## [22] dbplyr_1.4.4 tfruns_1.4 ggforce_0.3.2
## [25] mapproj_1.2.7 BiocManager_1.30.10 compiler_3.6.3
## [28] httr_1.4.2 rvcheck_0.1.8 backports_1.1.10
## [31] assertthat_0.2.1 Matrix_1.2-18 lazyeval_0.2.2
## [34] cli_2.1.0 tweenr_1.0.1 htmltools_0.5.0
## [37] prettyunits_1.1.1 tools_3.6.3 coda_0.19-4
## [40] gtable_0.3.0 glue_1.4.2 maps_3.3.0
## [43] Rcpp_1.0.5 carData_3.0-4 cellranger_1.1.0
## [46] vctrs_0.3.4 nlme_3.1-144 xfun_0.18
## [49] globals_0.13.0 ps_1.4.0 rvest_0.3.6
## [52] lifecycle_0.2.0 rstatix_0.6.0 future_1.19.1
## [55] zoo_1.8-8 MASS_7.3-51.5 scales_1.1.1
## [58] hms_0.5.3 parallel_3.6.3 yaml_2.2.1
## [61] curl_4.3 gridExtra_2.3 KMsurv_0.1-5
## [64] stringi_1.5.3 tensorflow_2.2.0 tidytree_0.3.3
## [67] zip_2.1.1 pals_1.6 rlang_0.4.8
## [70] pkgconfig_2.0.3 evaluate_0.14 lattice_0.20-38
## [73] labeling_0.4.2 treeio_1.10.0 tidyselect_1.1.0
## [76] plyr_1.8.6 magrittr_1.5 R6_2.4.1
## [79] generics_0.0.2 DBI_1.1.0 mgcv_1.8-31
## [82] pillar_1.4.6 haven_2.3.1 whisker_0.4
## [85] foreign_0.8-75 withr_2.3.0 abind_1.4-5
## [88] modelr_0.1.8 crayon_1.3.4 car_3.0-10
## [91] survMisc_0.5.5 rmarkdown_2.4 viridis_0.5.1
## [94] progress_1.2.2 readxl_1.3.1 data.table_1.13.0
## [97] blob_1.2.1 reprex_0.3.0 digest_0.6.26
## [100] xtable_1.8-4 munsell_0.5.0 viridisLite_0.3.0