1 Historical growth effect

We here define historical growth effect as the growth effect calculated only from posterior samples which yield age at onset estimates which are larger than 0.

pop_size <- 0.5e5
source("Scripts/vaf_dynamics_functions.R")

## 
## Attaching package: 'greta'

## The following objects are masked from 'package:stats':
## 
##     binomial, cov2cor, poisson

## The following objects are masked from 'package:base':
## 
##     %*%, apply, backsolve, beta, chol2inv, colMeans, colSums, diag,
##     eigen, forwardsolve, gamma, identity, rowMeans, rowSums, sweep,
##     tapply

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✔ tibble  3.0.4     ✔ dplyr   1.0.2
## ✔ tidyr   1.1.2     ✔ stringr 1.4.0
## ✔ readr   1.4.0     ✔ forcats 0.5.0
## ✔ purrr   0.3.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::slice()  masks greta::slice()

## This is bayesplot version 1.7.2

## - Online documentation and vignettes at mc-stan.org/bayesplot

## - bayesplot theme set to bayesplot::theme_default()

##    * Does _not_ affect other ggplot2 plots

##    * See ?bayesplot_theme_set for details on theme setting

## 
## Attaching package: 'cowplot'

## The following object is masked from 'package:ggpubr':
## 
##     get_legend

## 
## Attaching package: 'extraDistr'

## The following objects are masked from 'package:gtools':
## 
##     ddirichlet, rdirichlet

## The following object is masked from 'package:purrr':
## 
##     rdunif

## 
## ---------------------
## Welcome to dendextend version 1.14.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:ggpubr':
## 
##     rotate

## The following object is masked from 'package:stats':
## 
##     cutree

## 
## Attaching package: 'ape'

## The following objects are masked from 'package:dendextend':
## 
##     ladderize, rotate

## The following object is masked from 'package:ggpubr':
## 
##     rotate

## Registered S3 method overwritten by 'treeio':
##   method     from
##   root.phylo ape

## ggtree v2.0.4  For help: https://yulab-smu.github.io/treedata-book/
## 
## If you use ggtree in published research, please cite the most appropriate paper(s):
## 
## [36m-[39m Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution 2018, 35(12):3041-3043. doi: 10.1093/molbev/msy194
## [36m-[39m Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36, doi:10.1111/2041-210X.12628

## 
## Attaching package: 'ggtree'

## The following object is masked from 'package:ape':
## 
##     rotate

## The following object is masked from 'package:dendextend':
## 
##     rotate

## The following object is masked from 'package:tidyr':
## 
##     expand

## The following object is masked from 'package:ggpubr':
## 
##     rotate

## 
## Attaching package: 'reghelper'

## The following object is masked from 'package:greta':
## 
##     beta

## The following object is masked from 'package:base':
## 
##     beta

source("Scripts/prepare_data.R")
model_file_name <- "models/model_ch.RDS"
dir.create("figures/model_ch/other_associations")

## Warning in dir.create("figures/model_ch/other_associations"): 'figures/model_ch/
## other_associations' already exists

set.seed(42)
model_id <- "model_ch"
load(file = sprintf('data_output/vaf_modelling_coefficients_%s.Rdata',model_id))
values_model <- readRDS(file = model_file_name)

site_samples <- values_model$draws$`11`[,grep('site',colnames(values_model$draws$`11`))]
gene_samples <- values_model$draws$`11`[,grep('gene',colnames(values_model$draws$`11`))]
clone_samples <- values_model$draws$`11`[,grep('clone',colnames(values_model$draws$`11`))]
offset_samples <- values_model$draws$`11`[,grep('u',colnames(values_model$draws$`11`))]
beta_samples <- values_model$draws$`11`[,grep('beta',colnames(values_model$draws$`11`))] %>%
  unlist()

sampling_idxs <- r_values_full %>%
  subset(b_site_095 + b_gene_095 + b_clone_095 > 0) %>%
  group_by(
    individual,site,
    site_numeric,
    gene_numeric,
    clone_numeric
  ) %>% 
  summarise(
    minAge = min(age[which(round(true/coverage,4) >= 0.005)]),
    maxAge = max(age),
    maxVAF = max(true/coverage)
  ) %>%
  arrange(site)

c_names <- colnames(values_model$draws$`11`)

good_sites_individual <- statistics_data %>%
  ungroup %>% 
  select(site,individual,sum_stat) %>%
  filter(sum_stat) %>%
  select(-sum_stat)

historical_estimates <- sampling_idxs %>%
  apply(1,function(x) {
    individual <- x[1]
    site_numeric <- as.numeric(x[3])
    gene_numeric <- as.numeric(x[4])
    clone_numeric <- as.numeric(x[5])
    minAge <- as.numeric(x[6])
    
    if (is.na(site_numeric)) {
      b_site <- 0} else {
        b_site <- site_samples[,site_numeric]
      }
    b_gene <- gene_samples[,gene_numeric]
    b_clone <- clone_samples[,clone_numeric]
    b <- b_gene + b_clone + b_site
    u_values <- offset_samples[,clone_numeric]
    ages_at_onset <- t0_adjusted(u_values,b,g=2,n=pop_size) + values_model$min_age
    ages_at_onset_corrected <- ifelse(ages_at_onset < -1,-1,ages_at_onset)
    ages_at_onset_corrected <- ifelse(ages_at_onset > minAge,minAge,ages_at_onset)
    biologically_feasible <- ages_at_onset > -1 & ages_at_onset < minAge
    historical_b <- b[biologically_feasible]
    Q <- quantile(historical_b,c(0.05,0.5,0.95),na.rm = T)
    Q_observed <- quantile(b,c(0.05,0.5,0.95),na.rm = T)
    names(Q) <- c("Q05","Q50","Q95")
    return(data.frame(
                      ages_Q50 = quantile(ages_at_onset_corrected,0.5,na.rm = T)[1],
                      observed_growth_q05=Q_observed[1],
                      observed_growth_q50=Q_observed[2],
                      observed_growth_q95=Q_observed[3],
                      observed_growth_mean=mean(b,na.rm=T),
                      historical_growth_q05=Q[1],
                      historical_growth_q50=Q[2],
                      historical_growth_q95=Q[3],
                      historical_growth_mean=mean(historical_b,na.rm=T)))
  }) %>%
  do.call(what=rbind) 
historical_estimates <- cbind(
  as.data.frame(sampling_idxs),as.data.frame(historical_estimates))
historical_estimates$gene <- str_match(historical_estimates$site,"[A-Z0-9]+")
historical_estimates <- historical_estimates %>%
  filter(
    paste(individual,as.character(site)) %in% paste(
      good_sites_individual$individual,good_sites_individual$site))
historical_estimates %>%
  select(SardID = individual,
         Site = site,
         Gene = gene,
         ObservedGrowthQ05 = observed_growth_q05,
         ObservedGrowthQ50 = observed_growth_q50,
         ObservedGrowthQ95 = observed_growth_q95,
         ObservedGrowthMean = observed_growth_mean,
         HistoricalGrowthQ05 = historical_growth_q05,
         HistoricalGrowthQ50 = historical_growth_q50,
         HistoricalGrowthQ95 = historical_growth_q95,
         HistoricalGrowthMean = historical_growth_mean) %>% 
  write.csv("data_output/historical_growth.csv")

idxs <- sampling_idxs %>% 
  subset(individual == 4088 & site == "DNMT3At-E578X")

tmp_data <- full_data %>%
  subset(SardID == idxs$individual & amino_acid_change == idxs$site)

#idxs <- sampling_idxs[1,]

S <- data.frame(
  u = offset_samples[,idxs$clone_numeric],
  b_gene = gene_samples[,idxs$gene_numeric],
  b_site = site_samples[,idxs$site_numeric],
  b_clone = clone_samples[,idxs$clone_numeric],
  idx = seq(1,nrow(clone_samples)),
  maxAge = idxs$maxAge) %>%
  apply(1,function(x) {
    u <- as.numeric(x[1])
    b_gene <- as.numeric(x[2])
    b_site <- as.numeric(x[3])
    b_site <- ifelse(is.na(b_site),0,b_site)
    b_clone <- as.numeric(x[4])
    max_age <- as.numeric(x[6]) + 25
    b <- b_gene+b_site+b_clone
    traj <- suppressWarnings(trajectory_from_parameters_2(
      b = b,u = u,g = 2,N = pop_size,x = seq(-20,max_age,by = 0.5) - values_model$min_age))
    age_at_onset <- t0_adjusted(u,b,g = 2,n = pop_size)
    return(data.frame(
      age = traj$x + values_model$min_age,
      VAF = traj$y,
      t0 = age_at_onset + values_model$min_age,
      ID = as.numeric(x[5])
    ))
  }) %>%
  do.call(what = rbind) %>%
  mutate(plausible = t0 >= 0) %>%
  mutate(VAF = ifelse(is.na(VAF),0,VAF))

x <- rbind(
  S %>% 
    subset(plausible == T) %>%
    group_by(age,plausible) %>%
    summarise(q05 = quantile(VAF,0.05,na.rm = T),
              q50 = quantile(VAF,0.5,na.rm = T),
              q95 = quantile(VAF,0.95,na.rm = T)) %>%
    as.data.frame,
  S %>% 
    group_by(age) %>%
    summarise(plausible = as.logical(0),
              q05 = quantile(VAF,0.05,na.rm = T),
              q50 = quantile(VAF,0.5,na.rm = T),
              q95 = quantile(VAF,0.95,na.rm = T)) %>%
    as.data.frame) %>%
  subset(q95 > 0)

plot_good_bad <- x %>% 
  group_by(age) %>% 
  subset(q95 > 0) %>% 
  mutate(q05 = ifelse(
    all(c(T,F) %in% plausible),
    ifelse(plausible == F,min(q95),q05),
    q05)) %>% 
  ggplot(aes(x = age,ymin = q05,ymax = q95,fill = plausible)) + 
  geom_ribbon(alpha = 0.3) + 
  geom_line(aes(y = q50,colour = plausible),size = 0.25) +
  geom_vline(xintercept = 0,linetype = 3,
             size = 0.5) +
  theme_gerstung(base_size = 6) + 
  xlab("Age") + 
  ylab("VAF") +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.title.x = element_blank()) + 
  scale_fill_manual(values = colorspace::lighten(c("red4","goldenrod"),amount = 0.2),
                    guide = F) + 
  scale_colour_manual(values = colorspace::darken(c("red4","goldenrod"),amount = 0.2),
                      guide = F) + 
  scale_y_continuous(trans = 'log10') + 
  scale_x_continuous(breaks = 0)

plot_plausible <- x %>% 
  ggplot(aes(x = age,y = q50,group = plausible,colour = plausible)) + 
  geom_line(size = 0.25) + 
  geom_point(data = tmp_data,aes(x = Age,y = VAF),inherit.aes = F,size = 0.25) +
  geom_vline(xintercept = 0,linetype = 3,
             size = 0.5) +
  theme_gerstung(base_size = 6) + 
  xlab("Age") + 
  ylab("VAF") +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.title.x = element_blank()) +
  scale_colour_manual(values = c("red4","goldenrod"),guide = F) + 
  scale_y_continuous(trans = 'log10') + 
  scale_x_continuous(breaks = 0,limits = c(min(x$age),max(x$age)))

plot_grid(plot_good_bad,plot_plausible) + 
  ggsave(filename = sprintf("figures/%s/dynamic_coefficients/good_bad_traj.pdf",model_id),
         width = 1.4,height = 0.55,
         useDingbats = F)

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

plot_good_bad + 
    geom_point(data = tmp_data,aes(x = Age,y = VAF),inherit.aes = F,size = 0.25) + 
  ggsave(filename = sprintf("figures/%s/dynamic_coefficients/good_bad_traj_combo.pdf",model_id),
         width = 0.8,height = 0.8,
         useDingbats = F)

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

idxs <- sampling_idxs %>% 
  subset((individual == 4088 & site == "DNMT3At-E578X") | (individual == 3877 & site == "U2AF1-Q157R"))

tmp_data <- full_data %>%
  subset(SardID %in% idxs$individual & amino_acid_change %in% idxs$site)

#idxs <- sampling_idxs[1,]

S <- data.frame(
  u = c(offset_samples[,idxs$clone_numeric]),
  b_gene = c(gene_samples[,idxs$gene_numeric]),
  b_site = c(site_samples[,idxs$site_numeric]),
  b_clone = c(clone_samples[,idxs$clone_numeric]),
  idx = c(replicate(2,seq(1,nrow(clone_samples)))),
  maxAge = c(sapply(idxs$maxAge, function(x) rep(x,nrow(offset_samples)))),
  site = c(sapply(idxs$site, function(x) rep(x,nrow(offset_samples)))),
  id = c(sapply(idxs$individual, function(x) rep(x,nrow(offset_samples))))) %>%
  apply(1,function(x) {
    u <- as.numeric(x[1])
    b_gene <- as.numeric(x[2])
    b_site <- as.numeric(x[3])
    b_site <- ifelse(is.na(b_site),0,b_site)
    b_clone <- as.numeric(x[4])
    max_age <- as.numeric(x[6]) + 10
    b <- b_gene+b_site+b_clone
    traj <- suppressWarnings(trajectory_from_parameters_2(
      b = b,u = u,g = 2,N = pop_size,x = seq(-20,max_age,by = 0.5) - values_model$min_age))
    age_at_onset <- t0_adjusted(u,b,g = 2,n = pop_size)
    return(suppressWarnings(data.frame(
      age = traj$x + values_model$min_age,
      VAF = traj$y,
      t0 = age_at_onset + values_model$min_age,
      ID = as.numeric(x[5]),
      site = x[7],
      individual = x[8]
    )))
  }) %>%
  do.call(what = rbind) %>%
  mutate(plausible = t0 >= 0) %>%
  mutate(VAF = ifelse(is.na(VAF),0,VAF))

x <- S %>% 
  group_by(age,site,individual) %>%
  summarise(plausible = as.logical(0),
            q05 = quantile(VAF,0.05,na.rm = T),
            q50 = quantile(VAF,0.5,na.rm = T),
            q95 = quantile(VAF,0.95,na.rm = T)) 

plot_slow_fast <- x %>% 
  ggplot(aes(x = age,y = q50,colour = site)) + 
  geom_ribbon(aes(ymin = q05,ymax = q95,fill = site),
              colour = NA,alpha = 0.5) +
  geom_line(size = 0.25) + 
  #geom_point(data = tmp_data,aes(x = Age,y = VAF),inherit.aes = F,size = 0.25) +
  geom_vline(xintercept = 0,linetype = 3,
             size = 0.5) +
  theme_gerstung(base_size = 6) + 
  xlab("Age") + 
  ylab("VAF") +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank()) +
  scale_colour_manual(values = c("goldenrod","lightblue"),guide = F) + 
  scale_fill_manual(values = c("goldenrod","lightblue"),guide = F) + 
  scale_y_continuous(trans = 'log10') + 
  scale_x_continuous(breaks = 0,limits = c(min(x$age),max(x$age))) + 
  theme(axis.title.x = element_blank())

plot_slow_fast + 
  ggsave(filename = sprintf("figures/%s/dynamic_coefficients/slow_fast_traj.pdf",model_id),
         width = 0.6,height = 0.5,
         useDingbats = F)

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

idxs <- sampling_idxs %>% 
  subset(grepl("TP53",site))# %>% subset(individual == 4088 & site == "DNMT3At-E578X")
idxs <- idxs[1,]
tmp_data <- full_data %>%
  subset(SardID == idxs$individual & amino_acid_change == idxs$site)

S <- data.frame(
  u = offset_samples[,idxs$clone_numeric],
  b_gene = gene_samples[,idxs$gene_numeric],
  b_site = site_samples[,idxs$site_numeric],
  b_clone = clone_samples[,idxs$clone_numeric],
  idx = seq(1,nrow(clone_samples)),
  maxAge = idxs$maxAge) %>%
  apply(1,function(x) {
    u <- as.numeric(x[1])
    b_gene <- as.numeric(x[2])
    b_site <- as.numeric(x[3])
    b_site <- ifelse(is.na(b_site),0,b_site)
    b_clone <- as.numeric(x[4])
    max_age <- as.numeric(x[6]) + 25
    b <- b_gene+b_site+b_clone
    traj <- suppressWarnings(trajectory_from_parameters_2(
      b = b,u = u,g = 2,N = pop_size,x = seq(-20,max_age,by = 0.5) - values_model$min_age))
    age_at_onset <- t0_adjusted(u,b,g = 2,n = pop_size)
    return(data.frame(
      age = traj$x + values_model$min_age,
      VAF = traj$y,
      t0 = age_at_onset + values_model$min_age,
      ID = as.numeric(x[5])
    ))
  }) %>%
  do.call(what = rbind) %>%
  mutate(plausible = t0 >= 0) %>%
  mutate(VAF = ifelse(is.na(VAF),0,VAF))

x <- rbind(
  S %>% 
    subset(plausible == T) %>%
    group_by(age,plausible) %>%
    summarise(q05 = quantile(VAF,0.05,na.rm = T),
              q50 = quantile(VAF,0.5,na.rm = T),
              q95 = quantile(VAF,0.95,na.rm = T)) %>%
    as.data.frame,
  S %>% 
    group_by(age) %>%
    summarise(plausible = as.logical(0),
              q05 = quantile(VAF,0.05,na.rm = T),
              q50 = quantile(VAF,0.5,na.rm = T),
              q95 = quantile(VAF,0.95,na.rm = T)) %>%
    as.data.frame) %>%
  subset(q95 > 0)

plot_good_bad <- x %>% 
  group_by(age) %>% 
  subset(q95 > 0) %>% 
  mutate(q05 = ifelse(
    all(c(T,F) %in% plausible),
    ifelse(plausible == F,min(q95),q05),
    q05)) %>% 
  ggplot(aes(x = age,ymin = q05,ymax = q95,fill = plausible)) + 
  geom_ribbon() + 
  geom_vline(xintercept = 0,linetype = 3,
             size = 0.5) +
  theme_gerstung(base_size = 6) + 
  xlab("Age") + 
  ylab("VAF") +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank()) + 
  scale_fill_manual(values = colorspace::lighten(c("red4","goldenrod"),amount = 0.01),
                    guide = F) + 
  scale_y_continuous(trans = 'log10') + 
  scale_x_continuous(breaks = 0)

plot_plausible <- x %>% 
  ggplot(aes(x = age,y = q50,group = plausible,colour = plausible)) + 
  geom_line(size = 0.25) + 
  geom_point(data = tmp_data,aes(x = Age,y = VAF),inherit.aes = F,size = 0.25) +
  geom_vline(xintercept = 0,linetype = 3,
             size = 0.5) +
  theme_gerstung(base_size = 6) + 
  xlab("Age") + 
  ylab("VAF") +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank()) +
  scale_colour_manual(values = c("red4","goldenrod"),guide = F) + 
  scale_y_continuous(trans = 'log10') + 
  scale_x_continuous(breaks = 0,limits = c(min(x$age),max(x$age)))

plot_grid(plot_good_bad,plot_plausible) + 
  ggsave(filename = sprintf("figures/%s/dynamic_coefficients/good_bad_traj_2.pdf",model_id),
         width = 1.4,height = 0.55,
         useDingbats = F)

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis

1.1 Comparing historical with observed growth

We can see clearly a bias in terms of genes where the difference between historical and observed growth is concerned - particularly, the phase of clonal dynamics we are able to observe for TP53, DNMT3A and BRCC3 is far too slow to explain the observed VAF.

aging_order <- c(
  "DNMT3A","BRCC3","TP53","GNB1","ASXL1","TET2","CBL",
  "CTCF","PPM1D","JAK2","KRAS","SRSF2-other","SF3B1","IDH1",
  "IDH2","PTPN11","U2AF1","SRSF2-P95H")
min_max <- c(
  exp(min(historical_estimates$observed_growth_mean,historical_estimates$historical_growth_mean,na.rm=T))-1,
  exp(max(historical_estimates$observed_growth_mean,historical_estimates$historical_growth_mean,na.rm=T))-1
)

growth_per_year_plot <- historical_estimates %>% 
  ggplot(aes(x = exp(observed_growth_mean)-1,
             y = exp(historical_growth_mean)-1,
             colour = gene)) + 
  geom_abline(slope = 1,
              size = 0.2,
              linetype=2) +
  geom_point(size = 0.2) +
  scale_y_continuous(limits = min_max,
                     breaks = seq(-0.2,1,by = 0.2),
                     expand = c(0.02,0),
                     labels = sprintf("%s%%",seq(-0.2,1,by = 0.2)*100)) + 
  scale_x_continuous(limits = min_max,
                     breaks = seq(-0.2,1,by = 0.2),
                     expand = c(0.02,0),
                     labels = sprintf("%s%%",seq(-0.2,1,by = 0.2)*100)) + 
  theme_gerstung(base_size = 6) + 
  scale_colour_manual(values = gene_colours,guide = F) +
  xlab("Observed growth per year") + 
  ylab("Historical growth per year")

growth_per_year_gene_plot_data <- historical_estimates %>%
  mutate(gene = ifelse(gene == "SRSF2",
                       ifelse(site == "SRSF2-P95H",
                              "SRSF2-P95H",
                              "SRSF2-other"),
                       gene)) %>%
  group_by(gene) %>%
  summarise(average_historical_growth = mean(exp(historical_growth_mean)-1,na.rm=T),
            average_observed_growth = mean(exp(observed_growth_mean)-1,na.rm=T))

growth_per_year_gene_plot <- growth_per_year_gene_plot_data %>% 
  ggplot(aes(x = factor(gene,levels = aging_order),
             y = (average_observed_growth + average_historical_growth)/2,
             colour = str_match(gene,"[A-Z0-9]+"))) +  
  geom_linerange(aes(ymin = average_observed_growth,ymax = average_historical_growth),
                 colour = "black") + 
  geom_text(aes(y = average_historical_growth,
                label = sprintf("%s%%",100*round(average_observed_growth/average_historical_growth,3))),
            colour = "black",
            size = 2.7,
            hjust = -0.2) + 
  geom_point(aes(y = average_observed_growth),size = 1) + 
  geom_point(aes(y = average_historical_growth),size = 1) + 
  geom_point(aes(y = average_historical_growth),size = 0.5,colour = 'white') + 
  theme_gerstung(base_size = 6) + 
  xlab("") + 
  ylab("Growth per year") +
  scale_y_continuous(breaks = seq(-0.2,1,by = 0.1),
                     labels = sprintf("%s%%",seq(-0.2,1,by = 0.1)*100),
                     expand = c(0,0,0.05,0),limits = c(0,NA)) + 
  scale_colour_manual(values = gene_colours,guide = F) + 
  rotate_x_text(face = "italic")

plot_grid(growth_per_year_plot,growth_per_year_gene_plot,nrow=1,align = "h") +
  ggsave(filename = sprintf("figures/%s/dynamic_coefficients/historical_growth_figure.pdf",model_id),
         height = 2,
         width = 3.5,
         useDingbats = F)

## Warning: Removed 12 rows containing missing values (geom_point).

growth_per_year_gene_plot + 
  theme(axis.text.x = element_text(face = "plain",angle = 0,hjust = 0.5),
        axis.text.y = element_text(face = "italic")) + 
  coord_flip() + 
  ggsave(filename = sprintf("figures/%s/dynamic_coefficients/historical_growth_figure_larger.pdf",model_id),
         width = 1.7,height = 2,useDingbats = F)

1.1.1 Comparison between the expected and observable growth for phylogenetic trees and longitudinal data

We can estimate the annual growth that one would expect and the growth one would be able to observe in the later years of a clone from phylogenetic trees alone. This allows us to draw a comparison between the EPS trajectory defined by a tree and the ratio between observed and plausible - or historical - growth. In this figure we present the combined evidence from simulations and real data (both longitudinal and phylogenetic) that clonal deceleration is observable across most clones, with a few genes being characterised by their larger deceleration - we estimate that DNMT3A, one of the most prevalent genes in CH, loses in average at least half of its annual growth rate.

mitchell_coefficients <- read.csv("data_output/mitchell_bnpr_coefficients.csv") %>%
  subset(w_sum < 10) %>%
  group_by(id,clade) %>% 
  select(id,clade,
         early = cp_1,late = cp_2,expected = nonlinear_vaf) %>%
  mutate(plotting_id = "Real data") %>% 
  mutate(source = "Mitchell et al. 2021") %>%
  mutate(colour = ifelse(grepl("DNMT3A",clade),"DNMT3A",
                         ifelse(grepl("Clade",clade),"No known driver","Other driver"))) 

tree_coefficients <- read.csv("data_output/tree_bnpr_coefficients.csv") %>%
  transmute(id = individual,clade = plot_label,
            early = b_early,late = b_early + b_late,expected = expected_late) %>%
  mutate(plotting_id = "Real data") %>% 
  mutate(source = "Fabre et al. 2021") %>%
  mutate(colour = str_match(clade,"[A-Z0-9]+")) %>%
  mutate(colour = ifelse(grepl("UD",colour),"No known driver",colour))
simulation_coefficients_full <- read.csv("data_output/simulated_bnpr_coefficients.csv") %>%
  subset(w_sum < 5)
  
simulation_coefficients <-  simulation_coefficients_full %>% 
  subset(col == "Low variance") %>% 
  mutate(plotting_id = ifelse(late_growth/growth_rate < 0.8,
                              "Saturating",
                              "Near\nconstant")) %>% 
  select(id = name,clade = driver,early = cp_1,late = cp_late,expected = expected_nl,plotting_id) %>%
  mutate(source = "Simulated") %>%
  mutate(colour = "No known driver")

all_bnpr_coefficients <- rbind(as.data.frame(mitchell_coefficients),
                               as.data.frame(tree_coefficients),
                               as.data.frame(simulation_coefficients))

## Warning in `[<-.factor`(`*tmp*`, ri, value = c(2259L, 2259L, 500L, 500L, :
## invalid factor level, NA generated

## Warning in `[<-.factor`(`*tmp*`, ri, value = c(2259L, 2259L, 500L, 500L, :
## invalid factor level, NA generated

x_order <- c("Near\nconstant","Saturating",
             "Real data","Obs./plaus.\ngrowth ratio") %>%
  rev

rbind(
  all_bnpr_coefficients %>% 
    group_by(source) %>% 
    summarise(N = sum((late / expected) < 0.8,na.rm = T),
              Total = length(id)) %>% 
    mutate(Ratio = N / Total),
  historical_estimates %>% 
    mutate(ratio = (exp(observed_growth_mean)-1) / (exp(historical_growth_mean)-1)) %>% 
    summarise(N = sum(ratio < 0.8,na.rm=T),Total = length(ratio)) %>% 
    mutate(Ratio = N / Total) %>%
    mutate(source = "Fabre et al. 2021 (longitudinal)") %>%
    select(source,N,Total,Ratio)) %>% 
  select(Source = source,`Number of saturating trajectories` = N,
         `Total number of trajectories` = Total,Proportion = Ratio) %>%
  as.data.frame

all_bnpr_coefficients %>%
  mutate(late_expected = late / expected) %>%
  mutate(late_expected = ifelse(late_expected < -1,-1,late_expected)) %>% 
  ggplot(aes(x = plotting_id,y = late_expected)) +
  geom_point(
    data = growth_per_year_gene_plot_data,inherit.aes = F,
    mapping = aes(x = "Obs./plaus.\ngrowth ratio",
                  y = average_observed_growth/average_historical_growth,
                  group = reorder(gene,average_observed_growth/average_historical_growth),
                  colour = str_match(gene,"[A-Z0-9]+"),
                  shape = "Fabre et al. 2021"),
    size = 0.3,
    position = position_dodge(width = 0.75)) +
  geom_point(aes(shape = source,colour = colour),size = 0.25,
             position = position_jitter(width = 0.2,height = 0),
             alpha = 0.9) + 
  geom_boxplot(size = 0.25,fill = NA,outlier.color = NA) +
  geom_hline(yintercept = 1.0,size = 0.25) + 
  theme_gerstung(base_size = 6) + 
  scale_shape_manual(values = c("circle","triangle","cross"),name = NULL) +
  scale_x_discrete(breaks = x_order,limits = x_order) +
  xlab("") + 
  ylab("Growth ratio") + 
  scale_colour_manual(values = c(gene_colours,
                                 `Other driver` = "#18A558",
                                 `No known driver` = "grey50"),guide = F) + 
  coord_flip(ylim = c(NA,NA)) + 
  scale_y_continuous(breaks = c(-1,0,0.5,seq(1,10)),
                     labels = c("<-1","0","0.5",seq(1,10))) +
  theme(legend.position = "top",
        legend.key.size = unit(0,"cm"),legend.box.spacing = unit(0.1,"cm")) +
  guides(shape = guide_legend(nrow = 2)) +
  ggsave(filename = sprintf("figures/%s/dynamic_coefficients/bnpr_historical_growth.pdf",model_id),
         useDingbats = F,
         width = 1.9,height = 1.2)

## Warning: Removed 27 rows containing non-finite values (stat_boxplot).

## Warning: Removed 27 rows containing missing values (geom_point).

## Warning: Removed 27 rows containing non-finite values (stat_boxplot).

## Warning: Removed 27 rows containing missing values (geom_point).

all_bnpr_coefficients %>%
  mutate(late_expected = late / expected) %>%
  mutate(late_expected = ifelse(late_expected < -1,-1,late_expected)) %>% 
  subset(source != "Simulated") %>%
  ggplot(aes(x = plotting_id,y = late_expected)) +
  geom_point(
    data = growth_per_year_gene_plot_data,inherit.aes = F,
    mapping = aes(x = "Obs./plaus.\ngrowth ratio",
                  y = average_observed_growth/average_historical_growth,
                  group = reorder(gene,average_observed_growth/average_historical_growth),
                  colour = str_match(gene,"[A-Z0-9]+"),
                  shape = "Fabre et al. 2021"),
    size = 0.3,
    position = position_dodge(width = 0.75)) +
  geom_point(size = 0.25,
             position = position_jitter(width = 0.2,height = 0),
             alpha = 0.9,
             colour = "grey") + 
  geom_boxplot(size = 0.25,fill = NA,outlier.color = NA) +
  geom_hline(yintercept = 1.0,size = 0.25) + 
  theme_gerstung(base_size = 6) + 
  scale_shape_manual(values = c("circle","triangle","cross"),guide = F) +
  scale_x_discrete(breaks = x_order[1:2],limits = x_order[1:2]) +
  xlab("") + 
  ylab("Growth ratio") + 
  scale_colour_manual(values = c(gene_colours,
                                 `Other driver` = "#18A558",
                                 `No known driver` = "grey50"),guide = F) + 
  scale_y_continuous(breaks = c(-1,0,0.5,1,2),
                     labels = c("<-1","0","0.5","1","2")) +
  theme(legend.position = "none",
        legend.key.size = unit(0,"cm"),legend.box.spacing = unit(0.1,"cm")) +
  guides(shape = guide_legend(nrow = 2)) +
  coord_flip() +
  ggsave(filename = sprintf("figures/%s/dynamic_coefficients/bnpr_historical_growth_simplified.pdf",model_id),
         useDingbats = F,
         width = 1.9,height = 0.8)

## Warning: Removed 3 rows containing non-finite values (stat_boxplot).

## Warning: Removed 3 rows containing missing values (geom_point).

## Warning: Removed 3 rows containing non-finite values (stat_boxplot).

## Warning: Removed 3 rows containing missing values (geom_point).

1.1.2 Comparison between expected and observed VAF

mitchell_vaf <- read.csv("data_output/mitchell_bnpr_coefficients.csv") %>%
  subset(w_sum < 10) %>%
  group_by(id,clade) %>% 
  select(id,clade,
         observed_vaf=vaf_from_clade,expected_vaf) %>%
  mutate(plotting_id = "Real data") %>% 
  mutate(source = "Mitchell et al. 2021") %>%
  mutate(colour = ifelse(grepl("DNMT3A",clade),"DNMT3A",
                         ifelse(grepl("Clade",clade),"No known driver","Other driver"))) 

tree_vaf <- read.csv("data_output/tree_bnpr_coefficients.csv") %>%
  subset(source == "Colonies") %>% 
  select(id = individual,clade = plot_label,
         observed_vaf,expected_vaf) %>%
  mutate(plotting_id = "Real data") %>% 
  mutate(source = "Fabre et al. 2021") %>%
  mutate(colour = str_match(clade,"[A-Z0-9]+")) %>%
  mutate(colour = ifelse(grepl("UD",colour),"No known driver",colour)) %>%
  mutate(id = factor(id))

rbind(as.data.frame(mitchell_vaf),as.data.frame(tree_vaf)) %>%
  mutate(expected_vaf = expected_vaf) %>% 
  mutate(expected_vaf = ifelse(expected_vaf > 1,1,expected_vaf)) %>% 
  ggplot(aes(x = observed_vaf,y = expected_vaf)) + 
  geom_point(size = 0.5) + 
  geom_abline(slope = 1,size = 0.25) + 
  theme_gerstung(base_size = 6) + 
  scale_y_continuous(breaks = c(0,0.5,1.0)) +
  scale_x_continuous(breaks = c(0,0.5,1.0)) + 
  xlab("Observed clone freq.") + 
  ylab("Extrapolated\nclone frequency")

rbind(as.data.frame(mitchell_vaf),as.data.frame(tree_vaf)) %>%
  mutate(expected_vaf = expected_vaf) %>% 
  mutate(expected_vaf = ifelse(expected_vaf > 1,1,expected_vaf)) %>% 
  ggplot(aes(y = expected_vaf/observed_vaf,x = "")) + 
  geom_hline(yintercept = 1,size = 0.25) +
  geom_point(position = position_jitter(height = 0.2),size = 0.25,colour = "grey") + 
  geom_boxplot(outlier.color = NA,alpha = 0.5,size = 0.25) +
  theme_gerstung(base_size = 6) + 
  theme(axis.title.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.line.y = element_blank()) +
  ylab("Expected/observed\nclone size fold change") +
  scale_y_continuous(trans = 'log10',
                     breaks = c(0.1,1e-2,1,10,100),labels = c("0.1","0.01","1","10","100")) + 
  coord_flip() +
  ggsave(filename = sprintf("figures/%s/dynamic_coefficients/expected_vs_observed_vaf.pdf",model_id),
         useDingbats = F,
         width = 1.1,height = 0.75)

1.2 Investigating the determinants of the difference between historical and observed growth effect

unique_genes <- historical_estimates %>%
  group_by(gene) %>%
  filter(length(unique(paste(individual,site))) > 10) %>%
  select(gene) %>%
  distinct %>%
  unlist
historical_estimates <- historical_estimates %>%
  mutate(
    ratio_q05 = observed_growth_q05/historical_growth_q05,
    ratio_q50 = observed_growth_q50/historical_growth_q50,
    ratio_q95 = observed_growth_q95/historical_growth_q95,
    ratio_mean = observed_growth_mean/historical_growth_mean
  )

1.2.1 Distribution by gene

historical_estimates %>%
  group_by(gene) %>%
  mutate(M = unlist(quantile(ratio_q50,c(0.5),na.rm = T))) %>%
  mutate(ratio_q50 = ifelse(ratio_q50 < -0.5,-0.5,ratio_q50)) %>% 
  ggplot(aes(x = reorder(gene,M),y = ratio_q50)) + 
  geom_hline(yintercept = 1,linetype = 2,size = 0.25) + 
  geom_point(size = 0.5,position = position_jitter(width = 0.1)) +
  geom_boxplot(outlier.size = 0,alpha = 0.7,size = 0.25) + 
  coord_flip() + 
  theme_gerstung(base_size = 6) +
  scale_y_continuous(breaks = c(-0.5,0,1),labels = c("<-0.5","0","1")) + 
  theme(axis.text.y = element_text(face = "italic")) + 
  ylab("Ratio of historical and observed annual fold change") + 
  xlab("Gene")

## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Warning: Removed 12 rows containing missing values (geom_point).

1.2.2 Association with recurrence and truncation status

unique_site_multiple <- values_model$sub_data %>%
  select(amino_acid_change,site_numeric_multiple) %>% 
  subset(!is.na(site_numeric_multiple)) %>%
  distinct %>%
  arrange(site_numeric_multiple) %>%
  select(amino_acid_change) %>%
  unlist %>% 
  as.character() %>% as.factor

tmp <- historical_estimates %>%
  mutate(truncating = grepl("[A-Z0-9]+t-",site)) %>% 
  mutate(recurring = site %in% unique_site_multiple) %>%
  mutate(f = paste(
    ifelse(recurring,"Recurring","Non-recurring"),
    ifelse(truncating,"truncating","non-truncating"),
    sep = ' and\n'
  ))

anvar_test <- anova(glm(ratio_q50 ~ f,data = tmp),
                    test = "F")
tmp %>%
  mutate(ratio_q50 = ifelse(ratio_q50 < -1,-1,ratio_q50)) %>%
  ggplot(aes(x = f,y = ratio_q50)) + 
  geom_boxplot(outlier.size = 0.5,size = 0.25) + 
  ylab("Historical and observed annual fold change") + 
  xlab("") +
  theme_gerstung(base_size = 6) + 
  coord_flip(ylim = c(-1,NA)) + 
  scale_y_continuous(breaks = c(-1,0,0.5,1),labels = c("<-1",0,0.5,1)) +
  ggsave(useDingbats=FALSE,sprintf("figures/%s/other_associations/uc_tnt_rec.pdf",model_id),
         height = 1.2,width = 2.0)

## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

anvar_test

tmp_dnmt3a <- tmp %>%
  subset(gene == "DNMT3A") %>%
  mutate(f = ifelse(grepl("Recurring",f),"Hot-spot","Normal mutation"))

wilcox.test(ratio_q50 ~ f, data = tmp_dnmt3a)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ratio_q50 by f
## W = 3564, p-value = 0.09323
## alternative hypothesis: true location shift is not equal to 0

tmp_dnmt3a %>%
  ggplot(aes(x = f,y = ratio_q50)) + 
  geom_boxplot(outlier.size = 0.5,size = 0.25) + 
  ylab("Observed vs. historical annual fold change") + 
  xlab("") +
  theme_gerstung(base_size = 6) + 
  coord_flip(ylim = c(-1,NA)) + 
  scale_y_continuous() +
  ggsave(useDingbats=FALSE,sprintf("figures/%s/other_associations/uc_tnt_rec_dnmt3a.pdf",model_id),
         height = 1.2,width = 2.5)

## Warning: Removed 7 rows containing non-finite values (stat_boxplot).

## Warning: Removed 7 rows containing non-finite values (stat_boxplot).

1.2.3 Association with gender and smoking

We tested whether age, gender and smoking could explain a greater amount of variance than age alone and found no evidence for this.

smoking_gender_data <- load_smoking_data() %>% 
  group_by(SardID) %>%
  summarise(HasSmoked = any(PackYears>0)) %>%
  merge(distinct(select(full_data,SardID,Gender))) %>%
  select(individual = SardID,HasSmoked,Gender)

historical_estimates_gender_smoking <- merge(
  historical_estimates,smoking_gender_data,
  by = c("individual")) 

tmp <- historical_estimates_gender_smoking %>%
  mutate(min_age = minAge) %>% 
  filter(!is.na(ratio_q50) & !is.na(HasSmoked) & !is.na(Gender) & !is.na(min_age))
all_models <- list(
  `All genes` = list(
    m = glm(ratio_q50 ~ HasSmoked + Gender + min_age,data = tmp),
    null_m = glm(ratio_q50 ~ min_age,data = tmp)))

for (g in unique_genes) {
  x <- tmp %>% 
    subset(gene == g)
  all_models[[g]] <- list(
    m = glm(ratio_q50 ~ HasSmoked + Gender + min_age,data = x),
    null_m = glm(ratio_q50 ~ min_age,data = x))
}

significance_df <- all_models %>% 
  lapply(
    function(x) anova(x$null_m,x$m,test="F")[2,c(5,6)]
  ) %>%
  do.call(what = rbind)
significance_df$gene <- names(all_models)
significance_df$SignificantAfterCorrection <- p.adjust(significance_df$`Pr(>F)`,method = "BH") < 0.05

significance_df

tmp %>% 
  mutate(
    L = paste(ifelse(Gender == "M","Male","Female"),
              ifelse(HasSmoked == T,"\nhas smoked","\nnever smoked"),
              sep = '')
  ) %>%
  mutate(ratio_q50 = ifelse(ratio_q50 < -1,-1,ratio_q50)) %>%
  ggplot(aes(x = L,y = ratio_q50, colour = Gender)) + 
  geom_boxplot(size = 0.2,outlier.size = 0.5) + 
  theme_gerstung(base_size = 6) + 
  coord_flip(ylim = c(-1,NA)) + 
  xlab("") + 
  ylab("Ratio of historical and observed annual fold change") +
  scale_y_continuous() +
  scale_colour_aaas(guide = F)

1.2.4 dNdS in AML and MDS

all_samples <- sampling_idxs %>%
  group_by(site) %>%
  mutate(re = length(unique(individual)) > 1) %>%
  subset(re == T) %>% 
  apply(1,function(x) {
    individual <- x[1]
    site_numeric <- as.numeric(x[3])
    gene_numeric <- as.numeric(x[4])
    clone_numeric <- as.numeric(x[5])
    minAge <- as.numeric(x[6])
    
    if (is.na(site_numeric)) {
      b_site <- 0} else {
        b_site <- site_samples[,site_numeric]
      }
    b_gene <- gene_samples[,gene_numeric]
    b_clone <- clone_samples[,clone_numeric]
    b <- b_gene + b_clone + b_site
    u_values <- offset_samples[,clone_numeric]
    ages_at_onset <- t0_adjusted(u_values,b,g=2,n=pop_size) + values_model$min_age
    ages_at_onset_corrected <- ifelse(ages_at_onset < -1,-1,ages_at_onset)
    ages_at_onset_corrected <- ifelse(ages_at_onset > minAge,minAge,ages_at_onset)
    biologically_feasible <- ages_at_onset > -1 & ages_at_onset < minAge
    historical_b <- b[biologically_feasible] %>%
      unlist
    return(tibble(
      historical_samples = historical_b,
      aachange = gsub("-","",str_match(x[2],"-.+")),
      gene = str_match(x[2],"[A-Z0-9]+")
    ))
  }) %>%
  do.call(what = rbind) %>%
  group_by(gene,aachange) %>%
  summarise(mean = mean(historical_samples,na.rm = T),
            s = sd(historical_samples,na.rm = T)) %>%
  mutate(site = sprintf("%s-%s",gene,aachange))

dnds <- read.csv("data/dnds_aml_mds.csv") %>%
  subset(qval <= 1e-1) %>% 
  mutate(site = paste(gene,gsub("\\*","X",aachange),sep = "-")) %>%
  select(-gene)

dd <- all_samples %>% 
  merge(dnds,by = "site",all = F) %>% 
  as.data.frame

dd %>% 
  #subset(gene == "DNMT3A") %>% 
  ggplot(aes(y = dnds,x = exp(mean)-1,colour = gene,shape = condition)) + 
  geom_point(size = 1) + 
  geom_errorbarh(aes(xmin = exp(mean-s*1.96)-1,xmax = exp(mean+s*1.96)-1),size = 0.1,height = 0) + 
  geom_linerange(aes(ymin = CI_low,ymax = CI_high),size = 0.1) + 
  geom_smooth(method = "lm",colour = "black",alpha = 0.2,size = 0.5) + 
  scale_colour_manual(values = gene_colours,guide = F) + 
  xlab("Annual driver growth rate in CH") +
  ylab("dN/dS") +
  scale_x_continuous(labels = function(x) sprintf("%s%%",x*100)) + 
  scale_y_continuous(trans = 'log10') +
  theme_gerstung(base_size = 6) +
  #coord_cartesian(xlim = c(NA,0.5)) +
  ggsave(useDingbats=FALSE,
         filename = sprintf("figures/%s/dynamic_coefficients/dnds_association_hist.pdf",model_id),
         width = 3.5,height = 2)

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

lm(formula = dnds ~ mean,data = dd,subset = dd$condition == "AML") %>%
  summary

## 
## Call:
## lm(formula = dnds ~ mean, data = dd, subset = dd$condition == 
##     "AML")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2432.6 -1369.8  -547.4   727.0  4769.6 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    673.1     1016.0   0.663    0.518
## mean          7921.2     4837.2   1.638    0.124
## 
## Residual standard error: 1997 on 14 degrees of freedom
## Multiple R-squared:  0.1608, Adjusted R-squared:  0.1008 
## F-statistic: 2.682 on 1 and 14 DF,  p-value: 0.1238

lm(formula = dnds ~ mean,data = dd,subset = dd$condition == "MDS") %>%
  summary

## 
## Call:
## lm(formula = dnds ~ mean, data = dd, subset = dd$condition == 
##     "MDS")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2160.9 -1060.5  -373.4   893.7  2740.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    993.0      641.7   1.548    0.139
## mean          4836.1     3295.0   1.468    0.159
## 
## Residual standard error: 1436 on 18 degrees of freedom
## Multiple R-squared:  0.1069, Adjusted R-squared:  0.05727 
## F-statistic: 2.154 on 1 and 18 DF,  p-value: 0.1594

lm(formula = dnds ~ mean,data = dd,subset = dd$condition == "AML" & dd$gene == "DNMT3A") %>%
  summary

## 
## Call:
## lm(formula = dnds ~ mean, data = dd, subset = dd$condition == 
##     "AML" & dd$gene == "DNMT3A")
## 
## Residuals:
##      2      4      6      8 
## -878.4 -906.3  854.8  929.9 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    -2330       8826  -0.264    0.817
## mean           32159      80871   0.398    0.729
## 
## Residual standard error: 1263 on 2 degrees of freedom
## Multiple R-squared:  0.07327,    Adjusted R-squared:  -0.3901 
## F-statistic: 0.1581 on 1 and 2 DF,  p-value: 0.7293

lm(formula = dnds ~ mean,data = dd,subset = dd$condition == "MDS" & dd$gene == "DNMT3A") %>%
  summary

## 
## Call:
## lm(formula = dnds ~ mean, data = dd, subset = dd$condition == 
##     "MDS" & dd$gene == "DNMT3A")
## 
## Residuals:
##       3       5       7 
## -126.35   24.82  101.52 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   -117.1     1258.2  -0.093    0.941
## mean          4547.4    11347.5   0.401    0.757
## 
## Residual standard error: 164 on 1 degrees of freedom
## Multiple R-squared:  0.1384, Adjusted R-squared:  -0.7233 
## F-statistic: 0.1606 on 1 and 1 DF,  p-value: 0.7574

lm(formula = dnds ~ mean + condition,data = dd) %>%
  summary

## 
## Call:
## lm(formula = dnds ~ mean + condition, data = dd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2350.1 -1219.4  -621.4   789.7  4659.6 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)     970.7      664.5   1.461   0.1536  
## mean           6294.8     2808.4   2.241   0.0318 *
## conditionMDS   -223.6      567.0  -0.394   0.6959  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1686 on 33 degrees of freedom
## Multiple R-squared:  0.1391, Adjusted R-squared:  0.08692 
## F-statistic: 2.666 on 2 and 33 DF,  p-value: 0.08448

1.2.5 AML risk

all_samples <- sampling_idxs %>%
  apply(1,function(x) {
    individual <- x[1]
    site_numeric <- as.numeric(x[3])
    gene_numeric <- as.numeric(x[4])
    clone_numeric <- as.numeric(x[5])
    minAge <- as.numeric(x[6])
    
    if (is.na(site_numeric)) {
      b_site <- 0} else {
        b_site <- site_samples[,site_numeric]
      }
    b_gene <- gene_samples[,gene_numeric]
    b_clone <- clone_samples[,clone_numeric]
    b <- b_gene + b_clone + b_site
    u_values <- offset_samples[,clone_numeric]
    ages_at_onset <- t0_adjusted(u_values,b,g=2,n=pop_size) + values_model$min_age
    ages_at_onset_corrected <- ifelse(ages_at_onset < -1,-1,ages_at_onset)
    ages_at_onset_corrected <- ifelse(ages_at_onset > minAge,minAge,ages_at_onset)
    biologically_feasible <- ages_at_onset > -1 & ages_at_onset < minAge
    historical_b <- b[biologically_feasible] %>%
      unlist
    return(tibble(
      historical_samples = historical_b,
      gene = str_match(x[2],"[A-Z0-9]+")
    ))
  }) %>%
  do.call(what = rbind) %>%
  group_by(gene) %>%
  summarise(mean = mean(historical_samples,na.rm = T),
            s = sd(historical_samples,na.rm = T))

dd <- all_samples %>% 
  merge(read.csv("data/aml-risk-coef.csv"),by = "gene",all = F) %>% 
  as.data.frame

dd %>% 
  ggplot(aes(y = coef,x = exp(mean)-1,colour = gene)) + 
  geom_point(size = 1) + 
  geom_errorbarh(aes(xmin = exp(mean-s*1.96)-1,xmax = exp(mean+s*1.96)-1),size = 0.1,height = 0) + 
  geom_linerange(aes(ymin = coef - sd*1.96,ymax = coef + sd*1.96),size = 0.1) + 
  geom_smooth(method = "lm",colour = "black",alpha = 0.2,size = 0.5) + 
  scale_colour_manual(values = gene_colours,guide = F) + 
  xlab("Annual driver growth rate in CH") +
  ylab("AML risk") +
  scale_x_continuous(labels = function(x) sprintf("%s%%",x*100)) + 
  theme_gerstung(base_size = 6) +
  coord_cartesian(xlim = c(NA,0.5)) +
  ggsave(useDingbats=FALSE,
         filename = sprintf("figures/%s/dynamic_coefficients/aml_risk_hist.pdf",model_id),
         width = 3.5,height = 2)

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

cor.test(dd$mean,dd$coef)

## 
##  Pearson's product-moment correlation
## 
## data:  dd$mean and dd$coef
## t = 3.8329, df = 11, p-value = 0.00278
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3517854 0.9227259
## sample estimates:
##       cor 
## 0.7561979

cor.test(dd$mean,dd$coef,method = "kendall")

## 
##  Kendall's rank correlation tau
## 
## data:  dd$mean and dd$coef
## T = 60, p-value = 0.01012
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##       tau 
## 0.5384615

2 Poor fits

2.1 Investigating the determinants of poor fits

2.1.1 Association with recurrence and truncation status

tmp <- statistics_data %>%
  mutate(truncating = grepl("[A-Z0-9]+t-",site)) %>% 
  mutate(recurring = site %in% unique_site_multiple) %>%
  mutate(f = paste(
    ifelse(recurring,"Recurring","Non-recurring"),
    ifelse(truncating,"truncating","non-truncating"),
    sep = ' and\n'
  ))

anvar_test <- anova(glm(sum_stat ~ f,data = tmp,family = stats::binomial()),
                    test = "F")

## Warning in anova.glm(glm(sum_stat ~ f, data = tmp, family =
## stats::binomial()), : using F test with a 'binomial' family is inappropriate

tmp %>%
  group_by(f) %>% 
  summarise(Proportion = sum(sum_stat)/length(sum_stat)) %>%
  ggplot(aes(x = f,y = Proportion)) + 
  geom_bar(outlier.size = 0.5,size = 0.25,stat = "identity") + 
  ylab("Proportion of fits with no outliers") + 
  xlab("") +
  theme_gerstung(base_size = 6) + 
  coord_flip() + 
  scale_y_continuous(expand = c(0,0)) +
  ggsave(useDingbats=FALSE,sprintf("figures/%s/other_associations/uc_tnt_rec.pdf",model_id),
         height = 1.2,width = 2.0)

## Warning: Ignoring unknown parameters: outlier.size

anvar_test

2.1.2 Associations with smoking status and gender

good_stat_smoking_gender <- merge(
  statistics_data,smoking_gender_data,
  by = c("individual")) 

tmp <- good_stat_smoking_gender %>%
  filter(!is.na(sum_stat) & !is.na(HasSmoked) & !is.na(Gender))
all_models <- list(
  `All genes` = list(
    m = glm(sum_stat ~ HasSmoked + Gender,data = tmp,
            family = stats::binomial()),
    null_m = glm(sum_stat ~ 1,data = tmp,
                 family = stats::binomial())))

for (g in unique_genes) {
  x <- tmp %>% 
    subset(gene == g)
  all_models[[g]] <- list(
    m = glm(sum_stat ~ HasSmoked + Gender,data = x,
            family = stats::binomial()),
    null_m = glm(sum_stat ~ 1,data = x,
                 family = stats::binomial()))
}

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

significance_df <- all_models %>% 
  lapply(
    function(x) anova(x$null_m,x$m,test="F")[2,c(5,6)]
  ) %>%
  do.call(what = rbind)

## Warning: using F test with a 'binomial' family is inappropriate

## Warning: using F test with a 'binomial' family is inappropriate

## Warning: using F test with a 'binomial' family is inappropriate

## Warning: using F test with a 'binomial' family is inappropriate

## Warning: using F test with a 'binomial' family is inappropriate

## Warning: using F test with a 'binomial' family is inappropriate

## Warning: using F test with a 'binomial' family is inappropriate

## Warning: using F test with a 'binomial' family is inappropriate

## Warning: using F test with a 'binomial' family is inappropriate

## Warning: using F test with a 'binomial' family is inappropriate

significance_df$gene <- names(all_models)
significance_df$SignificantAfterCorrection <- p.adjust(significance_df$`Pr(>F)`,method = "BH") < 0.05

significance_df

tmp %>% 
  mutate(
    L = paste(ifelse(Gender == "M","Male","Female"),
              ifelse(HasSmoked == T,"\nhas smoked","\nnever smoked"),
              sep = '')
  ) %>%
  group_by(L,Gender) %>%
  summarise(proportion = sum(sum_stat) / length(sum_stat)) %>%
  ggplot(aes(x = L,y = proportion, fill = Gender)) + 
  geom_bar(stat = "identity") + 
  theme_gerstung(base_size = 6) + 
  coord_flip() +
  xlab("") + 
  ylab("Proportion of fits with no outliers") +
  scale_y_continuous(expand = c(0,0)) +
  scale_fill_aaas(guide = F)

3 Session details

cat(system("git log -n1", intern=TRUE), sep="\n")

## commit 5bcb05932603c8c4a9756c112d35c89f3c41c92b
## Author: josegcpa <josegcpa@ebi.ac.uk>
## Date:   Wed Aug 11 19:08:29 2021 +0100
## 
##     removed redundant parts of the analysis (regarding age at onset)

l <- ls()
data.frame(variable=l, Reduce("rbind",lapply(l, function(x) data.frame(classes=paste(class(get(x)),collapse = ','), size=format(object.size(get(x)), units="auto")))))

sessionInfo()

## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server 7.4 (Maipo)
## 
## Matrix products: default
## BLAS:   /hps/nobackup/research/gerstung/josegcpa/software/R/3.6.3/lib64/R/lib/libRblas.so
## LAPACK: /hps/nobackup/research/gerstung/josegcpa/software/R/3.6.3/lib64/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] survminer_0.4.8   survival_3.2-7    reghelper_1.0.1   scatterpie_0.1.5 
##  [5] ggtree_2.0.4      ape_5.4-1         dendextend_1.14.0 default_1.0.0    
##  [9] extraDistr_1.9.1  ggrepel_0.8.2     ggsci_2.9         cowplot_1.1.0    
## [13] gtools_3.8.2      openxlsx_4.2.2    bayesplot_1.7.2   forcats_0.5.0    
## [17] stringr_1.4.0     dplyr_1.0.2       purrr_0.3.4       readr_1.4.0      
## [21] tidyr_1.1.2       tibble_3.0.4      tidyverse_1.3.0   ggpubr_0.4.0     
## [25] ggplot2_3.3.2     greta_0.3.1       reticulate_1.16  
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1    ggsignif_0.6.0      ellipsis_0.3.1     
##   [4] rio_0.5.16          ggridges_0.5.2      base64enc_0.1-3    
##   [7] fs_1.5.0            dichromat_2.0-0     rstudioapi_0.11    
##  [10] farver_2.0.3        listenv_0.8.0       fansi_0.4.1        
##  [13] lubridate_1.7.9     xml2_1.3.2          splines_3.6.3      
##  [16] codetools_0.2-16    knitr_1.30          polyclip_1.10-0    
##  [19] jsonlite_1.7.1      km.ci_0.5-2         broom_0.7.1        
##  [22] dbplyr_1.4.4        tfruns_1.4          ggforce_0.3.2      
##  [25] mapproj_1.2.7       BiocManager_1.30.10 compiler_3.6.3     
##  [28] httr_1.4.2          rvcheck_0.1.8       backports_1.1.10   
##  [31] assertthat_0.2.1    Matrix_1.2-18       lazyeval_0.2.2     
##  [34] cli_2.1.0           tweenr_1.0.1        htmltools_0.5.0    
##  [37] prettyunits_1.1.1   tools_3.6.3         coda_0.19-4        
##  [40] gtable_0.3.0        glue_1.4.2          maps_3.3.0         
##  [43] Rcpp_1.0.5          carData_3.0-4       cellranger_1.1.0   
##  [46] vctrs_0.3.4         nlme_3.1-144        xfun_0.18          
##  [49] globals_0.13.0      ps_1.4.0            rvest_0.3.6        
##  [52] lifecycle_0.2.0     rstatix_0.6.0       future_1.19.1      
##  [55] zoo_1.8-8           MASS_7.3-51.5       scales_1.1.1       
##  [58] hms_0.5.3           parallel_3.6.3      yaml_2.2.1         
##  [61] curl_4.3            gridExtra_2.3       KMsurv_0.1-5       
##  [64] stringi_1.5.3       tensorflow_2.2.0    tidytree_0.3.3     
##  [67] zip_2.1.1           pals_1.6            rlang_0.4.8        
##  [70] pkgconfig_2.0.3     evaluate_0.14       lattice_0.20-38    
##  [73] labeling_0.4.2      treeio_1.10.0       tidyselect_1.1.0   
##  [76] plyr_1.8.6          magrittr_1.5        R6_2.4.1           
##  [79] generics_0.0.2      DBI_1.1.0           mgcv_1.8-31        
##  [82] pillar_1.4.6        haven_2.3.1         whisker_0.4        
##  [85] foreign_0.8-75      withr_2.3.0         abind_1.4-5        
##  [88] modelr_0.1.8        crayon_1.3.4        car_3.0-10         
##  [91] survMisc_0.5.5      rmarkdown_2.4       viridis_0.5.1      
##  [94] progress_1.2.2      readxl_1.3.1        data.table_1.13.0  
##  [97] blob_1.2.1          reprex_0.3.0        digest_0.6.26      
## [100] xtable_1.8-4        munsell_0.5.0       viridisLite_0.3.0

The longitudinal dynamics and natural history of clonal haematopoiesis

Historical growth and its determinants; poor fits and their determinants

José Guilherme de Almeida

Moritz Gerstung