In this notebook we inspect how different methods to estimate EPS compare.

0.0.0.1 Setup

source("Scripts/vaf_dynamics_functions.R")

## 
## Attaching package: 'greta'

## The following objects are masked from 'package:stats':
## 
##     binomial, cov2cor, poisson

## The following objects are masked from 'package:base':
## 
##     %*%, apply, backsolve, beta, chol2inv, colMeans, colSums, diag,
##     eigen, forwardsolve, gamma, identity, rowMeans, rowSums, sweep,
##     tapply

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✔ tibble  3.0.4     ✔ dplyr   1.0.2
## ✔ tidyr   1.1.2     ✔ stringr 1.4.0
## ✔ readr   1.4.0     ✔ forcats 0.5.0
## ✔ purrr   0.3.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::slice()  masks greta::slice()

## This is bayesplot version 1.7.2

## - Online documentation and vignettes at mc-stan.org/bayesplot

## - bayesplot theme set to bayesplot::theme_default()

##    * Does _not_ affect other ggplot2 plots

##    * See ?bayesplot_theme_set for details on theme setting

## 
## Attaching package: 'cowplot'

## The following object is masked from 'package:ggpubr':
## 
##     get_legend

## 
## Attaching package: 'extraDistr'

## The following objects are masked from 'package:gtools':
## 
##     ddirichlet, rdirichlet

## The following object is masked from 'package:purrr':
## 
##     rdunif

## 
## ---------------------
## Welcome to dendextend version 1.14.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:ggpubr':
## 
##     rotate

## The following object is masked from 'package:stats':
## 
##     cutree

## 
## Attaching package: 'ape'

## The following objects are masked from 'package:dendextend':
## 
##     ladderize, rotate

## The following object is masked from 'package:ggpubr':
## 
##     rotate

## Registered S3 method overwritten by 'treeio':
##   method     from
##   root.phylo ape

## ggtree v2.0.4  For help: https://yulab-smu.github.io/treedata-book/
## 
## If you use ggtree in published research, please cite the most appropriate paper(s):
## 
## [36m-[39m Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution 2018, 35(12):3041-3043. doi: 10.1093/molbev/msy194
## [36m-[39m Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36, doi:10.1111/2041-210X.12628

## 
## Attaching package: 'ggtree'

## The following object is masked from 'package:ape':
## 
##     rotate

## The following object is masked from 'package:dendextend':
## 
##     rotate

## The following object is masked from 'package:tidyr':
## 
##     expand

## The following object is masked from 'package:ggpubr':
## 
##     rotate

## 
## Attaching package: 'reghelper'

## The following object is masked from 'package:greta':
## 
##     beta

## The following object is masked from 'package:base':
## 
##     beta

source("Scripts/make_ultrametric.R")

library(parallel)
library(Matrix)

## 
## Attaching package: 'Matrix'

## The following object is masked from 'package:ggtree':
## 
##     expand

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

library(castor)

## Loading required package: Rcpp

library(phangorn)
library(phylodyn)
library(minpack.lm)
INLA:::inla.dynload.workaround()

change_point <- function(x, b0, m1, m2, delta) { 
  b0 + (x*m1) + (sapply(x-delta, function (t) max(0, t)) * m2)
}

plot_tree <- function(obj) { 
  tree_ultra <- obj$tree_ultra
  tree_ultra$edge.length[is.infinite(tree_ultra$edge.length)] <- 0
  tree_ultra$tip.label <- obj$tree$S
  driver_list <- Filter(function(x) length(x) > 5,obj$driver_list)
  if (length(driver_list) > 1) {
    colours <- RColorBrewer::brewer.pal(length(driver_list),"Set3")
  } else {
    colours <- c("black")
  }
  colour_code <- rep("black",length(tree_ultra$tip.label))
  R <- 1:length(driver_list)
  if (length(driver_list) == 0) {
    R <- numeric(0)
  } 
  for (i in R) {
    D <- driver_list[[i]]
    colour_code[tree_ultra$tip.label %in% unique(D)] <- colours[i]
  }
  plot(tree_ultra,tip.color = colour_code)
}

clade_from_mrca <- function(tree,mrca) {
  edge_idx <- which(tree$edge[,1] == mrca)
  continue <- T
  output <- c(which(tree$edge[,2] == mrca))
  while (continue == T) {
    if (length(edge_idx) > 0) {
      output <- c(output,edge_idx)
      new_edge_idx <- c()
      for (idx in edge_idx) {
        nodes <- tree$edge[idx,2]
        new_idxs <- which(tree$edge[,1] == nodes)
        new_edge_idx <- c(new_edge_idx,new_idxs)
      }
      edge_idx <- new_edge_idx
    } else {
      continue <- F
    }
  }
  return(output)
}

build_tree <- function(sub_data,subsample_size=100,
                       detection_threshold=0) {
  sub_af <- sub_data %>% 
    group_by(CloneID) %>% 
    summarise(N = max(NClones),.groups = "drop") %>%
    arrange(CloneID)
  MaxClone <- max(sub_data$CloneID)
  MaxMut <- max(sub_data$MutID)
  sub_af_sparse <- rep(0,MaxClone)
  sub_af_sparse[sub_af$CloneID] <- sub_af$N 
  
  Presence <- sparseMatrix(i = sub_data$CloneID,
                           j = sub_data$MutID,
                           dims = c(MaxClone,MaxMut))
  Presence[cbind(sub_data$CloneID,sub_data$MutID)] <- 1

  if (sum(as.logical(sub_af_sparse)) < subsample_size) {
    sub_af_sparse <- rep(1,length(sub_af_sparse))
  }
  
  S <- sample(MaxClone,subsample_size,replace = F,prob = sub_af_sparse)
  af <- sub_af_sparse[S]
  Presence <- Presence[S,]
  Presence <- as.matrix(rbind(Presence,wt=0))
  Presence <- Presence[,colSums(Presence) > 0]
  
  dst <- as.matrix(dist(Presence > 0,method = "manhattan"))
  tree <- root(njs(dst),
               outgroup = "wt",
               resolve.root = T,
               edgelabel = T) 
  tree <- drop.tip(tree,"wt")
  return(list(tree = tree,S = S,af = af))
}

read_clonex <- function(path,d = 1000) {
  x <- read_tsv(path,
                col_names = c("Gen","NClones","CloneID","MutID"),
                col_types = c(col_integer(),col_integer(),col_integer(),col_integer()),
                progress = F) %>%
    mutate(Driver = MutID <= d)
  return(x)
}

trajectory_from_td <- function(x=0:1000, td=0, s=0.01, N=2e5, g=1){
  s <- s/g
  t0 <- td - 1/(s)
  t <- x-t0
  t <- t*g
  y <- pmax(0,t)
  if (s==0)
    return(y)
  td <- 1/(s)
  te <- log(N*s -1)/s + td
  y[t>td] <- N/(1+exp(-(s*(t[t>td]-te))))
  return(y)
}

trajectory_from_t0 <- function(x=0:1000, t0=0, s=0.01, N=2e5, g=1){
  s <- s/g
  t <- x-t0
  t <- t*g
  y <- pmax(0,t)
  if (s==0)
    return(y)
  td <- 1/(s)
  te <- log(N*s -1)/s + td
  y[t>td] <- N/(1+exp(-(s*(t[t>td]-te))))
  return(y)
}

1 Population size estimation

In this section we calculate/load all BNPR trajectories for the trees inferred from Wright-Fisher (WF) simulations. These simulations are done for 6 different fitness levels - 0.005, 0.010, 0.015, 0.020, 0.025 and 0.030 - over 800 generations and with a fixed population size (200,000).

1.1 Loading data and inferring trees

The first step is to build the trees, which we then display. One can immediately see that expansions become increasingly prevalent for higher fitness effects, with quite a few cases of a single clone sweeping all of the population.

N <- 2e5
N_DRIVERS <- 50
all_file_paths <- list.files(
  "hsc_output_bnpr_complete",pattern = "last_generation",
  full.names = T,recursive = T)
all_driver_file_paths <- list.files(
  "hsc_output_bnpr_complete",pattern = "driver",
  full.names = T,recursive = T)

file_name <- "data_output/simulated_trees_trajectories.RDS"
if (file.exists(file_name) == T) {
  tree_traj <- readRDS(file_name)
  trees <- tree_traj[[1]]
  all_driver_trajectories <- tree_traj[[2]]
} else {
  all_driver_trajectories <- all_driver_file_paths %>%
    lapply(function(x) {
      read.csv(x,header = F) %>%
        arrange(V1,V2) %>%
        select(Gen = V1,MutID = V2,Count = V3)})
  
  trees <- mclapply(
    all_file_paths,
    mc.cores = 6,
    mc.cleanup = T,
    function(path) {
      s <- str_match(path,"hsc_[0-9.]+") %>%
        gsub(pattern = "hsc_",replacement = "") %>%
        as.numeric()
      system(sprintf('echo "%s"', path)) # prints during mclapply by using bash
      x <- read_clonex(path,d = N_DRIVERS) %>%
        subset(Gen == 800) %>%
        subset(MutID != 0)
      x$R <- as.numeric(str_match(path,"[0-9.]+$"))
      x$s <- s
      x$drift_threshold <- 1 / (N * (s))
      
      driver_list <- x %>% 
        subset(Driver == T) %>% 
        select(CloneID,MutID) 
      driver_list_out <- list()
      for (driver_id in unique(driver_list$MutID)) {
        tmp <- driver_list %>%
          subset(MutID == driver_id) %>%
          select(CloneID) %>%
          unlist
        driver_list_out[[as.character(driver_id)]] <- tmp
      }
      tree <- x %>%
        build_tree()
      ultrametric_tree <- make.ultrametric.tree(tree$tree)
      gc()
      list(
        tree = tree,
        tree_ultra = ultrametric_tree,
        driver_list = driver_list_out,
        path = path
      ) %>%
        return
    }
  )
  
  names(trees) <- gsub('/last_generation','',all_file_paths)
  names(all_driver_trajectories) <- gsub('/driver_trajectory','',all_driver_file_paths)
  saveRDS(object = list(trees,all_driver_trajectories),file = file_name)
}

for (tree_name in names(trees)) {
  trees[[tree_name]]$tree_ultra$edge.length <- trees[[tree_name]]$tree_ultra$edge.length * 800
}

par(mfrow = c(4,5),mar = c(2,0.5,1,0.5))
for (tree_name in names(trees)) {
  tree <- trees[[tree_name]]
  fitness <- str_match(tree_name,"[0-9.]+")
  plot(tree$tree_ultra,main = fitness)
}

1.2 Calculating BNPR trajectories for the whole trees

Here we calculate the actual EPS trajectories using BNPR and display them.

file_name <- "data_output/simulated_bnpr_trees.RDS"
if (file.exists(file_name) == T) {
  all_estimates_trees <- readRDS(file_name)
} else {
  all_estimates_trees <- lapply(
    trees,
    function(x) {
      system(sprintf('echo "%s"', x$path)) # prints during mclapply by using bash
      BNPR(x$tree_ultra) %>%
        return
      }
    )
  saveRDS(all_estimates_trees,file_name)
}

1.3 Calculating `mcmc.popsize` and skyline trajectories for the whole trees

We accompany our BNPR estimates with those obtained through mcmc.popsize and skyline estimations to prove that what we are observing are not artefacts specific to BNPR. Interestingly, mcmc.popsize appears to suffer from its own bias - in some trajectories a very large initial EPS followed by a dip is observable. This holds for both possible priors - constant population size and a skyline-process population size, with the latter ameliorating this effect slightly.

all_estimates_trees_skyline <- mclapply(
    trees,
    mc.cores = 2,
    mc.cleanup = T,
    function(x) skyline(x$tree_ultra,epsilon = 0.01)
    )

all_estimates_trees_skyline_wide <- mclapply(
    trees,
    mc.cores = 2,
    mc.cleanup = T,
    function(x) skyline(x$tree_ultra,epsilon = 0.05)
    )

all_estimates_trees_mcmc <- mclapply(
    trees,
    mc.cores = 2, 
    mc.cleanup = T,
    function(x) mcmc.popsize(x$tree_ultra,nstep=10000,thinning = 100,
                             progress.bar = F,
                             burn.in = 1000,lambda = 0.1,
                             method.prior.heights = "constant")
    )

all_estimates_trees_mcmc_skyline <- mclapply(
    trees,
    mc.cores = 2, 
    mc.cleanup = T,
    function(x) mcmc.popsize(x$tree_ultra,nstep=10000,thinning = 100,
                             progress.bar = F,
                             burn.in = 1000,lambda = 0.1,
                             method.prior.heights = "skyline")
    )

par(mfrow = c(4,5),mar = c(1.5,2,2,1))
for (name in 1:length(all_estimates_trees)) {
  plot_BNPR(all_estimates_trees[[name]])
  lines(all_estimates_trees_skyline[[name]],col = "green")
  lines(all_estimates_trees_skyline_wide[[name]],col = "#026440")
  lines(extract.popsize(all_estimates_trees_mcmc[[name]]),col = "red")
  lines(extract.popsize(all_estimates_trees_mcmc_skyline[[name]]),col = "purple")
}

1.4 Calculating BNPR trajectories for each clade

file_name <- "data_output/simulated_bnpr_clades.RDS"
if (file.exists(file_name) == T) {
  all_estimates_full <- readRDS(file_name)
  all_estimates <- all_estimates_full[[1]]
  all_estimates_trimmed_5_years <- all_estimates_full[[2]]
  all_estimates_trimmed_10_years <- all_estimates_full[[3]]
} else {
  all_estimates <- list()
  i <- 1
  for (obj_name in names(trees)) {
    obj <- trees[[obj_name]]
    print(obj_name)
    s <- str_match(obj_name,"[0-9.]+") %>%
      as.numeric
    Tree <- list(tree = obj$tree_ultra)
    Tree$tree$tip.label <- obj$tree$S
    Tree$tree$edge.length[is.infinite(Tree$tree$edge.length)] <- 0
    tree <- Tree$tree
    driver_id_list <- Filter(function(x) length(x) > 5,obj$driver_list)
    all_drivers <- do.call(c,driver_id_list)
    clades <- cut_tree(tree = tree,depth = 0.1) %>%
      Filter(f = function(x) length(x) >= 5)
    clades_ <- list()
    for (clade in clades) {
      clade_tips <- tree$tip.label[clade]
      for (driver in names(driver_id_list)) {
        if (any(clade_tips %in% driver_id_list[[driver]])) {
          clades_[[length(clades_)+1]] <- list(
            clade = clade,
            driver = driver
          )
        }
      }
    }
    clades <- clades_
    
    if (!(obj_name %in% names(all_estimates))) {
      bnpr_estimates <- mclapply(
        clades,
        mc.cores = 2,
        mc.cleanup = T,
        function(clad) {
          #tip_in_clade <- which(Tree$tree$tip.label %in% clad)
          tip_in_clade <- clad$clade
          if (length(tip_in_clade) > 4) {
            sub_tree <- keep.tip(Tree$tree,tip_in_clade) %>%
              multi2di()
            if (!is.null(sub_tree)) {
              bnpr_estimates <- NULL
              bnpr_estimate <- BNPR(sub_tree)
              if (!is.null(bnpr_estimate)) {
                list(
                  tree = tree,
                  bnpr = bnpr_estimate,
                  clade = clad$clade,
                  tree = sub_tree,
                  driver = clad$driver) %>%
                  return
              }
            }
          }
        }
      ) %>%
        Filter(f = function(x) !is.null(x))
      all_estimates[[obj_name]] <- bnpr_estimates
    }
    gc()
  }
  saveRDS(all_estimates,file_name)
}

1.5 Calculating `mcmc.popsize` trajectories for each clade

all_estimates_mcmc <- list()
i <- 1
par(mfrow = c(4,5),mar=c(1,2,2,1))
for (obj_name in names(all_estimates)) {
  print(obj_name)
  if (!(obj_name %in% names(all_estimates_mcmc))) {
    obj <- all_estimates[[obj_name]]
    estimates <- list()
    for (child_obj in obj) {
      Tr <- child_obj$tree %>%
        keep.tip(child_obj$clade) %>%
        multi2di()
      pr <- ifelse(length(child_obj$tree$tip.labels) > 10,"skyline","constant")
      pop_size_eps <- mcmc.popsize(Tr,10000,100,1000,
                                   lambda = 0.1,
                                   progress.bar = F,
                                   method.prior.heights = pr)
      estimates[[length(estimates)+1]] <- list(pop_size_eps = pop_size_eps)
    }
    estimates <- estimates %>%
     Filter(f = function(x) !is.null(x))
    all_estimates_mcmc[[obj_name]] <- estimates
  }
  gc()
}

## [1] "hsc_output_bnpr_complete/hsc_0.005_200_1"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_10"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_11"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_12"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_13"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_14"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_15"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_16"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_17"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_18"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_19"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_2"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_20"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_3"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_4"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_5"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_6"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_7"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_8"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_9"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_1"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_10"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_11"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_12"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_13"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_14"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_15"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_16"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_17"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_18"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_19"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_2"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_20"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_3"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_4"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_5"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_6"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_7"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_8"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_9"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_1"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_10"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_11"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_12"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_13"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_14"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_15"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_16"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_17"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_18"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_19"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_2"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_20"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_3"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_4"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_5"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_6"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_7"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_8"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_9"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_1"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_10"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_11"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_12"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_13"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_14"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_15"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_16"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_17"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_18"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_19"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_2"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_20"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_3"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_4"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_5"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_6"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_7"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_8"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_9"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_1"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_10"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_11"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_12"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_13"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_14"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_15"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_16"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_17"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_18"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_19"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_2"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_20"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_3"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_4"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_5"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_6"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_7"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_8"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_9"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_1"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_10"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_11"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_12"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_13"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_14"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_15"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_16"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_17"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_18"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_19"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_2"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_20"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_3"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_4"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_5"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_6"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_7"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_8"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_9"

1.6 Calculating skyline trajectories for each clade

all_estimates_skyline <- list()
names_done <- names(all_estimates_skyline)
i <- 1
par(mfrow = c(4,5),mar=c(1,2,2,1))
for (obj_name in names(all_estimates)) {
  print(obj_name)
  if (!(obj_name %in% names_done)) {
    obj <- all_estimates[[obj_name]]
    estimates <- list()
    for (child_obj in obj) {
      Tr <- child_obj$tree %>%
        keep.tip(child_obj$clade) %>%
        multi2di()
      sk <- skyline(Tr,epsilon = 0.02)
      estimates[[length(estimates)+1]] <- list(skyline = sk)
    }
    estimates <- estimates %>% 
      Filter(f = function(x) !is.null(x))
    all_estimates_skyline[[obj_name]] <- estimates
  }
  gc()
}

## [1] "hsc_output_bnpr_complete/hsc_0.005_200_1"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_10"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_11"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_12"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_13"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_14"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_15"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_16"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_17"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_18"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_19"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_2"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_20"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_3"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_4"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_5"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_6"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_7"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_8"
## [1] "hsc_output_bnpr_complete/hsc_0.005_200_9"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_1"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_10"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_11"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_12"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_13"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_14"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_15"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_16"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_17"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_18"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_19"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_2"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_20"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_3"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_4"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_5"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_6"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_7"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_8"
## [1] "hsc_output_bnpr_complete/hsc_0.01_50_9"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_1"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_10"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_11"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_12"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_13"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_14"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_15"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_16"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_17"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_18"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_19"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_2"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_20"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_3"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_4"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_5"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_6"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_7"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_8"
## [1] "hsc_output_bnpr_complete/hsc_0.015_20_9"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_1"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_10"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_11"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_12"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_13"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_14"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_15"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_16"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_17"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_18"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_19"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_2"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_20"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_3"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_4"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_5"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_6"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_7"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_8"
## [1] "hsc_output_bnpr_complete/hsc_0.02_15_9"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_1"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_10"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_11"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_12"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_13"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_14"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_15"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_16"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_17"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_18"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_19"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_2"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_20"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_3"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_4"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_5"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_6"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_7"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_8"
## [1] "hsc_output_bnpr_complete/hsc_0.025_8_9"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_1"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_10"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_11"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_12"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_13"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_14"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_15"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_16"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_17"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_18"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_19"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_2"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_20"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_3"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_4"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_5"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_6"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_7"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_8"
## [1] "hsc_output_bnpr_complete/hsc_0.03_5_9"

1.7 Comparing BNPR, `mcmc.popsize` and skyline trajectories

1.7.1 For clades with 10 or fewer tips

# fewer than 10 tips per clade
par(mfrow = c(4,5),mar = c(1,1,1,1))
for (name in names(all_estimates)) {
  obj <- all_estimates[[name]]
  obj_mcmc <- all_estimates_mcmc[[name]]
  obj_skyl <- all_estimates_skyline[[name]]
  
  if (length(obj) > 0) {
    for (i in 1:length(obj)) {
      if (length(obj[[i]]$clade) <= 10) {
        bnpr_traj <- obj[[i]]$bnpr
        mcmc_traj <- obj_mcmc[[i]]$pop_size_eps
        skyl_traj <- obj_skyl[[i]]$skyline
        
        plot_BNPR(bnpr_traj)
        try(lines(extract.popsize(mcmc_traj),col = "red"))
        lines(skyl_traj,col = "green")
      }
    }
  }
}

## Error in prep$pos[index.right] : subscript out of bounds

## Error in prep$pos[index.right] : subscript out of bounds

## Warning in plot.window(...): nonfinite axis limits [GScale(-inf,173.085,2, .);
## log=1]

1.7.2 For clades with 10-50 tips

# 10-50 tips
par(mfrow = c(4,5),mar = c(1,1,1,1))
for (name in names(all_estimates)) {
  obj <- all_estimates[[name]]
  obj_mcmc <- all_estimates_mcmc[[name]]
  obj_skyl <- all_estimates_skyline[[name]]
  
  if (length(obj) > 0) {
    for (i in 1:length(obj)) {
      if (length(obj[[i]]$clade) < 50 & length(obj[[i]]$clade) > 10) {
        bnpr_traj <- obj[[i]]$bnpr
        mcmc_traj <- obj_mcmc[[i]]$pop_size_eps
        skyl_traj <- obj_skyl[[i]]$skyline
        
        plot_BNPR(bnpr_traj)
        try(lines(extract.popsize(mcmc_traj),col = "red"))
        lines(skyl_traj,col = "green")
      }
    }
  }
}

## Error in prep$pos[index.right] : subscript out of bounds

1.7.3 For clades with 50+ tips

# 50-100 clades
par(mfrow = c(4,5),mar = c(1,1,1,1))
for (name in names(all_estimates)) {
  obj <- all_estimates[[name]]
  obj_mcmc <- all_estimates_mcmc[[name]]
  obj_skyl <- all_estimates_skyline[[name]]
  
  if (length(obj) > 0) {
    for (i in 1:length(obj)) {
      if (length(obj[[i]]$clade) >= 50) {
        bnpr_traj <- obj[[i]]$bnpr
        mcmc_traj <- obj_mcmc[[i]]$pop_size_eps
        skyl_traj <- obj_skyl[[i]]$skyline
        
        plot_BNPR(bnpr_traj)
        try(lines(extract.popsize(mcmc_traj),col = "red"))
        # extracting population size fails for a few cases
        lines(skyl_traj,col = "green")
      }
    }
  }
}

## Error in prep$pos[index.right] : subscript out of bounds

## Error in prep$pos[index.right] : subscript out of bounds

1.7.4 Figure with examples

set.seed(4242)
samples_to_plot <- sample(names(all_estimates),replace = F,size = 9)

data_for_plot <- samples_to_plot %>%
  lapply(
    function(i) {
      data_list <- list()
      obj <- all_estimates[[i]]
      obj_mcmc <- all_estimates_mcmc[[i]]
      obj_skyl <- all_estimates_skyline[[i]]
      
      if (length(obj) > 0) {
        for (j in 1:length(obj)) {
          try(silent = T,{
            bnpr_traj <- obj[[j]]$bnpr
            mcmc_traj <- obj_mcmc[[j]]$pop_size_eps
            skyl_traj <- obj_skyl[[j]]$skyline
            
            pop_size <- extract.popsize(mcmc_traj) %>%
              as.matrix 
            
            data_list[[j]] <- data.frame(
              X = bnpr_traj$summary$time,
              Y = bnpr_traj$summary$quant0.5,
              Y025 = bnpr_traj$summary$quant0.025,
              Y975 = bnpr_traj$summary$quant0.975,
              MCMC_Y = approx(x = pop_size[,1],
                              y = pop_size[,3],
                              xout = bnpr_traj$summary$time)$y,
              MCMC_Y025 = approx(x = pop_size[,1],
                                 y = pop_size[,4],
                                 xout = bnpr_traj$summary$time)$y,
              MCMC_Y975 = approx(x = pop_size[,1],
                                 y = pop_size[,5],
                                 xout = bnpr_traj$summary$time)$y,
              SKYL_Y = approx(x = skyl_traj$time,
                              y = skyl_traj$population.size,
                              xout = bnpr_traj$summary$time,rule = 2)$y,
              id = i,
              clade_no = j,
              clade_size = length(obj[[j]]$clade)
            )
          })
        }
      }
      return(do.call(rbind,data_list))
    }
  )

data_for_plot %>%
  do.call(what = rbind) %>%
  mutate(fitness = str_match(id,pattern = "[0-9.]+")) %>%
  mutate(traj_id = as.numeric(as.factor(paste(id,clade_no)))) %>%
  ggplot(aes(x = 800 - X)) + 
  geom_ribbon(aes(ymin = Y025,ymax = Y975,fill = "BNPR"),alpha = 0.3) + 
  geom_line(aes(y = Y,colour = "BNPR"),size = 0.25) + 
  geom_ribbon(aes(ymin = MCMC_Y025,ymax = MCMC_Y975,fill = "mcmc.popsize"),alpha = 0.3) + 
  geom_line(aes(y = MCMC_Y,colour = "mcmc.popsize"),size = 0.25) + 
  geom_line(aes(y = SKYL_Y,colour = "skyline",fill = "skyline"),size = 0.25) + 
  facet_wrap(~ reorder(sprintf("id=%s (N=%s;s=%s)",traj_id, clade_size,fitness),clade_size)) + 
  scale_y_continuous(trans = 'log10') + 
  coord_cartesian(ylim = c(1e-1,1e8)) + 
  scale_colour_aaas(name = NULL) +
  scale_fill_aaas(name = NULL) +
  theme_gerstung(base_size = 6) + 
  theme(strip.text = element_text(margin = margin()),
        legend.position = "bottom",
        legend.key.size = unit(0.1,"cm")) + 
  xlab("Time (generations)") + 
  ylab("Neff") + 
  ggsave(filename = sprintf("figures/simulations/eps_comparison.pdf"),
         useDingbats = F,
         width = 5.5,height = 3)

## Warning: Ignoring unknown aesthetics: fill

1.8 Comparing fits derived from BNPR, `mcmc.popsize` and skyline

There is quite good agreement between log-linear trajectories fitted to skyline estimation and BNPR. This does not hold as well when comparing mcmc.popsize and BNPR - probably due to the initial dip in population size, the EPS estimates derived from mcmc.popsize lead to a slight underestimation of log-linear growth trajectories. As for early/late growth rate comparisons we observe little agreement between BNPR and mcmc.popsize or skyline estimation.

linear_estimates <- list()
early_late_estimates <- list()
idx <- 1
for (x in names(all_estimates)) {
  obj <- all_estimates[[x]]
  obj_mcmc <- all_estimates_mcmc[[x]]
  obj_skyl <- all_estimates_skyline[[x]]
  linear_estimates[[x]] <- list()
  early_late_estimates[[x]] <- list()
  if (length(obj) > 0){
    for (i in 1:length(obj)) {
      try(
        {
          bnpr_data <- data.frame(
            X = (800 - obj[[i]]$bnpr$summary$time),
            Y = log(obj[[i]]$bnpr$summary$mean),
            W = (log(obj[[i]]$bnpr$summary$quant0.975)-log(obj[[i]]$bnpr$summary$quant0.025))^2/16)
          ps <- extract.popsize(obj_mcmc[[i]]$pop_size_eps)
          X_ <- seq(min(obj_skyl[[i]]$skyline$time),max(obj_skyl[[i]]$skyline$time),length.out = 100)
          sk_Y <- obj_skyl[[i]]$skyline$population.size[as.numeric(cut(X_,c(0,obj_skyl[[i]]$skyline$time)))]
          skyl_data <- data.frame(
            X = (800 - X_),
            Y = log(sk_Y)
            ) %>% 
            na.omit
          mcmc_data <- data.frame(
            X = (800 - ps[,1]),
            Y = log(ps[,3])) %>%
            na.omit
          
          linear_estimates[[x]][[i]] <- list(
            id = idx,
            bnpr = lm(Y ~ X,data = bnpr_data),
            skyl = lm(Y ~ X,data = skyl_data),
            mcmc = lm(Y ~ X,data = mcmc_data)
          )

          opt_fn_bnpr <- function(par) {
            se <- (bnpr_data$Y - change_point(bnpr_data$X,par[1],par[2],par[3],par[4]))^2
            return(mean(se / bnpr_data$W))
          }
          cpr_bnpr <- optim(
            par = c(linear_estimates[[x]][[i]]$bnpr$coefficients[1],
                    linear_estimates[[x]][[i]]$bnpr$coefficients[2],
                    0,
                    mean(bnpr_data$X)),
            method = "L-BFGS-B",
            lower = c(NA,0,NA,min(bnpr_data$X) + diff(range(bnpr_data$X)) * 0.25),
            upper = c(NA,NA,NA,max(bnpr_data$X) - diff(range(bnpr_data$X)) * 0.25),
            fn = opt_fn_bnpr)
          
          opt_fn_skyl <- function(par) {
            se <- (skyl_data$Y - change_point(skyl_data$X,par[1],par[2],par[3],par[4]))^2
            return(mean(se))
          }
          cpr_skyl <- optim(
            par = c(linear_estimates[[x]][[i]]$skyl$coefficients[1],
                    linear_estimates[[x]][[i]]$skyl$coefficients[2],
                    0,
                    mean(bnpr_data$X)),
            method = "L-BFGS-B",
            lower = c(NA,0,NA,min(bnpr_data$X) + diff(range(bnpr_data$X)) * 0.25),
            upper = c(NA,NA,NA,max(bnpr_data$X) - diff(range(bnpr_data$X)) * 0.25),
            fn = opt_fn_skyl)

          opt_fn_mcmc <- function(par) {
            se <- (mcmc_data$Y - change_point(mcmc_data$X,par[1],par[2],par[3],par[4]))^2
            return(mean(se))
          }
          cpr_mcmc <- optim(
            par = c(linear_estimates[[x]][[i]]$mcmc$coefficients[1],
                    linear_estimates[[x]][[i]]$mcmc$coefficients[2],
                    0,
                    mean(bnpr_data$X)),
            method = "L-BFGS-B",
            lower = c(NA,0,NA,min(bnpr_data$X) + diff(range(bnpr_data$X)) * 0.25),
            upper = c(NA,NA,NA,max(bnpr_data$X) - diff(range(bnpr_data$X)) * 0.25),
            fn = opt_fn_mcmc)
          early_late_estimates[[x]][[i]] <- list(
            id = idx,
            bnpr = cpr_bnpr,
            skyl = cpr_skyl,
            mcmc = cpr_mcmc,
            W_sum = mean(bnpr_data$W)
            )
          }
      )
    }
  }
  idx <- idx + 1
}

## Error in prep$pos[index.right] : subscript out of bounds
## Error in prep$pos[index.right] : subscript out of bounds
## Error in prep$pos[index.right] : subscript out of bounds
## Error in prep$pos[index.right] : subscript out of bounds
## Error in prep$pos[index.right] : subscript out of bounds

linear_estimates_bnpr_mcmc_skyl <- linear_estimates %>% 
    lapply(function(x) do.call(rbind,
                               lapply(x, function(y) c(y$id,
                                                       y$bnpr$coefficient[2],
                                                       y$mcmc$coefficient[2],
                                                       y$skyl$coefficient[2])))) %>%
  do.call(what = rbind) %>%
  as.data.frame()
colnames(linear_estimates_bnpr_mcmc_skyl) <- c("id","BNPR","mcmc.popsize","Skyline")
linear_estimates_bnpr_mcmc_skyl$id <- names(all_estimates)[linear_estimates_bnpr_mcmc_skyl$id]
linear_estimates_bnpr_mcmc_skyl$fitness <- linear_estimates_bnpr_mcmc_skyl$id %>%
  str_match("[0-9.]+") %>% 
  as.numeric

early_estimates_bnpr_mcmc_skyl <- early_late_estimates %>% 
    lapply(function(x) do.call(rbind,
                               lapply(x, function(y) c(y$id,
                                                       y$bnpr$par[2],
                                                       y$mcmc$par[2],
                                                       y$skyl$par[2],
                                                       y$W_sum)))) %>%
  do.call(what = rbind) %>%
  as.data.frame()
colnames(early_estimates_bnpr_mcmc_skyl) <- c("id","BNPR","mcmc.popsize","Skyline","W_sum")
early_estimates_bnpr_mcmc_skyl$id <- names(all_estimates)[early_estimates_bnpr_mcmc_skyl$id]
early_estimates_bnpr_mcmc_skyl$fitness <- early_estimates_bnpr_mcmc_skyl$id %>%
  str_match("[0-9.]+") %>% 
  as.numeric

late_estimates_bnpr_mcmc_skyl <- early_late_estimates %>% 
  Filter(f = function(f) !is.null(x)) %>% 
  lapply(function(x) do.call(
    rbind,
    lapply(x, function(y) c(y$id,
                            ifelse(!is.null(y$bnpr),
                                   sum(y$bnpr$par[2:3]),NA),
                            sum(y$mcmc$par[2:3]),
                            sum(y$skyl$par[2:3]))))) %>%
  do.call(what = rbind) %>%
  as.data.frame()

## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 1)

## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 3)

## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 3)

colnames(late_estimates_bnpr_mcmc_skyl) <- c("id","BNPR","mcmc.popsize","Skyline")
late_estimates_bnpr_mcmc_skyl$id <- names(all_estimates)[late_estimates_bnpr_mcmc_skyl$id]
late_estimates_bnpr_mcmc_skyl$fitness <- late_estimates_bnpr_mcmc_skyl$id %>%
  str_match("[0-9.]+") %>% 
  as.numeric

A <- linear_estimates_bnpr_mcmc_skyl %>%
  ggplot() +
  geom_point(aes(x = Skyline,y = BNPR),size = 0.5) + 
  geom_abline(slope = 1,linetype = 2,size = 0.25) +
  theme_gerstung(base_size = 6) + 
  coord_cartesian(xlim = c(-0.05,0.05),
                  ylim = c(-0.05,0.05))

B <- early_estimates_bnpr_mcmc_skyl %>%
  ggplot() +
  geom_point(aes(x = Skyline,y = BNPR),size = 0.5) + 
  geom_abline(slope = 1,linetype = 2,size = 0.25) +
  theme_gerstung(base_size = 6) + 
  coord_cartesian(xlim = c(-0.05,0.05),
                  ylim = c(-0.05,0.05))

C <- late_estimates_bnpr_mcmc_skyl %>%
  ggplot() +
  geom_point(aes(x = Skyline,y = BNPR),size = 0.5) + 
  geom_abline(slope = 1,linetype = 2,size = 0.25) +
  theme_gerstung(base_size = 6) + 
  coord_cartesian(xlim = c(-0.05,0.05),
                  ylim = c(-0.05,0.05))

D <- linear_estimates_bnpr_mcmc_skyl %>%
  ggplot() +
  geom_point(aes(x = mcmc.popsize,y = BNPR),size = 0.5) + 
  geom_abline(slope = 1,linetype = 2,size = 0.25) +
  theme_gerstung(base_size = 6) + 
  coord_cartesian(xlim = c(-0.05,0.05),
                  ylim = c(-0.05,0.05))

E <- early_estimates_bnpr_mcmc_skyl %>%
  ggplot() +
  geom_point(aes(x = mcmc.popsize,y = BNPR),size = 0.5) + 
  geom_abline(slope = 1,linetype = 2,size = 0.25) +
  theme_gerstung(base_size = 6) + 
  coord_cartesian(xlim = c(-0.05,0.05),
                  ylim = c(-0.05,0.05))

FF <- late_estimates_bnpr_mcmc_skyl %>%
  ggplot() +
  geom_point(aes(x = mcmc.popsize,y = BNPR),size = 0.5) + 
  geom_abline(slope = 1,linetype = 2,size = 0.25) +
  theme_gerstung(base_size = 6) + 
  coord_cartesian(xlim = c(-0.05,0.05),
                  ylim = c(-0.05,0.05))

plot_grid(A,B,C,D,E,FF,nrow = 2)

## Warning: Removed 3 rows containing missing values (geom_point).

data.frame(
  BNPR = c(cor(linear_estimates_bnpr_mcmc_skyl$fitness,linear_estimates_bnpr_mcmc_skyl$BNPR),
           cor(early_estimates_bnpr_mcmc_skyl$fitness,early_estimates_bnpr_mcmc_skyl$BNPR))^2,
  Skyline = c(cor(linear_estimates_bnpr_mcmc_skyl$fitness,linear_estimates_bnpr_mcmc_skyl$Skyline),
              cor(early_estimates_bnpr_mcmc_skyl$fitness,early_estimates_bnpr_mcmc_skyl$Skyline))^2,
  mcmc.popsize = c(cor(linear_estimates_bnpr_mcmc_skyl$fitness,linear_estimates_bnpr_mcmc_skyl$mcmc.popsize),
                   cor(early_estimates_bnpr_mcmc_skyl$fitness,early_estimates_bnpr_mcmc_skyl$mcmc.popsize))^2,
  fit = c("Log-linear","Early log-linear"))

data.frame(
  BNPR = c(cor(linear_estimates_bnpr_mcmc_skyl$fitness,linear_estimates_bnpr_mcmc_skyl$BNPR),
           cor(early_estimates_bnpr_mcmc_skyl$fitness,early_estimates_bnpr_mcmc_skyl$BNPR))^2,
  Skyline = c(cor(linear_estimates_bnpr_mcmc_skyl$fitness,linear_estimates_bnpr_mcmc_skyl$Skyline),
              cor(early_estimates_bnpr_mcmc_skyl$fitness,early_estimates_bnpr_mcmc_skyl$Skyline))^2,
  mcmc.popsize = c(cor(linear_estimates_bnpr_mcmc_skyl$fitness,linear_estimates_bnpr_mcmc_skyl$mcmc.popsize),
                   cor(early_estimates_bnpr_mcmc_skyl$fitness,early_estimates_bnpr_mcmc_skyl$mcmc.popsize))^2,
  fit = c("Log-linear","Early log-linear")) %>%
  gather(key = "key",value = "value",BNPR,Skyline,mcmc.popsize) %>%
  ggplot(aes(x = key,y = fit,fill = value)) +
  geom_tile() + 
  geom_text(aes(label = round(value,2)),size = 2.6,colour = "white") +
  theme_gerstung(base_size = 6) + 
  theme(legend.position = "bottom",legend.key.height = unit(0.1,"cm"),
        legend.key.width = unit(0.4,"cm"),
        legend.box.spacing = unit(0,"cm")) + 
  scale_y_discrete(expand = c(0,0)) +
  scale_x_discrete(expand = c(0,0)) +
  ylab("") + 
  xlab("") +
  scale_fill_material(palette = "deep-purple",name = "R2") + 
  ggsave(filename = sprintf("figures/simulations/eps_comparison_heatmap.pdf"),
         useDingbats = F,
         width = 2.2,height = 1)

The longitudinal dynamics and natural history of clonal haematopoiesis

Investigating different methods to estimate EPS

José Guilherme de Almeida

Moritz Gerstung

0.0.0.1 Setup

1 Population size estimation

1.1 Loading data and inferring trees

1.2 Calculating BNPR trajectories for the whole trees

1.3 Calculating `mcmc.popsize` and skyline trajectories for the whole trees

1.4 Calculating BNPR trajectories for each clade

1.5 Calculating `mcmc.popsize` trajectories for each clade

1.6 Calculating skyline trajectories for each clade

1.7 Comparing BNPR, `mcmc.popsize` and skyline trajectories

1.7.1 For clades with 10 or fewer tips

1.7.2 For clades with 10-50 tips

1.7.3 For clades with 50+ tips

1.7.4 Figure with examples

1.8 Comparing fits derived from BNPR, `mcmc.popsize` and skyline

The longitudinal dynamics and natural history of clonal haematopoiesis

Investigating different methods to estimate EPS

José Guilherme de Almeida

Moritz Gerstung

0.0.0.1 Setup

1 Population size estimation

1.1 Loading data and inferring trees

1.2 Calculating BNPR trajectories for the whole trees

1.3 Calculating mcmc.popsize and skyline trajectories for the whole trees

1.4 Calculating BNPR trajectories for each clade

1.5 Calculating mcmc.popsize trajectories for each clade

1.6 Calculating skyline trajectories for each clade

1.7 Comparing BNPR, mcmc.popsize and skyline trajectories

1.7.1 For clades with 10 or fewer tips

1.7.2 For clades with 10-50 tips

1.7.3 For clades with 50+ tips

1.7.4 Figure with examples

1.8 Comparing fits derived from BNPR, mcmc.popsize and skyline

1.3 Calculating `mcmc.popsize` and skyline trajectories for the whole trees

1.5 Calculating `mcmc.popsize` trajectories for each clade

1.7 Comparing BNPR, `mcmc.popsize` and skyline trajectories

1.8 Comparing fits derived from BNPR, `mcmc.popsize` and skyline