1 Modelling the technical overdispersion

Technical replicate data shows some level of technical overdispersion. This can be modelled in our current model by changing our binomial modelling of the counts to a beta binomial, with a fixed \(\beta\) parameter and an \(\alpha\) parameter that depends on \(\beta\) and on the probability that is inferred by modelling, particularly by using the formulation for the expected value of the beta distribution \(E[X] = \frac{\alpha}{\alpha+\beta}\).

Considering the above, we can model the counts as \(counts \sim BB(n = coverage,alpha = \frac{\beta p}{1 - p},beta = \beta)\)

one_hot_encode <- function(factor_vector) {
  factor_levels <- sort(as.character(unique(factor_vector)))
  print(factor_levels)
  output <- sapply(
    factor_vector,
    function(x) {
      as.numeric(factor_levels == x)
    }
  )
  return(t(output))
}

tru_q_data <- read.table('data/TruQ.txt',header = T)
replicate_data <- read.table('data/Replicates_all.txt',header = T) %>%
  mutate(VAF = ifelse(CHR == "X", VAF / 2,VAF),
         MUTcount = ifelse(CHR == "X", floor(MUTcount / 2),MUTcount))
ages <- merge(full_data,replicate_data,by = c("SardID","Phase")) %>%
  select(Age, SardID, Phase) %>%
  unique
replicated_data_multiple <- replicate_data %>%
  group_by(SardID,variant) %>%
  summarise(count = length(unique(Phase))) %>%
  subset(count > 1)
replicate_data <- merge(
  replicate_data,
  ages,
  by = c("SardID","Phase"),
  all = T
) %>% 
  subset(SardID %in% replicated_data_multiple$SardID & variant %in% replicated_data_multiple$variant)

tru_q_data %>% 
  ggplot(aes(x = VAF_expected,y = VAF_observed)) + 
  geom_point(aes(colour = as.factor(Replicate))) + 
  geom_errorbar(
    aes(
      ymin = qbeta(0.05,MUTcount + 1,TOTALcount - MUTcount + 1),
      ymax = qbeta(0.95,MUTcount + 1,TOTALcount - MUTcount + 1)
  )) + 
  theme_gerstung(base_size = 15) +
  facet_wrap(~ Gene) + 
  ggsci::scale_color_lancet(name = "Replicate") +
  theme(legend.position = 'bottom') + 
  xlab("VAF expected") +
  ylab("VAF observed") + 
  ggtitle("TruQ data") + 
  ggsave(useDingbats=FALSE,"figures/overdispersion/overdispersion_vis_truq.pdf",height=14,width=14)

replicate_data %>% 
  ggplot(aes(x = as.factor(Phase),y = VAF)) + 
  geom_point(aes(colour = as.factor(Replicate)),
             size = 2) + 
  geom_linerange(
    aes(
      ymin = qbeta(0.05,MUTcount + 1,TOTALcount - MUTcount + 1),
      ymax = qbeta(0.95,MUTcount + 1,TOTALcount - MUTcount + 1)
    )) + 
  theme_gerstung(base_size = 15) +
  facet_wrap(~ paste(Gene,variant,SardID,sep = '-'),scale = 'free') + 
  ggsci::scale_color_lancet(name = "Replicate") +
  theme(legend.position = 'bottom') + 
  xlab("Phase") +
  ylab("VAF") + 
  ggtitle("Replicate data from Sardinian samples") + 
  ggsave(useDingbats=FALSE,"figures/overdispersion/overdispersion_vis_sard.pdf",height=14,width=14)

The invervals here are estimated using a beta distribution (\(Beta(alpha = count+1,beta = coverage - count + 1)\)).

1.1 Modelling the overdispersion for TruQ data

VAF_expected <- tru_q_data$VAF_expected + 1/tru_q_data$TOTALcount
VAF_observed <- tru_q_data$VAF_observed + 1/tru_q_data$TOTALcount

beta_rate <- variable(lower = 0,upper = Inf)
beta_variable <- exponential(
  rate = beta_rate
)

distribution(VAF_expected) <- beta(
  shape1 = (VAF_observed * beta_variable) / (1 - VAF_observed),
  shape2 = beta_variable
)

m <- model(beta_variable,beta_rate)

draws <- mcmc(m,
              sampler = hmc(Lmin = 10,Lmax = 20),
              warmup = 2000,
              n_samples = 2000)
## 
## running 4 chains simultaneously on up to 4 cores
## 
    warmup                                           0/2000 | eta:  ?s          
    warmup =                                        50/2000 | eta:  1m | 15% bad
    warmup ==                                      100/2000 | eta: 47s | 11% bad
    warmup ===                                     150/2000 | eta: 42s | 10% bad
    warmup ====                                    200/2000 | eta: 37s | 7% bad 
    warmup =====                                   250/2000 | eta: 35s | 6% bad 
    warmup ======                                  300/2000 | eta: 33s | 5% bad 
    warmup =======                                 350/2000 | eta: 31s | 4% bad 
    warmup ========                                400/2000 | eta: 30s | 4% bad 
    warmup =========                               450/2000 | eta: 28s | 3% bad 
    warmup ==========                              500/2000 | eta: 27s | 3% bad 
    warmup ==========                              550/2000 | eta: 26s | 3% bad 
    warmup ===========                             600/2000 | eta: 24s | 2% bad 
    warmup ============                            650/2000 | eta: 23s | 2% bad 
    warmup =============                           700/2000 | eta: 22s | 2% bad 
    warmup ==============                          750/2000 | eta: 21s | 2% bad 
    warmup ===============                         800/2000 | eta: 20s | 2% bad 
    warmup ================                        850/2000 | eta: 19s | 2% bad 
    warmup =================                       900/2000 | eta: 18s | 2% bad 
    warmup ==================                      950/2000 | eta: 18s | 2% bad 
    warmup ===================                    1000/2000 | eta: 17s | 2% bad 
    warmup ====================                   1050/2000 | eta: 16s | 1% bad 
    warmup =====================                  1100/2000 | eta: 15s | 1% bad 
    warmup ======================                 1150/2000 | eta: 14s | 1% bad 
    warmup =======================                1200/2000 | eta: 13s | 1% bad 
    warmup ========================               1250/2000 | eta: 12s | 1% bad 
    warmup =========================              1300/2000 | eta: 11s | 1% bad 
    warmup ==========================             1350/2000 | eta: 11s | 1% bad 
    warmup ===========================            1400/2000 | eta: 10s | 1% bad 
    warmup ============================           1450/2000 | eta:  9s | 1% bad 
    warmup ============================           1500/2000 | eta:  8s | 1% bad 
    warmup =============================          1550/2000 | eta:  7s | 1% bad 
    warmup ==============================         1600/2000 | eta:  6s | 1% bad 
    warmup ===============================        1650/2000 | eta:  6s | 1% bad 
    warmup ================================       1700/2000 | eta:  5s | 1% bad 
    warmup =================================      1750/2000 | eta:  4s | 1% bad 
    warmup ==================================     1800/2000 | eta:  3s | <1% bad
    warmup ===================================    1850/2000 | eta:  2s | <1% bad
    warmup ====================================   1900/2000 | eta:  2s | <1% bad
    warmup =====================================  1950/2000 | eta:  1s | <1% bad
    warmup ====================================== 2000/2000 | eta:  0s | <1% bad
## 
  sampling                                           0/2000 | eta:  ?s          
  sampling =                                        50/2000 | eta: 37s          
  sampling ==                                      100/2000 | eta: 32s          
  sampling ===                                     150/2000 | eta: 27s          
  sampling ====                                    200/2000 | eta: 25s          
  sampling =====                                   250/2000 | eta: 24s          
  sampling ======                                  300/2000 | eta: 24s          
  sampling =======                                 350/2000 | eta: 23s          
  sampling ========                                400/2000 | eta: 21s          
  sampling =========                               450/2000 | eta: 20s          
  sampling ==========                              500/2000 | eta: 20s          
  sampling ==========                              550/2000 | eta: 19s          
  sampling ===========                             600/2000 | eta: 18s          
  sampling ============                            650/2000 | eta: 17s          
  sampling =============                           700/2000 | eta: 16s          
  sampling ==============                          750/2000 | eta: 15s          
  sampling ===============                         800/2000 | eta: 15s          
  sampling ================                        850/2000 | eta: 14s          
  sampling =================                       900/2000 | eta: 13s          
  sampling ==================                      950/2000 | eta: 13s          
  sampling ===================                    1000/2000 | eta: 12s          
  sampling ====================                   1050/2000 | eta: 12s          
  sampling =====================                  1100/2000 | eta: 11s          
  sampling ======================                 1150/2000 | eta: 11s          
  sampling =======================                1200/2000 | eta: 10s          
  sampling ========================               1250/2000 | eta: 10s          
  sampling =========================              1300/2000 | eta:  9s          
  sampling ==========================             1350/2000 | eta:  8s          
  sampling ===========================            1400/2000 | eta:  8s          
  sampling ============================           1450/2000 | eta:  7s          
  sampling ============================           1500/2000 | eta:  6s          
  sampling =============================          1550/2000 | eta:  6s          
  sampling ==============================         1600/2000 | eta:  5s          
  sampling ===============================        1650/2000 | eta:  5s          
  sampling ================================       1700/2000 | eta:  4s          
  sampling =================================      1750/2000 | eta:  3s          
  sampling ==================================     1800/2000 | eta:  3s          
  sampling ===================================    1850/2000 | eta:  2s          
  sampling ====================================   1900/2000 | eta:  1s          
  sampling =====================================  1950/2000 | eta:  1s          
  sampling ====================================== 2000/2000 | eta:  0s
mcmc_trace(draws)

technical_beta_values <- calculate(beta_variable,draws) %>%
  do.call(what = rbind) %>%
  variable_summaries()

technical_dispersion_rate_values <- calculate(beta_rate,draws) %>%
  do.call(what = rbind) %>%
  variable_summaries()

replicate_data_samples <- replicate_data %>%
  apply(1,FUN = function(x) {
    size <- as.numeric(x[12]) + 1
    count <- as.numeric(x[10]) + 1
    out <- data.frame(
      extraDistr::rbbinom(n = 1000,size = size,
                          alpha = ((count/size) * technical_beta_values[1,1]) / (1 - count/size),
                          beta = technical_beta_values[1,1])) / size
    out$SardID <- x[1]
    out$Phase <- x[2]
    out$Replicate <- x[3]
    out$variant <- x[4]
    out$Gene <- x[5]
    out$Age <- x[14]
    out$VAF <- x[13]
    return(out)
  }
  ) %>%
  do.call(what = rbind)
colnames(replicate_data_samples)[1] <- "Samples"
replicate_data_samples[,c(7,8)] <- apply(replicate_data_samples[,c(7,8)],2,as.numeric)

replicate_data_samples %>% 
  ggplot(aes(x = as.numeric(Age),colour = Replicate)) + 
  geom_point(aes(y = Samples),
             alpha = 0.1,
             size = 0.2,
             position = "jitter") + 
  geom_point(aes(y = VAF),
             size = 2,
             alpha = 0.9) +
  theme_minimal(base_size = 15) +
  facet_wrap(~ paste(Gene,variant,SardID,sep = '-'),scale = 'free') + 
  ggsci::scale_color_lancet(name = "Replicate") +
  theme(legend.position = 'bottom') + 
  xlab("Phase") +
  ylab("VAF")

1.2 Estimating \(\beta\) with technical replicates

Using technical replicates for known DNA quantities (\(truQ\)) and for actual patient data (\(SR\)), we can posit the following model:

\[ p_{truQ} = observed\ VAF_{truQ} + 1/coverage_{truQ} \\ \beta ~ \sim exponential(0.001) \\ \alpha_{truQ} = \frac{\beta p_{truQ}}{1 - p_{truQ}} \\ expected\ VAF_{truQ} + 1/coverage_{truQ} \sim Beta(alpha,beta) \]

We need to adjust the expected and observed VAFs with the equivalent of having one count due to the fact that beta distributions only permit \(\alpha>0\) and \(\beta>0\).

\[ p_{SR} = growth\ coefficient = effect_{gene} + individual_{offset} \\ \alpha_{SR} = \frac{\beta p_{SR}}{1 - p_{SR}} \\ observed\ counts_{SR} \sim BB(coverage + 1,alpha,beta) \]

Here, \(p_{SR}\) is parametrized as \(p_{SR} = ilogit((b_{gene_{i}} + b_{c_{ji}}) * t + u_{ji})\).

For this subsubsection, the model is specified in scripts/estimating_overdispersion.R.

We used this model to infer the \(\mu_{\beta}\) and \(\sigma_{\beta}\) mentioned in the model specification above.

VAF_expected <- tru_q_data$VAF_expected + 1/tru_q_data$TOTALcount
VAF_observed <- tru_q_data$VAF_observed + 1/tru_q_data$TOTALcount
Sard_count <- replicate_data$MUTcount 
Sard_cover <- replicate_data$TOTALcount
n_clones <- length(unique(paste(replicate_data$SardID,replicate_data$variant,sep = '-')))

clone_b <- normal(0,sqrt(sum(0.1^2,0.1^2)),dim = c(n_clones,1))
u <- uniform(min=-50,max=0,dim = c(n_clones,1))
individual_ind <- one_hot_encode(paste(replicate_data$SardID,replicate_data$variant,sep = '-'))
##  [1] "158-DNMT3A_25466800_G_A"   "2259-BCOR_39922886_A_T"   
##  [3] "2259-CBL_119142571_G_A"    "2259-NOTCH1_139438556_T_C"
##  [5] "2259-SF3B1_198266834_T_C"  "2259-SF3B1_198267359_C_G" 
##  [7] "2259-ZRSR2_15809085_A_G"   "260-ASXL1_31023231_G_T"   
##  [9] "260-PHF6_133527665_G_A"    "260-SMC1A_53436412_C_T"   
## [11] "260-SRSF2_74732959_G_A"    "260-SRSF2_74732959_G_T"   
## [13] "260-SRSF2_74733113_A_G"    "847-EZH2_148511184_G_A"   
## [15] "847-JAK2_5073770_G_T"      "847-TET2_106164929_A_G"
offset <- individual_ind %*% u
gene_effect <- individual_ind %*% clone_b * replicate_data$Age#(replicate_data$Age - min(replicate_data$Age))
r <- gene_effect + offset
mu <- ilogit(r) * 0.5

beta_rate <- variable(lower = 0,upper = Inf)
beta_variable <- exponential(
  rate = beta_rate
)

distribution(VAF_expected) <- beta(
  shape1 = (VAF_observed * beta_variable) / (1 - VAF_observed),
  shape2 = beta_variable
)

distribution(Sard_count) <- beta_binomial(
  size = Sard_cover,
  alpha = (mu * beta_variable) / (1 - mu),
  beta = beta_variable
)

m <- model(beta_variable,beta_rate,clone_b,u)

draws <- mcmc(m,
              sampler = hmc(Lmin = 400,Lmax = 500),
              warmup = 2000,
              n_samples = 1000,
              n_cores = 4)
## 
## running 4 chains simultaneously on up to 4 cores
## 
    warmup                                           0/2000 | eta:  ?s          
    warmup =                                        50/2000 | eta: 38m          
    warmup ==                                      100/2000 | eta: 36m          
    warmup ===                                     150/2000 | eta: 35m          
    warmup ====                                    200/2000 | eta: 34m          
    warmup =====                                   250/2000 | eta: 34m          
    warmup ======                                  300/2000 | eta: 33m          
    warmup =======                                 350/2000 | eta: 32m          
    warmup ========                                400/2000 | eta: 31m          
    warmup =========                               450/2000 | eta: 30m          
    warmup ==========                              500/2000 | eta: 30m          
    warmup ==========                              550/2000 | eta: 28m          
    warmup ===========                             600/2000 | eta: 28m          
    warmup ============                            650/2000 | eta: 27m          
    warmup =============                           700/2000 | eta: 26m          
    warmup ==============                          750/2000 | eta: 25m          
    warmup ===============                         800/2000 | eta: 24m          
    warmup ================                        850/2000 | eta: 23m          
    warmup =================                       900/2000 | eta: 22m          
    warmup ==================                      950/2000 | eta: 21m          
    warmup ===================                    1000/2000 | eta: 20m          
    warmup ====================                   1050/2000 | eta: 19m          
    warmup =====================                  1100/2000 | eta: 18m          
    warmup ======================                 1150/2000 | eta: 17m          
    warmup =======================                1200/2000 | eta: 16m          
    warmup ========================               1250/2000 | eta: 15m          
    warmup =========================              1300/2000 | eta: 14m          
    warmup ==========================             1350/2000 | eta: 13m          
    warmup ===========================            1400/2000 | eta: 12m          
    warmup ============================           1450/2000 | eta: 11m          
    warmup ============================           1500/2000 | eta: 10m          
    warmup =============================          1550/2000 | eta:  9m          
    warmup ==============================         1600/2000 | eta:  8m          
    warmup ===============================        1650/2000 | eta:  7m          
    warmup ================================       1700/2000 | eta:  6m          
    warmup =================================      1750/2000 | eta:  5m          
    warmup ==================================     1800/2000 | eta:  4m          
    warmup ===================================    1850/2000 | eta:  3m          
    warmup ====================================   1900/2000 | eta:  2m          
    warmup =====================================  1950/2000 | eta:  1m          
    warmup ====================================== 2000/2000 | eta:  0s          
## 
  sampling                                           0/1000 | eta:  ?s          
  sampling ==                                       50/1000 | eta: 23m          
  sampling ====                                    100/1000 | eta: 22m          
  sampling ======                                  150/1000 | eta: 22m          
  sampling ========                                200/1000 | eta: 20m          
  sampling ==========                              250/1000 | eta: 19m          
  sampling ===========                             300/1000 | eta: 17m          
  sampling =============                           350/1000 | eta: 16m          
  sampling ===============                         400/1000 | eta: 15m          
  sampling =================                       450/1000 | eta: 14m          
  sampling ===================                     500/1000 | eta: 13m          
  sampling =====================                   550/1000 | eta: 11m          
  sampling =======================                 600/1000 | eta: 10m          
  sampling =========================               650/1000 | eta:  9m          
  sampling ===========================             700/1000 | eta:  8m          
  sampling ============================            750/1000 | eta:  6m          
  sampling ==============================          800/1000 | eta:  5m          
  sampling ================================        850/1000 | eta:  4m          
  sampling ==================================      900/1000 | eta:  3m          
  sampling ====================================    950/1000 | eta:  1m          
  sampling ====================================== 1000/1000 | eta:  0s
mcmc_trace(draws)

beta_values <- calculate(beta_variable,draws) %>%
  lapply(
    variable_summaries
  )
mu_values <- calculate(mu,draws) %>% 
  lapply(
    variable_summaries
  )

Values <- list(
  size = Sard_cover,
  mu = colMeans(do.call(rbind,calculate(mu,draws))),
  beta = variable_summaries(do.call(rbind,calculate(beta_variable,draws)))
)
beta_parameter <- Values$beta[1,1]
parameter_df <- data.frame(
  cover = Sard_cover,
  alpha = (Values$mu * beta_parameter) / (1 - Values$mu),
  beta = beta_parameter
)
samples <- parameter_df %>% 
  apply(
    1,function(x){
      S <- extraDistr::rbbinom(n = 1000,size = x[1],alpha = x[2],beta = x[3])
      if (sum(is.na(S)) > 0) {
        print(x)
      }
      S <- S / x[1]
      return(quantile(S,c(0.05,0.50,0.95),na.rm = T))
    }) %>% 
  t %>%
  as.data.frame()
colnames(samples) <- c("Q_005","Q_050","Q_095")

prediction_matrix <- data.frame(
  pred = samples$Q_050,
  pred_005 = samples$Q_005,
  pred_095 = samples$Q_095,
  true = (Sard_count)/Values$size,
  prob = Values$mu,
  variant = replicate_data$variant,
  Gene = replicate_data$Gene,
  Age = replicate_data$Age,
  SardID = replicate_data$SardID,
  Replicate = replicate_data$Replicate
) 

prediction_matrix %>% 
  gather(key = 'key',value = 'value',true,prob) %>%
  ggplot(aes(x = Age,y = value,colour = key,shape = as.factor(Replicate))) + 
  geom_point() +
  geom_errorbar(aes(ymin = pred_005,ymax = pred_095)) +
  geom_line() +
  theme_minimal(base_size = 15) +
  facet_wrap(~ paste(variant,SardID),scales = 'free') + 
  theme(legend.position = "bottom")

Values$beta %>%
  saveRDS("models/overdispersion.RDS")