Technical replicate data show a degree of technical overdispersion. We can account for this in our current model by replacing the binomial likelihood on the counts with a beta-binomial, with a fixed \(\beta\) parameter and an \(\alpha\) parameter that depends on \(\beta\) and on the probability inferred by the model. In particular, using the expected value of the beta distribution, \(E[X] = \frac{\alpha}{\alpha+\beta}\), and setting \(E[X] = p\) gives \(\alpha = \frac{\beta p}{1 - p}\).
Considering the above, we can model the counts as \(counts \sim BB\left(n = coverage,\ \alpha = \frac{\beta p}{1 - p},\ \beta\right)\).
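As a quick sanity check of this reparametrisation (a minimal sketch with illustrative values, not estimates from this analysis), the mean of a beta-binomial parametrised this way is \(n \cdot p\):
# Illustrative values only; p and beta are not fitted quantities.
p <- 0.05                          # probability inferred by the model
beta <- 200                        # fixed overdispersion parameter
alpha <- (beta * p) / (1 - p)      # from E[X] = alpha / (alpha + beta) = p
x <- extraDistr::rbbinom(1e5, size = 1000, alpha = alpha, beta = beta)
mean(x) / 1000                     # approximately p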
# One-hot encode a factor/character vector: returns a matrix with one row per
# element and one column per (sorted) level, and prints the level order.
one_hot_encode <- function(factor_vector) {
  factor_levels <- sort(as.character(unique(factor_vector)))
  print(factor_levels)
  output <- sapply(
    factor_vector,
    function(x) {
      as.numeric(factor_levels == x)
    }
  )
  return(t(output))
}
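An illustrative call (not part of the original script) shows the expected shape of the output — one row per input element, one column per sorted level:
one_hot_encode(c("B", "A", "B"))
## [1] "A" "B"
##   [,1] [,2]
## B    0    1
## A    1    0
## B    0    1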
# Load the TruQ (known DNA mixture) and Sardinian technical replicate data.
tru_q_data <- read.table('data/TruQ.txt', header = TRUE)
# Halve VAF and MUTcount for variants on chromosome X.
replicate_data <- read.table('data/Replicates_all.txt', header = TRUE) %>%
  mutate(VAF = ifelse(CHR == "X", VAF / 2, VAF),
         MUTcount = ifelse(CHR == "X", floor(MUTcount / 2), MUTcount))
# Ages come from full_data, which is loaded earlier in the document.
ages <- merge(full_data, replicate_data, by = c("SardID", "Phase")) %>%
  select(Age, SardID, Phase) %>%
  unique
# Keep only variants observed in more than one phase for a given individual.
replicated_data_multiple <- replicate_data %>%
  group_by(SardID, variant) %>%
  summarise(count = length(unique(Phase))) %>%
  subset(count > 1)
replicate_data <- merge(
  replicate_data,
  ages,
  by = c("SardID", "Phase"),
  all = TRUE
) %>%
  subset(SardID %in% replicated_data_multiple$SardID &
           variant %in% replicated_data_multiple$variant)
# Observed vs. expected VAF for the TruQ mixtures, with 90% beta intervals.
tru_q_data %>%
  ggplot(aes(x = VAF_expected, y = VAF_observed)) +
  geom_point(aes(colour = as.factor(Replicate))) +
  geom_errorbar(
    aes(
      ymin = qbeta(0.05, MUTcount + 1, TOTALcount - MUTcount + 1),
      ymax = qbeta(0.95, MUTcount + 1, TOTALcount - MUTcount + 1)
    )) +
  theme_gerstung(base_size = 15) +
  facet_wrap(~ Gene) +
  ggsci::scale_color_lancet(name = "Replicate") +
  theme(legend.position = 'bottom') +
  xlab("VAF expected") +
  ylab("VAF observed") +
  ggtitle("TruQ data") +
  ggsave(useDingbats = FALSE, "figures/overdispersion/overdispersion_vis_truq.pdf",
         height = 14, width = 14)
# Replicate VAFs across phases for the Sardinian samples, with 90% beta intervals.
replicate_data %>%
  ggplot(aes(x = as.factor(Phase), y = VAF)) +
  geom_point(aes(colour = as.factor(Replicate)),
             size = 2) +
  geom_linerange(
    aes(
      ymin = qbeta(0.05, MUTcount + 1, TOTALcount - MUTcount + 1),
      ymax = qbeta(0.95, MUTcount + 1, TOTALcount - MUTcount + 1)
    )) +
  theme_gerstung(base_size = 15) +
  facet_wrap(~ paste(Gene, variant, SardID, sep = '-'), scales = 'free') +
  ggsci::scale_color_lancet(name = "Replicate") +
  theme(legend.position = 'bottom') +
  xlab("Phase") +
  ylab("VAF") +
  ggtitle("Replicate data from Sardinian samples") +
  ggsave(useDingbats = FALSE, "figures/overdispersion/overdispersion_vis_sard.pdf",
         height = 14, width = 14)
The intervals here are estimated using a beta distribution (\(\mathrm{Beta}(\alpha = count + 1,\ \beta = coverage - count + 1)\)), with the error bars spanning the 0.05 and 0.95 quantiles (a 90% equal-tailed interval).
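For instance, a minimal sketch with illustrative numbers (40 mutant reads out of 2,000 total):
# 90% equal-tailed interval for a VAF of 0.02 (40/2000), as drawn above.
count <- 40
coverage <- 2000
qbeta(c(0.05, 0.95), count + 1, coverage - count + 1)  # roughly 0.015 to 0.026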
# Shift VAFs by one read's worth of signal to keep them strictly inside (0, 1).
VAF_expected <- tru_q_data$VAF_expected + 1 / tru_q_data$TOTALcount
VAF_observed <- tru_q_data$VAF_observed + 1 / tru_q_data$TOTALcount
# Technical overdispersion model: beta parameter with a learned exponential rate.
beta_rate <- variable(lower = 0, upper = Inf)
beta_variable <- exponential(
  rate = beta_rate
)
distribution(VAF_expected) <- beta(
  shape1 = (VAF_observed * beta_variable) / (1 - VAF_observed),
  shape2 = beta_variable
)
m <- model(beta_variable, beta_rate)
draws <- mcmc(m,
              sampler = hmc(Lmin = 10, Lmax = 20),
              warmup = 2000,
              n_samples = 2000)
## running 4 chains simultaneously on up to 4 cores
## warmup 2000/2000 complete | <1% bad
## sampling 2000/2000 complete
mcmc_trace(draws)
# Posterior summaries for the technical overdispersion parameters.
technical_beta_values <- calculate(beta_variable, draws) %>%
  do.call(what = rbind) %>%
  variable_summaries()
technical_dispersion_rate_values <- calculate(beta_rate, draws) %>%
  do.call(what = rbind) %>%
  variable_summaries()
# Posterior predictive draws for each replicate observation. Columns are
# indexed by position: x[10] and x[12] correspond to MUTcount and TOTALcount,
# x[13] to VAF, x[14] to Age, and x[1:5] to SardID, Phase, Replicate, variant,
# Gene. technical_beta_values[1,1] is the point estimate of beta.
replicate_data_samples <- replicate_data %>%
  apply(1, FUN = function(x) {
    size <- as.numeric(x[12]) + 1
    count <- as.numeric(x[10]) + 1
    out <- data.frame(
      extraDistr::rbbinom(n = 1000, size = size,
                          alpha = ((count / size) * technical_beta_values[1, 1]) / (1 - count / size),
                          beta = technical_beta_values[1, 1])) / size
    out$SardID <- x[1]
    out$Phase <- x[2]
    out$Replicate <- x[3]
    out$variant <- x[4]
    out$Gene <- x[5]
    out$Age <- x[14]
    out$VAF <- x[13]
    return(out)
  }
  ) %>%
  do.call(what = rbind)
colnames(replicate_data_samples)[1] <- "Samples"
# apply() over rows coerces everything to character; restore Age and VAF.
replicate_data_samples[, c(7, 8)] <- apply(replicate_data_samples[, c(7, 8)], 2, as.numeric)
# Posterior predictive samples (small points) against observed VAFs (large points).
replicate_data_samples %>%
  ggplot(aes(x = as.numeric(Age), colour = Replicate)) +
  geom_point(aes(y = Samples),
             alpha = 0.1,
             size = 0.2,
             position = "jitter") +
  geom_point(aes(y = VAF),
             size = 2,
             alpha = 0.9) +
  theme_minimal(base_size = 15) +
  facet_wrap(~ paste(Gene, variant, SardID, sep = '-'), scales = 'free') +
  ggsci::scale_color_lancet(name = "Replicate") +
  theme(legend.position = 'bottom') +
  xlab("Age") +
  ylab("VAF")
Using technical replicates for known DNA quantities (\(truQ\)) and for actual patient data (\(SR\)), we can posit the following model:
\[ p_{truQ} = observed\ VAF_{truQ} + 1/coverage_{truQ} \\ \beta \sim Exponential(0.001) \\ \alpha_{truQ} = \frac{\beta p_{truQ}}{1 - p_{truQ}} \\ expected\ VAF_{truQ} + 1/coverage_{truQ} \sim Beta(\alpha_{truQ},\beta) \]
We need to adjust the expected and observed VAFs by the equivalent of one supporting read because the beta distribution only permits \(\alpha > 0\) and \(\beta > 0\): an observed VAF of 0 would yield \(\alpha = 0\).
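A minimal sketch of why the pseudocount is needed (with an illustrative \(\beta\)):
# Without the shift, a VAF of 0 yields alpha = 0, outside the beta
# distribution's parameter space; the 1/coverage shift avoids this.
beta <- 200                        # illustrative value
vaf <- 0
coverage <- 2000
(beta * vaf) / (1 - vaf)           # 0: invalid shape1
p <- vaf + 1 / coverage            # pseudocount shift
(beta * p) / (1 - p)               # ~0.1: strictly positive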
\[ r_{SR} = effect_{gene} + individual\ offset \\ p_{SR} = \tfrac{1}{2}\,ilogit(r_{SR}) \\ \alpha_{SR} = \frac{\beta p_{SR}}{1 - p_{SR}} \\ observed\ counts_{SR} \sim BB(coverage_{SR},\alpha_{SR},\beta) \]
Here, \(p_{SR}\) is parametrized as \(p_{SR} = \frac{1}{2}\,ilogit((b_{gene_{i}} + b_{c_{ji}})\,t + u_{ji})\); the factor of \(\frac{1}{2}\) caps the expected VAF of a heterozygous clone at 0.5, matching \(\mathtt{mu <- ilogit(r) * 0.5}\) in the code below.
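A minimal sketch of the resulting VAF trajectory (illustrative coefficients, not fitted values):
# plogis is base R's inverse-logit; the 0.5 factor mirrors mu <- ilogit(r) * 0.5.
b <- 0.15                          # illustrative growth coefficient (per year)
u <- -12                           # illustrative individual offset
t <- seq(40, 90, by = 10)          # age in years
plogis(b * t + u) * 0.5            # expected VAF rising towards its 0.5 cap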
For this subsubsection, the model is specified in scripts/estimating_overdispersion.R.
We used this model to infer the \(\mu_{\beta}\) and \(\sigma_{\beta}\) mentioned in the model specification above.
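Once the joint model below has been fitted, these can be read off the posterior draws of \(\beta\); a minimal sketch, assuming \(\mu_{\beta}\) and \(\sigma_{\beta}\) are taken as the posterior mean and standard deviation of \(\beta\) (the model specification above may apply a transformation):
# Hypothetical extraction of mu_beta and sigma_beta from the fitted draws.
beta_draws <- do.call(rbind, calculate(beta_variable, draws))
mu_beta <- mean(beta_draws)
sigma_beta <- sd(beta_draws)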
# As before, shift the TruQ VAFs to lie strictly inside (0, 1).
VAF_expected <- tru_q_data$VAF_expected + 1 / tru_q_data$TOTALcount
VAF_observed <- tru_q_data$VAF_observed + 1 / tru_q_data$TOTALcount
Sard_count <- replicate_data$MUTcount
Sard_cover <- replicate_data$TOTALcount
# One growth coefficient and one offset per individual-variant (clone) pair.
n_clones <- length(unique(paste(replicate_data$SardID, replicate_data$variant, sep = '-')))
clone_b <- normal(0, sqrt(sum(0.1^2, 0.1^2)), dim = c(n_clones, 1))
u <- uniform(min = -50, max = 0, dim = c(n_clones, 1))
individual_ind <- one_hot_encode(paste(replicate_data$SardID, replicate_data$variant, sep = '-'))
## [1] "158-DNMT3A_25466800_G_A" "2259-BCOR_39922886_A_T"
## [3] "2259-CBL_119142571_G_A" "2259-NOTCH1_139438556_T_C"
## [5] "2259-SF3B1_198266834_T_C" "2259-SF3B1_198267359_C_G"
## [7] "2259-ZRSR2_15809085_A_G" "260-ASXL1_31023231_G_T"
## [9] "260-PHF6_133527665_G_A" "260-SMC1A_53436412_C_T"
## [11] "260-SRSF2_74732959_G_A" "260-SRSF2_74732959_G_T"
## [13] "260-SRSF2_74733113_A_G" "847-EZH2_148511184_G_A"
## [15] "847-JAK2_5073770_G_T" "847-TET2_106164929_A_G"
offset <- individual_ind %*% u
# Alternative centring (commented out): replicate_data$Age - min(replicate_data$Age)
gene_effect <- individual_ind %*% clone_b * replicate_data$Age
r <- gene_effect + offset
# Scale by 0.5 so the expected VAF of a heterozygous clone is capped at 0.5.
mu <- ilogit(r) * 0.5
beta_rate <- variable(lower = 0, upper = Inf)
beta_variable <- exponential(
  rate = beta_rate
)
# The TruQ replicates inform the technical overdispersion parameter...
distribution(VAF_expected) <- beta(
  shape1 = (VAF_observed * beta_variable) / (1 - VAF_observed),
  shape2 = beta_variable
)
# ...while the Sardinian replicate counts inform the clone dynamics.
distribution(Sard_count) <- beta_binomial(
  size = Sard_cover,
  alpha = (mu * beta_variable) / (1 - mu),
  beta = beta_variable
)
m <- model(beta_variable, beta_rate, clone_b, u)
draws <- mcmc(m,
              sampler = hmc(Lmin = 400, Lmax = 500),
              warmup = 2000,
              n_samples = 1000,
              n_cores = 4)
## running 4 chains simultaneously on up to 4 cores
## warmup 2000/2000 complete
## sampling 1000/1000 complete
mcmc_trace(draws)
# Summarise posterior draws for the overdispersion parameter and fitted VAFs.
beta_values <- calculate(beta_variable, draws) %>%
  lapply(
    variable_summaries
  )
mu_values <- calculate(mu, draws) %>%
  lapply(
    variable_summaries
  )
Values <- list(
  size = Sard_cover,
  mu = colMeans(do.call(rbind, calculate(mu, draws))),
  beta = variable_summaries(do.call(rbind, calculate(beta_variable, draws)))
)
beta_parameter <- Values$beta[1, 1]
# Beta-binomial parameters for each observation, using the beta point estimate.
parameter_df <- data.frame(
  cover = Sard_cover,
  alpha = (Values$mu * beta_parameter) / (1 - Values$mu),
  beta = beta_parameter
)
# Posterior predictive quantiles (5%, 50%, 95%) of the VAF for each observation.
samples <- parameter_df %>%
  apply(
    1, function(x) {
      S <- extraDistr::rbbinom(n = 1000, size = x[1], alpha = x[2], beta = x[3])
      if (sum(is.na(S)) > 0) {
        print(x)
      }
      S <- S / x[1]
      return(quantile(S, c(0.05, 0.50, 0.95), na.rm = TRUE))
    }) %>%
  t %>%
  as.data.frame()
colnames(samples) <- c("Q_005", "Q_050", "Q_095")
prediction_matrix <- data.frame(
  pred = samples$Q_050,
  pred_005 = samples$Q_005,
  pred_095 = samples$Q_095,
  true = Sard_count / Values$size,
  prob = Values$mu,
  variant = replicate_data$variant,
  Gene = replicate_data$Gene,
  Age = replicate_data$Age,
  SardID = replicate_data$SardID,
  Replicate = replicate_data$Replicate
)
# Observed ("true") and fitted ("prob") VAFs with 90% predictive intervals.
prediction_matrix %>%
  gather(key = 'key', value = 'value', true, prob) %>%
  ggplot(aes(x = Age, y = value, colour = key, shape = as.factor(Replicate))) +
  geom_point() +
  geom_errorbar(aes(ymin = pred_005, ymax = pred_095)) +
  geom_line() +
  theme_minimal(base_size = 15) +
  facet_wrap(~ paste(variant, SardID), scales = 'free') +
  theme(legend.position = "bottom")
# Save the posterior summary of the overdispersion parameter for downstream use.
Values$beta %>%
  saveRDS("models/overdispersion.RDS")