vignettes/not-built-vignettes/reproduce-revised-paper-results.Rmd
reproduce-revised-paper-results.Rmd
InfectionTrees
is installed (devtools::install_github("skgallagher/InfectionTrees")
) and loaded along with the following libraries
library(InfectionTrees)
library(ggplot2)
library(knitr)
library(kableExtra)
library(tidyr)
library(dplyr)
If there are any occurrences of devtools::load_all()
, remove them and replace with library(InfectionTrees)
.
Expected run time is an approximation for of estimated time to run the script. They were originally run on a laptop with the following specs unless otherwise specified.
: Ubuntu
Distributor ID: Ubuntu 20.04.2 LTS
Description: 20.04
Release: focal
Codename
: x86_64
Architecture-mode(s): 32-bit, 64-bit
CPU op: Little Endian
Byte Order: 39 bits physical, 48 bits virtual
Address sizesCPU(s): 8
-line CPU(s) list: 0-7
OnThread(s) per core: 2
Core(s) per socket: 4
Socket(s): 1
node(s): 1
NUMA : GenuineIntel
Vendor ID: 6
CPU family: 142
Model: Intel(R) Core(TM) i7-10510U CPU @ 1.80GHz
Model name: 12
Stepping: 1281.004
CPU MHz: 4900.0000
CPU max MHz: 400.0000
CPU min MHz: 4599.93
BogoMIPS: VT-x
Virtualization: 128 KiB
L1d cache: 128 KiB
L1i cache: 1 MiB
L2 cache: 8 MiB
L3 cacheCPU(s): 0-7
NUMA node0 : KVM: Mitigation: VMX disabled
Vulnerability Itlb multihit: Not affected
Vulnerability L1tf: Not affected
Vulnerability Mds: Not affected
Vulnerability Meltdown: Mitigation; Speculative Store Bypass disabled v
Vulnerability Spec store bypass
ia prctl and seccomp: Mitigation; usercopy/swapgs barriers and __user
Vulnerability Spectre v1
pointer sanitization: Mitigation; Enhanced IBRS, IBPB conditional, RS
Vulnerability Spectre v2
B filling: Mitigation; TSX disabled
Vulnerability Srbds: Not affected
Vulnerability Tsx async abort: fpu vme de pse tsc msr pae mce cx8 apic sep mtr
Flags
r pge mca cmov pat pse36 clflush dts acpi mmx f
xsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rd
tscp lm constant_tsc art arch_perfmon pebs bts
rep_good nopl xtopology nonstop_tsc cpuid aperf
mperf pni pclmulqdq dtes64 monitor ds_cpl vmx e
st tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer
aes xsave avx f16c rdrand lahf_lm abm 3dnowpre
fetch cpuid_fault epb invpcid_single ssbd ibrs
ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpr
iority ept vpid ept_ad fsgsbase tsc_adjust bmi1
avx2 smep bmi2 erms invpcid mpx rdseed adx sma
p clflushopt intel_pt xsaveopt xsavec xgetbv1 x
saves dtherm ida arat pln pts hwp hwp_notify hw
p_act_window hwp_epp md_clear flush_l1d arch_ca pabilities
All code for reproducing our revised paper results may be found here.
Please clone the entire repository into your local directory and set your working directory to revised_paper_results/
The results include the following:
table1-binary-cov-base-model-sims/
run-base-sims.R
run-base-sims-boot.R
analyze-base-sims.R
table2-binary-cov-mot-model-sims/
outsider-sims.R
analyze-mot-sims.R
md-tb/
tab3-4-data-eda.R
tb-data-base-and-mot.R
bootstrap-sims.R
data-naive-model.R
data-naive-boot.R
analyze-data-big-table.R
fig4-most-likely-trees.R
enough-MC-trees/sampling-enough-mc-trees.R
table1-binary-cov-base-model-sims/
This table was produced by running the following three files in order.
run-base-sims.R
Expected run-time: ~3-5 hours
#!/usr/bin/env Rscript
## June 15, 2021
## SKG
## Rerun the simulations
## This is for the base model with a binary covariate
## We are using alpha values of .5, .7, .1
## bet0 values of -2.5, -1.5, and -2.75
## and beta1 values of -.5, 0, -.25
## There are 15 total sets (so not all combinations are used)
rep_from_lib <- TRUE
if(rep_from_lib){
library(InfectionTrees)
} else {
devtools::load_all()
}
library(dplyr)
set.seed(6172021)
## Simulations to try
M <- 100 # number of simulations for each parameter configuration
B <- 5000 # number of MC samples
K <- 1000 # number of clusters in a data set
par_df1 <- data.frame(
M = M,
B = B,
beta_0 = c(-2.5, -2.5, -2.5,
-1.5, -1.5, -1.5),
beta_1 = c(.5, 0, -.5,
1, 0, -1),
gamma = .5)
par_df2 <- data.frame(
M = M,
B = B,
beta_0 = c(-2.5, -2.5, -2.5,
-1.5, -1.5, -1.5),
beta_1 = c(.5, 0, -.5,
.5, 0, -.5),
gamma = .7)
par_df3 <- data.frame(
M = M,
B = B,
beta_0 = c(-2.75, -2.75, -2.75),
beta_1 = c(2.25, 0, -.25),
gamma = .1)
###########################################################################
## data frame of simulation parameter configurations
par_df <- dplyr::bind_rows(par_df1, par_df2, par_df3)
out_list <- vector(mode = "list", length = nrow(par_df))
#########################################################################
## Generate a small set of pre-sampled trees to get started
binary_cluster_summaries <- data.frame(cluster_size = c(1, 1, 2, 2, 2),
x_pos = c(0, 1, 0, 1, 2),
x_neg = c(1, 0, 2, 1, 0),
freq = 1)
mc_trees <- sample_mc_binary_cov(B = B,
observed_cluster_summaries =
binary_cluster_summaries) %>%
dplyr::select(-freq)
## Run the simulations
total_time <- proc.time()[3]
for(zz in 1:nrow(par_df)){
t_params <- proc.time()[3]
beta_0 <- par_df$beta_0[zz]
beta_1 <- par_df$beta_1[zz]
gamma <- par_df$gamma[zz]
cat(sprintf("Parameter set %d \n", zz))
cat(sprintf("beta_0 = %.2f, beta_1 = %.2f, gamma = %.2f\n",
beta_0, beta_1, gamma))
## For a given set of parameters
best_params_mat <- matrix(0, nrow = M, ncol = 2)
coverage_mat <- matrix(0, nrow = M, ncol = 4)
colnames(coverage_mat) <- c("lp_beta0", "lp_beta1",
"fi_beta0", "fi_beta1")
cluster_sizes <- numeric(K * M) # store all the cluster sizes from a simulated set of
## observed data
max_cluster_sizes <- matrix(M)
for(mm in 1:M){
t_sims <- proc.time()[3]
cat( sprintf("Simulation %d \n", mm))
## Simulate a branching process data set
## Distribution of xs
sample_covariates_df <- data.frame(x = c(rep(0, gamma * 10),
rep(1, (1-gamma) * 10)))
data <- simulate_bp(K = K ,
inf_params = c(beta_0, beta_1),
sample_covariates_df = sample_covariates_df,
covariate_names = "x",
max_size = 55)
## Summarize the data set
binary_cluster_summaries <- summarize_binary_clusters(data)
cluster_sizes[(mm-1) *K + (1:K)] <- rep(binary_cluster_summaries$cluster_size,
times = binary_cluster_summaries$freq)
max_cluster_sizes[mm] <- max(binary_cluster_summaries$cluster_size)
cat(sprintf("max cluster size is %d \n", max_cluster_sizes[mm]))
## ##################################3
## SAMPLE (ADDITIONAL) MC TREES
## #######################################
size_npos <- with(binary_cluster_summaries,
paste0(cluster_size, "-", x_pos))
sampled_size_npos <- with(mc_trees,
paste0(cluster_size, "-", x_pos))
missing_sample_ids <- which(!(size_npos %in% sampled_size_npos))
if(length(missing_sample_ids) > 0){
new_cluster_summaries <- binary_cluster_summaries[missing_sample_ids,]
print("Sampling the following trees")
print(size_npos[missing_sample_ids])
new_mc_trees <- sample_mc_binary_cov(B = B,
observed_cluster_summaries =
new_cluster_summaries) %>%
dplyr::select(-freq)
mc_trees <- bind_rows(mc_trees, new_mc_trees) %>%
arrange(cluster_size, x_pos)
}
#######################################
## OPTIMIZE
######################################
inf_params_0 <- c(0,0)
best_params <- optim(inf_params_0, fn = bp_loglike_binary_cov,
obs_data_summary = binary_cluster_summaries,
mc_samples_summary = mc_trees,
return_neg = TRUE,
lower = c(-5, -5),
upper = c(0, 5),
method = "L-BFGS-B",
hessian = TRUE)
best_params_mat[mm,] <- best_params$par
## Fisher information (approximately)
se <- sqrt(diag(solve(best_params$hessian)))
lower <- best_params$par - 1.96 * se
upper <- best_params$par + 1.96 * se
coverage_mat[mm, 3:4] <- ifelse(lower < c(beta_0, beta_1) &
upper > c(beta_0, beta_1),
1, 0)
print("best params")
print(best_params$par)
## Likelihood profiles
## lp_ests <- binary_likelihood_profs(best_pars = best_params$par,
## max_loglike = -best_params$value,
## obs_data_summary =
## binary_cluster_summaries,
## mc_samples_summary = mc_trees,
## alpha = .05)
# print("95% CI from LP")
# print(lp_ests)
print("95% CI from FI")
print(cbind(lower, upper))
## coverage_mat[mm, 1:2] <- ifelse(lp_ests[,2] < c(beta_0, beta_1) &
## lp_ests[,3] > c(beta_0, beta_1),
## 1, 0)
print("Data simulation time (hrs)")
print((proc.time()[3] - t_sims) / 3600, digits = 2)
print("Cumulative time (hrs)")
print((proc.time()[3] - total_time) / 3600, digits = 2)
}
print("Params simulation time (hrs)")
print((proc.time()[3] - t_params) / 3600, digits = 2)
cat(sprintf("beta_0 = %.2f, beta_1 = %.2f, gamma = %.2f\n ",
beta_0, beta_1, gamma))
print("Mean estimates")
print(colMeans(best_params_mat))
print("Median estimates")
print( apply(best_params_mat, 2, median))
print("2.5Q")
print( apply(best_params_mat, 2, quantile, probs = .025))
print("2.5Q")
print( apply(best_params_mat, 2, quantile, probs = .975))
print("Coverage beta0, beta1")
print(colMeans(coverage_mat))
print("Cluster sizes:")
cluster_size_summary <- quantile(cluster_sizes, probs = c(.5, .9, 1))
print(cluster_size_summary)
sims_list <- list(best_params_mat = best_params_mat,
set = zz,
max_cluster_sizes = max_cluster_sizes,
coverage = colMeans(coverage_mat),
cluster_size_summary = cluster_size_summary)
out_list[[zz]] <- sims_list
}
current_time <- Sys.time()
current_time <- gsub(" ", "_", current_time)
current_time <- gsub(":", ".", current_time)
fn <- paste0("simulation_results_", current_time, ".RDS")
saveRDS(out_list, fn)
run-base-sims-boot.R
Expected run-time: ~12-18 hours
#!/usr/bin/env Rscript
## June 18, 2021
## SKG
## Rerun the simulations
## This is for the base model with a binary covariate
## We are using alpha values of .5, .7, .1
## beta0 values of -2.5, -1.5, and -2.75
## and beta1 values of -.5, 0, -.25, 2.25
## There are 15 total sets (so not all combinations are used)
## This version includes a bootstrap resample woo
rep_from_lib <- TRUE
if(rep_from_lib){
library(InfectionTrees)
} else {
devtools::load_all()
}
library(dplyr)
set.seed(6172021)
## Simulations to try
M <- 100 # number of simulations for each parameter configuration
B <- 5000 # number of MC samples
K <- 1000 # number of clusters in a data set
n_boot <- 30
par_df1 <- data.frame(
M = M,
B = B,
beta_0 = c(-2.5, -2.5, -2.5,
-1.5, -1.5, -1.5),
beta_1 = c(.5, 0, -.5,
1, 0, -1),
gamma = .5)
par_df2 <- data.frame(
M = M,
B = B,
beta_0 = c(-2.5, -2.5, -2.5,
-1.5, -1.5, -1.5),
beta_1 = c(.5, 0, -.5,
.5, 0, -.5),
gamma = .7)
par_df3 <- data.frame(
M = M,
B = B,
beta_0 = c(-2.75, -2.75, -2.75),
beta_1 = c(2.25, 0, -.25),
gamma = .1)
###########################################################################
## data frame of simulation parameter configurations
par_df <- dplyr::bind_rows(par_df1, par_df2, par_df3)
out_list <- vector(mode = "list", length = nrow(par_df))
#########################################################################
## Generate a small set of pre-sampled trees to get started
binary_cluster_summaries <- data.frame(cluster_size = c(1, 1, 2, 2, 2),
x_pos = c(0, 1, 0, 1, 2),
x_neg = c(1, 0, 2, 1, 0),
freq = 1)
mc_trees <- sample_mc_binary_cov(B = B,
observed_cluster_summaries =
binary_cluster_summaries) %>%
dplyr::select(-freq)
## Run the simulations
total_time <- proc.time()[3]
for(zz in 1:nrow(par_df)){
t_params <- proc.time()[3]
beta_0 <- par_df$beta_0[zz]
beta_1 <- par_df$beta_1[zz]
gamma <- par_df$gamma[zz]
cat(sprintf("Parameter set %d \n", zz))
cat(sprintf("beta_0 = %.2f, beta_1 = %.2f, gamma = %.2f\n",
beta_0, beta_1, gamma))
## For a given set of parameters
best_params_mat <- matrix(0, nrow = M, ncol = 2)
coverage_mat <- matrix(0, nrow = M, ncol = 4)
colnames(coverage_mat) <- c("boot_beta0", "boot_beta1",
"fi_beta0", "fi_beta1")
cluster_sizes <- numeric(K * M) # store all the cluster sizes from a simulated set of
## observed data
max_cluster_sizes <- matrix(M)
for(mm in 1:M){
t_sims <- proc.time()[3]
cat( sprintf("Simulation %d \n", mm))
## Simulate a branching process data set
## Distribution of xs
sample_covariates_df <- data.frame(x = c(rep(0, gamma * 10),
rep(1, (1-gamma) * 10)))
data <- simulate_bp(K = K ,
inf_params = c(beta_0, beta_1),
sample_covariates_df = sample_covariates_df,
covariate_names = "x",
max_size = 60)
## Summarize the data set
binary_cluster_summaries <- summarize_binary_clusters(data)
cluster_sizes[(mm-1) *K + (1:K)] <- rep(binary_cluster_summaries$cluster_size,
times = binary_cluster_summaries$freq)
max_cluster_sizes[mm] <- max(binary_cluster_summaries$cluster_size)
cat(sprintf("max cluster size is %d \n", max_cluster_sizes[mm]))
## ##################################3
## SAMPLE (ADDITIONAL) MC TREES
## #######################################
size_npos <- with(binary_cluster_summaries,
paste0(cluster_size, "-", x_pos))
sampled_size_npos <- with(mc_trees,
paste0(cluster_size, "-", x_pos))
missing_sample_ids <- which(!(size_npos %in% sampled_size_npos))
if(length(missing_sample_ids) > 0){
new_cluster_summaries <- binary_cluster_summaries[missing_sample_ids,]
print("Sampling the following trees")
print(size_npos[missing_sample_ids])
new_mc_trees <- sample_mc_binary_cov(B = B,
observed_cluster_summaries =
new_cluster_summaries) %>%
dplyr::select(-freq)
mc_trees <- bind_rows(mc_trees, new_mc_trees) %>%
arrange(cluster_size, x_pos)
}
#######################################
## OPTIMIZE
######################################
inf_params_0 <- c(0,0)
best_params <- optim(inf_params_0, fn = bp_loglike_binary_cov,
obs_data_summary = binary_cluster_summaries,
mc_samples_summary = mc_trees,
return_neg = TRUE,
lower = c(-5, -5),
upper = c(0, 5),
method = "L-BFGS-B",
hessian = TRUE)
best_params_mat[mm,] <- best_params$par
## Fisher information (approximately)
se <- sqrt(diag(solve(best_params$hessian)))
lower <- best_params$par - 1.96 * se
upper <- best_params$par + 1.96 * se
coverage_mat[mm, 3:4] <- ifelse(lower < c(beta_0, beta_1) &
upper > c(beta_0, beta_1),
1, 0)
## #####################
## BOOSTRAP TO GET A SE
## #####################
boot_mat <- matrix(0, nrow = n_boot, ncol = 2)
for(bb in 1:n_boot){
## Resample data
resampled_cluster_inds <- sample(1:nrow(binary_cluster_summaries),
size = sum(binary_cluster_summaries$freq),
prob = binary_cluster_summaries$freq /
sum(binary_cluster_summaries$freq),
replace = TRUE)
resampled_clusters <- binary_cluster_summaries[resampled_cluster_inds,] %>%
group_by(cluster_size, x_pos, x_neg) %>%
summarize(freq = n(), .groups = "drop")
## Get estimate of beta
best_params_boot <- optim(inf_params_0, fn = bp_loglike_binary_cov,
obs_data_summary = resampled_clusters,
mc_samples_summary = mc_trees,
return_neg = TRUE,
lower = c(-5, -5),
upper = c(0, 5),
method = "L-BFGS-B")
boot_mat[bb,] <- best_params_boot$par
}
se_boot <- apply(boot_mat, 2, sd)
lower_boot <- best_params$par - 1.96 * se_boot
upper_boot <- best_params$par + 1.96 * se_boot
coverage_mat[mm, 1:2] <- ifelse(lower_boot < c(beta_0, beta_1) &
upper_boot > c(beta_0, beta_1),
1, 0)
## #####################
print("best params")
print(best_params$par)
## Likelihood profiles
## lp_ests <- binary_likelihood_profs(best_pars = best_params$par,
## max_loglike = -best_params$value,
## obs_data_summary =
## binary_cluster_summaries,
## mc_samples_summary = mc_trees,
## alpha = .05)
# print("95% CI from LP")
# print(lp_ests)
print("95% CI from FI")
print(cbind(lower, upper))
## coverage_mat[mm, 1:2] <- ifelse(lp_ests[,2] < c(beta_0, beta_1) &
## lp_ests[,3] > c(beta_0, beta_1),
## 1, 0)
print("Data simulation time (hrs)")
print((proc.time()[3] - t_sims) / 3600, digits = 2)
print("Cumulative time (hrs)")
print((proc.time()[3] - total_time) / 3600, digits = 2)
}
print("Params simulation time (hrs)")
print((proc.time()[3] - t_params) / 3600, digits = 2)
cat(sprintf("beta_0 = %.2f, beta_1 = %.2f, gamma = %.2f\n ",
beta_0, beta_1, gamma))
print("Mean estimates")
print(colMeans(best_params_mat))
print("Median estimates")
print( apply(best_params_mat, 2, median))
print("2.5Q")
print( apply(best_params_mat, 2, quantile, probs = .025))
print("2.5Q")
print( apply(best_params_mat, 2, quantile, probs = .975))
print("Coverage beta0, beta1")
print(colMeans(coverage_mat))
print("Cluster sizes:")
cluster_size_summary <- quantile(cluster_sizes, probs = c(.5, .9, 1))
print(cluster_size_summary)
sims_list <- list(best_params_mat = best_params_mat,
set = zz,
max_cluster_sizes = max_cluster_sizes,
coverage = colMeans(coverage_mat),
cluster_size_summary = cluster_size_summary,
n_boot = n_boot)
out_list[[zz]] <- sims_list
}
current_time <- Sys.time()
current_time <- gsub(" ", "_", current_time)
current_time <- gsub(":", ".", current_time)
fn <- paste0("simulation_results_boot_", current_time, ".RDS")
saveRDS(out_list, fn)
analyze-base-sims.R
Expected run-time: ~seconds, given the above completed results.
## SKG
## July 31, 2020
## Analyze simulations
## JUNE 15, 2021 UPDATES FOR JCGS REVISION
library(ggplot2)
library(tidyverse)
library(gridExtra)
library(knitr)
library(kableExtra)
library(viridis)
fn <- list.files()
fn <- grep(".RDS", fn, value = TRUE)
out_list <- readRDS(max(fn))
## Parameter sets
## Simulations to try
M <- 100 # number of simulations for each parameter configuration
B <- 5000 # number of MC samples
K <- 1000 # number of clusters in a data set
par_df1 <- data.frame(
M = M,
B = B,
beta_0 = c(-2.5, -2.5, -2.5,
-1.5, -1.5, -1.5),
beta_1 = c(.5, 0, -.5,
1, 0, -1),
gamma = .5)
par_df2 <- data.frame(
M = M,
B = B,
beta_0 = c(-2.5, -2.5, -2.5,
-1.5, -1.5, -1.5),
beta_1 = c(.5, 0, -.5,
.5, 0, -.5),
gamma = .7)
par_df3 <- data.frame(
M = M,
B = B,
beta_0 = c(-2.75, -2.75, -2.75),
beta_1 = c(2.25, 0, -.25),
gamma = .1)
###########################################################################
## data frame of simulation parameter configurations
par_df <- dplyr::bind_rows(par_df1, par_df2, par_df3)
sizes <- as.data.frame(do.call('rbind', lapply(out_list, "[[", 5)))
cov <- as.data.frame(do.call('rbind', lapply(out_list, "[[", 4))[, 1:2])
mean_pars <- as.data.frame(do.call('rbind', lapply(
lapply(out_list, "[[", 1),
colMeans)))
mean_pars <- as.data.frame(do.call('rbind', lapply(
lapply(out_list, "[[", 1),
function(mat){
apply(mat, 2, mean)
}))) %>%
rename(mean_beta0 = V1,
mean_beta1 = V2)
med_pars <- as.data.frame(do.call('rbind', lapply(
lapply(out_list, "[[", 1),
function(mat){
apply(mat, 2, median)
}))) %>%
rename(med_beta0 = V1,
med_beta1 = V2)
iqr_pars <- as.data.frame(do.call('rbind', lapply(
lapply(out_list, "[[", 1),
function(mat){
apply(mat, 2, function(x){
quantile(x, prob = .75) - quantile(x, prob = .25)
})
}))) %>%
rename(IQR_beta0 = V1,
IQR_beta1 = V2)
se_pars <- as.data.frame(do.call('rbind', lapply(
lapply(out_list, "[[", 1),
function(mat){
apply(mat, 2, function(x){
sd(x)
})
}))) %>%
rename(se_beta0 = V1,
se_beta1 = V2)
summary_df <- cbind(par_df,
mean_pars,
med_pars,
iqr_pars,
se_pars,
cov,
sizes) %>%
arrange(`100%`, `90%`)
pretty_df <- summary_df %>%
mutate(sizes = paste0("(", `100%`,
", ", `90%`,
", ", `50%`,
")"),
prob_pos = gamma,
betas = paste0("(", beta_0,
", ", beta_1,
")"),
est_betas1 = paste0("(", round(mean_beta0, 2),
" (", round(se_beta0, 2),
"), ", round(mean_beta1, 2),
" (", round(se_beta1, 2), "))"),
est_betas2 = paste0("(", round(med_beta0, 2),
" (", round(IQR_beta0, 2),
"), ", round(med_beta1, 2),
" (", round(IQR_beta1, 2), "))"),
cov = paste0("[", boot_beta0 * 100, ", ",
boot_beta1 * 100, "]\\%")
) %>%
select(sizes, prob_pos, betas,
est_betas1,
est_betas2,
cov)
pretty_df %>%
knitr::kable("latex", digits = 2, booktabs = TRUE, align = "c",
linesep = "",
col.names = c("$|\\mathcal{C}|$ (Max, 90Q, 50Q)",
"P(+)",
"$(\\beta_0$, $\\beta_1)$",
"(Mean $\\hat{\\beta}_0$, (SE), Mean $\\hat{\\beta}_1$ (SE))",
"(Median $\\hat{\\beta}_0$, (IQR), Median $\\hat{\\beta}_1$ (IQR))",
"Coverage [$\\beta_0$, $\\beta_1$]"
), escape = FALSE,
caption = paste0("Results of simulations where we generate from Model in Eq. \\eqref{eq:sim-mod}.",
" For each row in the table, we generate 100 sets of outbreak data each with 1000 transmission trees for a given $P(+)$, $\\beta_0$ and $\\beta_1$. ",
"to estimate the MLE of $\\bm{\\beta}$.",
"The mean and bootstrap standard error (SE) along with the median and inner quartile range (IQR) ",
"are reported over the 100 sets of outbreak data. ",
"Additionally we report the coverage where a 95\\% CI was derived ",
"using a bootstrap estimate of the SE."
),
label = "sim-results") %>%
kableExtra::kable_styling(latex_options = c("scale_down",
"striped"))
table2-binary-cov-mot-model-sims/
This table was produced by running the following two files in order.
outsider-sims.R
Expected run-time: ~30 hours
#!/usr/bin/env Rscript
## June 15, 2021
## SKG
## Rerun the simulations
## This is for the base model with a binary covariate
## We are using alpha values of .5, .7, .1
## bet0 values of -2.5, -1.5, and -2.75
## and beta1 values of -.5, 0, -.25
## There are 15 total sets (so not all combinations are used)
rep_from_lib <- TRUE
if(rep_from_lib){
library(InfectionTrees)
} else {
devtools::load_all()
}
library(dplyr)
set.seed(6172021)
## Simulations to try
M <- 100 # number of simulations for each parameter configuration
B <- 5000 # number of MC samples
K <- 1000 # number of clusters in a data set
par_df1 <- data.frame(
M = M,
B = B,
beta_0 = c(-2.5, -2.5, -2.5,
-1.5, -1.5, -1.5),
beta_1 = c(.5, 0, -.5,
1, 0, -1),
gamma = .5)
par_df2 <- data.frame(
M = M,
B = B,
beta_0 = c(-2.5, -2.5, -2.5,
-1.5, -1.5, -1.5),
beta_1 = c(.5, 0, -.5,
.5, 0, -.5),
gamma = .7)
par_df3 <- data.frame(
M = M,
B = B,
beta_0 = c(-2.75, -2.75, -2.75),
beta_1 = c(2.25, 0, -.25),
gamma = .1)
###########################################################################
## data frame of simulation parameter configurations
par_df <- dplyr::bind_rows(par_df1, par_df2, par_df3)
out_list <- vector(mode = "list", length = nrow(par_df))
#########################################################################
## Generate a small set of pre-sampled trees to get started
binary_cluster_summaries <- data.frame(cluster_size = c(1, 1, 2, 2, 2),
x_pos = c(0, 1, 0, 1, 2),
x_neg = c(1, 0, 2, 1, 0),
freq = 1)
mc_trees <- sample_mc_binary_cov(B = B,
observed_cluster_summaries =
binary_cluster_summaries) %>%
dplyr::select(-freq)
## Run the simulations
total_time <- proc.time()[3]
for(zz in 1:nrow(par_df)){
t_params <- proc.time()[3]
beta_0 <- par_df$beta_0[zz]
beta_1 <- par_df$beta_1[zz]
gamma <- par_df$gamma[zz]
cat(sprintf("Parameter set %d \n", zz))
cat(sprintf("beta_0 = %.2f, beta_1 = %.2f, gamma = %.2f\n",
beta_0, beta_1, gamma))
## For a given set of parameters
best_params_mat <- matrix(0, nrow = M, ncol = 2)
coverage_mat <- matrix(0, nrow = M, ncol = 4)
colnames(coverage_mat) <- c("lp_beta0", "lp_beta1",
"fi_beta0", "fi_beta1")
cluster_sizes <- numeric(K * M) # store all the cluster sizes from a simulated set of
## observed data
max_cluster_sizes <- matrix(M)
for(mm in 1:M){
t_sims <- proc.time()[3]
cat( sprintf("Simulation %d \n", mm))
## Simulate a branching process data set
## Distribution of xs
sample_covariates_df <- data.frame(x = c(rep(0, gamma * 10),
rep(1, (1-gamma) * 10)))
data <- simulate_bp(K = K ,
inf_params = c(beta_0, beta_1),
sample_covariates_df = sample_covariates_df,
covariate_names = "x",
max_size = 55)
## Summarize the data set
binary_cluster_summaries <- summarize_binary_clusters(data)
cluster_sizes[(mm-1) *K + (1:K)] <- rep(binary_cluster_summaries$cluster_size,
times = binary_cluster_summaries$freq)
max_cluster_sizes[mm] <- max(binary_cluster_summaries$cluster_size)
cat(sprintf("max cluster size is %d \n", max_cluster_sizes[mm]))
## ##################################3
## SAMPLE (ADDITIONAL) MC TREES
## #######################################
size_npos <- with(binary_cluster_summaries,
paste0(cluster_size, "-", x_pos))
sampled_size_npos <- with(mc_trees,
paste0(cluster_size, "-", x_pos))
missing_sample_ids <- which(!(size_npos %in% sampled_size_npos))
if(length(missing_sample_ids) > 0){
new_cluster_summaries <- binary_cluster_summaries[missing_sample_ids,]
print("Sampling the following trees")
print(size_npos[missing_sample_ids])
new_mc_trees <- sample_mc_binary_cov(B = B,
observed_cluster_summaries =
new_cluster_summaries) %>%
dplyr::select(-freq)
mc_trees <- bind_rows(mc_trees, new_mc_trees) %>%
arrange(cluster_size, x_pos)
}
#######################################
## OPTIMIZE
######################################
inf_params_0 <- c(0,0)
best_params <- optim(inf_params_0, fn = bp_loglike_binary_cov,
obs_data_summary = binary_cluster_summaries,
mc_samples_summary = mc_trees,
return_neg = TRUE,
lower = c(-5, -5),
upper = c(0, 5),
method = "L-BFGS-B",
hessian = TRUE)
best_params_mat[mm,] <- best_params$par
## Fisher information (approximately)
se <- sqrt(diag(solve(best_params$hessian)))
lower <- best_params$par - 1.96 * se
upper <- best_params$par + 1.96 * se
coverage_mat[mm, 3:4] <- ifelse(lower < c(beta_0, beta_1) &
upper > c(beta_0, beta_1),
1, 0)
print("best params")
print(best_params$par)
## Likelihood profiles
## lp_ests <- binary_likelihood_profs(best_pars = best_params$par,
## max_loglike = -best_params$value,
## obs_data_summary =
## binary_cluster_summaries,
## mc_samples_summary = mc_trees,
## alpha = .05)
# print("95% CI from LP")
# print(lp_ests)
print("95% CI from FI")
print(cbind(lower, upper))
## coverage_mat[mm, 1:2] <- ifelse(lp_ests[,2] < c(beta_0, beta_1) &
## lp_ests[,3] > c(beta_0, beta_1),
## 1, 0)
print("Data simulation time (hrs)")
print((proc.time()[3] - t_sims) / 3600, digits = 2)
print("Cumulative time (hrs)")
print((proc.time()[3] - total_time) / 3600, digits = 2)
}
print("Params simulation time (hrs)")
print((proc.time()[3] - t_params) / 3600, digits = 2)
cat(sprintf("beta_0 = %.2f, beta_1 = %.2f, gamma = %.2f\n ",
beta_0, beta_1, gamma))
print("Mean estimates")
print(colMeans(best_params_mat))
print("Median estimates")
print( apply(best_params_mat, 2, median))
print("2.5Q")
print( apply(best_params_mat, 2, quantile, probs = .025))
print("2.5Q")
print( apply(best_params_mat, 2, quantile, probs = .975))
print("Coverage beta0, beta1")
print(colMeans(coverage_mat))
print("Cluster sizes:")
cluster_size_summary <- quantile(cluster_sizes, probs = c(.5, .9, 1))
print(cluster_size_summary)
sims_list <- list(best_params_mat = best_params_mat,
set = zz,
max_cluster_sizes = max_cluster_sizes,
coverage = colMeans(coverage_mat),
cluster_size_summary = cluster_size_summary)
out_list[[zz]] <- sims_list
}
current_time <- Sys.time()
current_time <- gsub(" ", "_", current_time)
current_time <- gsub(":", ".", current_time)
fn <- paste0("simulation_results_", current_time, ".RDS")
saveRDS(out_list, fn)
analyze-mot-sims.R
Expected run-time: ~seconds, given the above completed results.
## SKG
## July 31, 2020
## Analyze simulations
## JUNE 15, 2021 UPDATES FOR JCGS REVISION
library(ggplot2)
library(tidyverse)
library(gridExtra)
library(knitr)
library(kableExtra)
library(viridis)
## Biowulf output from outsider-sims.R
folder_name <- "biowulf-results"
filenames <- list.files(file.path(folder_name))
out_list <- lapply(file.path(folder_name, filenames),
readRDS)[[1]]
## Parameter sets
## Simulations to try
M <- 100 # number of simulations for each parameter configuration
B <- 10000 # number of MC samples
K <- 1000 # number of clusters in a data set
par_df1 <- data.frame(
M = M,
B = B,
beta_0 = c(-2.5, -2.5, -2.5,
-1.5, -1.5, -1.5),
beta_1 = c(.5, 0, -.5,
1, 0, -1),
gamma = .5)
par_df2 <- data.frame(
M = M,
B = B,
beta_0 = c(-2.5, -2.5, -2.5,
-1.5, -1.5, -1.5),
beta_1 = c(.5, 0, -.5,
.5, 0, -.5),
gamma = .7)
par_df3 <- data.frame(
M = M,
B = B,
beta_0 = c(-2.75, -2.75, -2.75),
beta_1 = c(2.25, 0, -.25),
gamma = .1)
###########################################################################
## data frame of simulation parameter configurations
par_df <- dplyr::bind_rows(par_df1, par_df2, par_df3)
sizes <- as.data.frame(do.call('rbind', lapply(out_list, "[[", 7)))
cov <- as.data.frame(do.call('rbind', lapply(out_list, "[[", 8))[, 1:2])
colnames(cov) <- c("fi_beta0", "fi_beta1")
mean_pars <- as.data.frame(do.call('rbind', lapply(
lapply(out_list, "[[", 1),
colMeans)))
mean_pars <- as.data.frame(do.call('rbind', lapply(
lapply(out_list, "[[", 1),
function(mat){
apply(mat, 2, mean)
}))) %>%
rename(mean_beta0 = V1,
mean_beta1 = V2)
med_pars <- as.data.frame(do.call('rbind', lapply(
lapply(out_list, "[[", 1),
function(mat){
apply(mat, 2, median)
}))) %>%
rename(med_beta0 = V1,
med_beta1 = V2)
iqr_pars <- as.data.frame(do.call('rbind', lapply(
lapply(out_list, "[[", 1),
function(mat){
apply(mat, 2, function(x){
quantile(x, prob = .75) - quantile(x, prob = .25)
})
}))) %>%
rename(IQR_beta0 = V1,
IQR_beta1 = V2)
se_pars <- as.data.frame(do.call('rbind', lapply(
lapply(out_list, "[[", 1),
function(mat){
apply(mat, 2, function(x){
sd(x)
})
}))) %>%
rename(se_beta0 = V1,
se_beta1 = V2)
summary_df <- cbind(par_df,
mean_pars,
med_pars,
iqr_pars,
se_pars,
cov,
sizes) %>%
arrange(max, q90)
pretty_df <- summary_df %>%
mutate(sizes = paste0("(", `max`,
", ", `q90`,
", ", `med`,
")"),
prob_pos = gamma,
betas = paste0("(", beta_0,
", ", beta_1,
")"),
est_betas1 = paste0("(", round(mean_beta0, 2),
" (", round(se_beta0, 2),
"), ", round(mean_beta1, 2),
" (", round(se_beta1, 2), "))"),
est_betas2 = paste0("(", round(med_beta0, 2),
" (", round(IQR_beta0, 2),
"), ", round(med_beta1, 2),
" (", round(IQR_beta1, 2), "))"),
cov = paste0("[", fi_beta0 * 100, ", ",
fi_beta1 * 100, "]\\%")
) %>%
select(sizes, prob_pos, betas,
est_betas1,
est_betas2,
cov)
pretty_df %>%
knitr::kable("latex", digits = 2, booktabs = TRUE, align = "c",
linesep = "",
col.names = c("$|\\mathcal{C}|$ (Max, 90Q, 50Q)",
"P(+)",
"$(\\beta_0$, $\\beta_1)$",
"(Mean $\\hat{\\beta}_0$, (SE), Mean $\\hat{\\beta}_1$ (SE))",
"(Median $\\hat{\\beta}_0$, (IQR), Median $\\hat{\\beta}_1$ (IQR))",
"Coverage [$\\beta_0$, $\\beta_1$]"
), escape = FALSE,
caption = paste0("Results of simulations where we first generate data from Model in Eq. \\eqref{eq:sim-mod}",
"and then exclude the seed individual to reflect the outsider generation process",
" For each row in the table, we generate 100 sets of outbreak data each with 1000 transmission trees for a given $P(+)$, $\\beta_0$ and $\\beta_1$. ",
"to estimate the MLE of $\\bm{\\beta}$. However, fewer than 1000 transmission trees are observed once we exclude the first generation (the outside infection).",
"The mean and bootstrap standard error (SE) along with the median and inner quartile range (IQR) ",
"are reported over the 100 sets of outbreak data. ",
"Additionally we report the coverage where a 95\\% CI was derived ",
"using a Fisher information derived estimate of the SE."
),
label = "sim-results-mot") %>%
kableExtra::kable_styling(latex_options = c("scale_down",
"striped"))
md-tb/
tab3-4-data-eda.R
Expected run-time: ~seconds
## SKG
## June 21, 2021
## Revisions for JCGS
## Tables 3 and 4 in manuscript
## Looking at distribution of cluster sizes
## and a cross section of part of the data (cluster 15)
rep_from_lib <- TRUE
if(rep_from_lib){
library(InfectionTrees)
} else {
devtools::load_all()
}
library(tidyverse)
library(knitr)
library(kableExtra)
data(tb_clean)
## Table 3
### grouped individuals
clust_sizes <- tb_clean %>%
group_by(group) %>%
summarize(size = n())
tab <- data.frame(table(clust_sizes$size))
colnames(tab) <- c("Cluster Size", "Freq.")
kable(t(tab),
format = "latex", row.names = TRUE,
booktabs = TRUE,
caption = "Distribution of cluster size within the Maryland 2003-2009 TB data.",
label = "data-clust-sizes")
## Table 4
## Example cluster
tb_clean %>% filter( group == 15) %>%
select(group, county, rel_time, hivstatus, race, sex, spsmear) %>%
arrange(rel_time) %>%
mutate(county = ifelse(county == "PRINCE GEORGES", "Prince Georges",
"Montgomery")) %>%
kable(format = "latex",
col.names = c("Cluster ID", "County", "Relative day",
"HIV status", "Race", "Sex", "Smear"),
booktabs = TRUE,
caption = "Example of individuals and some of their features within a single cluster", label = "ex-clust") %>%
kable_styling(latex_options="scale_down")
To get these results, we need to run the three models (base, multiple outside transmissions, and naive) and then combine them together.
tb-data-base-and-mot.R
Expected run-time: ~30 hours
#!/usr/bin/env Rscript
rep_from_lib <- TRUE
if(rep_from_lib){
library(InfectionTrees)
} else {
devtools::load_all()
}
library(tidyr)
library(dplyr)
library(kableExtra)
library(ggplot2)
library(forcats)
library(readr)
theme_set(theme_bw() + theme(axis.title = element_text()))
## --------------------------------------------------------------------------------------------------------
clusters <- tb_clean %>%
dplyr::mutate(smear = ifelse(spsmear == "Positive",
1, 0),
cluster_id = group,
hiv_f = ifelse(hivstatus == "Negative", "neg",
ifelse(hivstatus == "Positive", "pos",
"unk"))) %>%
dplyr::mutate(hiv_neg_pos = ifelse(hiv_f == "neg", 1, 0),
hiv_unk_pos = ifelse(hiv_f == "unk", 1, 0)) %>%
dplyr::group_by(cluster_id) %>%
dplyr::mutate(rel_time = as.numeric(rel_time / 365)) %>%
dplyr::mutate(cluster_size = dplyr::n()) %>%
dplyr::ungroup() %>%
mutate(race_f = fct_collapse(race,
white = "White",
black = "Black or African American",
asian = "Asian")) %>%
mutate(race_asian_white = ifelse(race_f == "asian", 1, 0),
race_black_white = ifelse(race_f == "black", 1, 0)) %>%
select(cluster_id, smear,
hiv_neg_pos,
hiv_unk_pos,
rel_time,
race_asian_white,
race_black_white,
cluster_size)
## ----results = 'hide'------------------------------------------------------------------------------------
K <- 10000
my_seed <- 6172020
set.seed(my_seed)
## MODELS
## models
covariate_list <- vector(mode = "list", length = 5)
covariate_list[[1]] <- NA
covariate_list[[2]] <- "smear"
covariate_list[[3]] <- c("smear",
"hiv_neg_pos", "hiv_unk_pos")
covariate_list[[4]] <- c("smear",
"hiv_neg_pos", "hiv_unk_pos",
"rel_time")
covariate_list[[5]] <- c("smear",
"hiv_neg_pos", "hiv_unk_pos",
"rel_time",
"race_asian_white",
"race_black_white")
## Set up outputs
loglike_df <- data.frame(model = 1:length(covariate_list),
n_params = c(1, 2, 4, 5, 7),
loglike = 0,
aic = 0)
beta_mat1 <- matrix(0, nrow = 1, ncol = 4)
rownames(beta_mat1) <- c("Intercept")
colnames(beta_mat1) <- c("Est.", "lower", "upper", "SE")
beta_list <- vector(mode = "list", length = length(covariate_list))
beta_list[[1]] <- beta_mat1
for(ii in 2:length(covariate_list)){
mat <- matrix(0, nrow = length(covariate_list[[ii]]) + 1,
ncol = 4)
rownames(mat) <- c("Intercept", covariate_list[[ii]])
colnames(mat) <- c("Est.", "lower", "upper", "SE")
beta_list[[ii]] <- mat
}
## ----results = 'hide'------------------------------------------------------------------------------------
t_init <- proc.time()[3]
## Sample MC trees all at once
t0 <- proc.time()[3]
## Sample all MC clusters at once for each of the 5 models
mc_trees <- sample_mc_trees(clusters,
B = K,
multiple_outside_transmissions = FALSE,
covariate_names = covariate_list[[length(covariate_list)]])
print(proc.time()[3] - t0)
## Fit each of the models
for(jj in 1:length(covariate_list)){
covariate_names <- covariate_list[[jj]]
print("Model:")
print(covariate_names)
if(is.na(covariate_names[1])){
init_params <- 0
} else{
init_params <- rep(0, length(covariate_names) + 1)
}
## Optimize
print("Optimizing")
bds <- rep(-5, length(init_params))
if(length(covariate_names) > 5){
bds <- rep(-4, length(init_params))
}
lower_bds <- bds
upper_bds <- -bds
cov_mat <- covariate_df_to_mat(mc_trees,
cov_names = covariate_names)
t1 <- proc.time()[3]
best_params <- optim(par = init_params,
fn = general_loglike,
mc_trees = data.table::as.data.table(mc_trees),
return_neg = TRUE,
cov_mat = cov_mat,
cov_names = covariate_names,
use_outsider_prob = FALSE,
multiple_outside_transmissions = FALSE,
method = "L-BFGS-B",
lower = lower_bds,
upper = upper_bds,
hessian = TRUE
)
t2 <- proc.time()[3] - t1
print(paste("Optimization time:", round( t2 / 60, 3),
"min"))
beta_list[[jj]][,1] <- best_params$par
beta_list[[jj]][, 4] <- sqrt(diag(solve(best_params$hessian))) ## SE from Fisher info
print("best params:")
print(beta_list[[jj]])
loglike_df$loglike[jj] <- -best_params$val
print(paste("Total time:", round( (proc.time()[3] - t_init) / 3600, 3),
"hrs"))
}
## --------------------------------------------------------------------------------------------------------
loglike_df <- loglike_df %>%
mutate(aic = -loglike + 2 * n_params,
model = 1:5) %>%
select(model, everything())
loglike_df %>% kable(digits = 2,
col.names = c("Model", "# params.", "Log like.", "AIC")) %>%
kable_styling(bootstrap_options = c("condensed", "hover", "striped", "responsive"),
full_width = FALSE, position = "center")
## --------------------------------------------------------------------------------------------------------
beta_list[[4]] %>%
kable(digits = 2) %>%
kable_styling(bootstrap_options = c("condensed", "hover", "striped", "responsive"),
full_width = FALSE, position = "center")
saveRDS(list(beta_list = beta_list,
loglike_df = loglike_df),
"base-results.RDS")
## ----results = 'hide'------------------------------------------------------------------------------------
K <- 10000
my_seed <- 24
set.seed(my_seed)
## MODELS
## models
covariate_list <- vector(mode = "list", length = 5)
covariate_list[[1]] <- NA
covariate_list[[2]] <- "smear"
covariate_list[[3]] <- c("smear",
"hiv_neg_pos", "hiv_unk_pos")
covariate_list[[4]] <- c("smear",
"hiv_neg_pos", "hiv_unk_pos",
"rel_time")
covariate_list[[5]] <- c("smear",
"hiv_neg_pos", "hiv_unk_pos",
"rel_time",
"race_asian_white",
"race_black_white")
## Set up outputs
loglike_df <- data.frame(model = 1:length(covariate_list),
n_params = c(1, 2, 4, 5, 7),
loglike = 0,
aic = 0)
beta_mat1 <- matrix(0, nrow = 1, ncol = 4)
rownames(beta_mat1) <- c("Intercept")
colnames(beta_mat1) <- c("Est.", "lower", "upper", "SE")
beta_list <- vector(mode = "list", length = length(covariate_list))
beta_list[[1]] <- beta_mat1
for(ii in 2:length(covariate_list)){
mat <- matrix(0, nrow = length(covariate_list[[ii]]) + 1,
ncol = 4)
rownames(mat) <- c("Intercept", covariate_list[[ii]])
colnames(mat) <- c("Est.", "lower", "upper", "SE")
beta_list[[ii]] <- mat
}
## ----results = 'hide'------------------------------------------------------------------------------------
t_init <- proc.time()[3]
## Sample MC trees all at once
t0 <- proc.time()[3]
## Sample all MC clusters at once for each of the 5 models
mc_trees <- sample_mc_trees(clusters,
B = K,
multiple_outside_transmissions = TRUE,
covariate_names = covariate_list[[length(covariate_list)]])
print(proc.time()[3] - t0)
## Fit each of the models
for(jj in 1:length(covariate_list)){
covariate_names <- covariate_list[[jj]]
print("Model:")
print(covariate_names)
if(is.na(covariate_names[1])){
init_params <- 0
} else{
init_params <- rep(0, length(covariate_names) + 1)
}
## Optimize
print("Optimizing")
bds <- rep(-5, length(init_params))
if(length(covariate_names) > 5){
bds <- rep(-4, length(init_params))
}
lower_bds <- bds
upper_bds <- -bds
cov_mat <- covariate_df_to_mat(mc_trees,
cov_names = covariate_names)
t1 <- proc.time()[3]
best_params <- optim(par = init_params,
fn = general_loglike,
mc_trees = data.table::as.data.table(mc_trees),
return_neg = TRUE,
cov_mat = cov_mat,
cov_names = covariate_names,
use_outsider_prob = FALSE,
multiple_outside_transmissions = TRUE,
method = "L-BFGS-B",
lower = lower_bds,
upper = upper_bds,
hessian = TRUE
)
t2 <- proc.time()[3] - t1
print(paste("Optimization time:", round( t2 / 60, 3),
"min"))
beta_list[[jj]][,1] <- best_params$par
beta_list[[jj]][, 4] <- sqrt(diag(solve(best_params$hessian))) ## SE from Fisher info
print("best params:")
print(beta_list[[jj]])
loglike_df$loglike[jj] <- -best_params$val
print(paste("Total time:", round( (proc.time()[3] - t_init) / 3600, 3),
"hrs"))
}
## --------------------------------------------------------------------------------------------------------
loglike_df <- loglike_df %>%
mutate(aic = -loglike + 2 * n_params,
model = 1:5) %>%
select(model, everything())
loglike_df %>% kable(digits = 2,
col.names = c("Model", "# params.", "Log like.", "AIC")) %>%
kable_styling(bootstrap_options = c("condensed", "hover", "striped", "responsive"),
full_width = FALSE, position = "center")
## --------------------------------------------------------------------------------------------------------
beta_list[[4]] %>%
kable(digits = 2) %>%
kable_styling(bootstrap_options = c("condensed", "hover", "striped", "responsive"),
full_width = FALSE, position = "center")
saveRDS(list(beta_list = beta_list,
loglike_df = loglike_df),
"mot-results.RDS")
bootstrap-sims.R
Expected run-time: 5 days for the longest loop. Each loop was run in parallel on the NIAID supercomputer, Biowulf.
#!/usr/bin/env Rscript
## SKG
## Aug. 26, 2020
## Bootstrap simulations to estimate standard error for the data
## 5 different models
## Updated June 22, 2021
## Load libraries and data
if("InfectionTrees" %in% rownames(installed.packages())){
remove.packages("InfectionTrees")
}
devtools::install_github("skgallagher/InfectionTrees")
library(InfectionTrees)
library(tidyverse)
data(tb_clean)
for(use_mot in c(FALSE, TRUE)){
print("")
mod <- ifelse(use_mot,
"Multiple Outside Transmissions",
"Base model")
print(mod)
## simulation parameters
K <- 10000 # of MC trees to draw for each cluster
n_models <- 1
n_boot <- 100 # number of bootstrap simulations
my_seed <- 8262020 + ifelse(use_mot, 1000, 0)
## Each inner loop takes ~1 hr
## Format original clusters to useful variables
orig_clusters <- tb_clean %>%
dplyr::mutate(smear = ifelse(spsmear == "Positive",
1, 0),
cluster_id = group,
hiv_f = ifelse(hivstatus == "Negative", "neg",
ifelse(hivstatus == "Positive", "pos",
"unk"))) %>%
dplyr::mutate(hiv_neg_pos = ifelse(hiv_f == "neg", 1, 0),
hiv_unk_pos = ifelse(hiv_f == "unk", 1, 0)) %>%
dplyr::group_by(cluster_id) %>%
dplyr::mutate(rel_time = as.numeric(rel_time / 365)) %>%
dplyr::mutate(cluster_size = dplyr::n()) %>%
dplyr::ungroup() %>%
mutate(race_f = fct_collapse(race,
white = "White",
black = "Black or African American",
asian = "Asian")) %>%
mutate(race_asian_white = ifelse(race_f == "asian", 1, 0),
race_black_white = ifelse(race_f == "black", 1, 0)) %>%
select(cluster_id, smear,
hiv_neg_pos,
hiv_unk_pos,
rel_time,
race_asian_white,
race_black_white,
cluster_size)
## models
covariate_names <- c("smear",
"hiv_neg_pos", "hiv_unk_pos",
"rel_time")
beta_mat <- matrix(0, ncol = length(covariate_names) + 1,
nrow = n_boot)
colnames(beta_mat) <- c("Intercept", covariate_names)
t_init <- proc.time()[3]
for(nn in 1:n_boot){
set.seed(my_seed + nn)
print(sprintf("Bootstrap simulation %d", nn))
## Sample a bootstrap cluster
b_clusters <- bootstrap_clusters(clusters = orig_clusters)
print(sprintf("sampling %d MC trees", K))
## sample MC trees
t0 <- proc.time()[3]
## Sample all MC clusters at once for each of the 5 models
mc_trees <- sample_mc_trees(b_clusters,
B = K,
multiple_outside_transmissions = use_mot,
covariate_names = covariate_names)
print(proc.time()[3] - t0)
print("Model:")
print(covariate_names)
if(is.na(covariate_names[1])){
init_params <- 0
} else{
init_params <- rep(0, length(covariate_names) + 1)
}
## Optimize
print("Optimizing")
bds <- rep(-5, length(init_params))
if(length(covariate_names) > 5){
bds <- rep(-4, length(init_params))
}
lower_bds <- bds
upper_bds <- -bds
cov_mat <- covariate_df_to_mat(mc_trees,
cov_names = covariate_names)
## Now with data.table
t1 <- proc.time()[3]
best_params <- optim(par = init_params,
fn = general_loglike,
mc_trees = data.table::as.data.table(mc_trees),
return_neg = TRUE,
cov_mat = cov_mat,
cov_names = covariate_names,
use_outsider_prob = FALSE,
multiple_outside_transmissions = use_mot,
method = "L-BFGS-B",
lower = lower_bds,
upper = upper_bds
)
t2 <- proc.time()[3] - t1
print(paste("Optimization time:", round( t2 / 60, 3),
"min"))
print("best params:")
print(best_params$par)
beta_mat[nn,] <- best_params$par
print(paste("Total time:", round( (proc.time()[3] - t_init) / 3600, 3),
"hrs"))
}
se_vec <- apply(beta_mat, 2, sd)
print("Bootstrap sample error matrices")
print(se_vec)
output_list <- list(covariate_names = covariate_names,
se_vec = se_vec,
beta_mat = beta_mat,
n_boot = n_boot,
mot = use_mot)
current_time <- Sys.time()
current_time <- gsub(" ", "_", current_time)
current_time <- gsub(":", ".", current_time)
fn_base <- paste0("bootstrap_sims_",
ifelse(use_mot, "mot_", "base_"),
current_time, ".RDS")
saveRDS(output_list, fn_base)
}
data-naive-model.R
Expected run-time: ~seconds
## SKG
## Testing out model where transmission tree is determined by infection order
## Load libraries and data
rep_from_lib <- TRUE
if(rep_from_lib){
library(InfectionTrees)
} else {
devtools::load_all()
}
library(tidyverse)
data(tb_clean)
## library(optimParallel)
#
## Set up clusters
## cl <- makeCluster(5, type = "FORK")
##setDefaultCluster(cl)
my_seed <- 6172020
set.seed(my_seed)
t0 <- proc.time()[3]
## Format the data for all the variables
## I'm filling in the Unknown with a negative smear
## Should later check to see if results are changed with positive
tb_sub <- tb_clean %>%
dplyr::mutate(smear = ifelse(spsmear == "Positive",
1, 0),
cluster_id = group,
hiv_f = ifelse(hivstatus == "Negative", "neg",
ifelse(hivstatus == "Positive", "pos",
"unk"))) %>%
dplyr::mutate(hiv_neg_pos = ifelse(hiv_f == "neg", 1, 0),
hiv_unk_pos = ifelse(hiv_f == "unk", 1, 0)) %>%
dplyr::group_by(cluster_id) %>%
dplyr::mutate(rel_time = as.numeric(rel_time / 365)) %>%
dplyr::mutate(cluster_size = dplyr::n()) %>%
dplyr::ungroup() %>%
mutate(race_f = fct_collapse(race,
white = "White",
black = "Black or African American",
asian = "Asian")) %>%
mutate(race_asian_white = ifelse(race_f == "asian", 1, 0),
race_black_white = ifelse(race_f == "black", 1, 0)) %>%
select(cluster_id, smear,
hiv_neg_pos,
hiv_unk_pos,
rel_time,
race_asian_white,
race_black_white,
cluster_size)
tb_orig <- tb_sub
naive_loglike <- function(inf_params, dt,
covariate_names, cov_mat,
return_neg = TRUE){
dt <- dt[, prob_inf :=
1 / (1 + exp(-(cov_mat %*% inf_params)))]
## Trying out data.table
loglike_df <- dt[,
.(loglike = naive_cluster_loglike(prob_inf, rel_time)),
by = .(cluster_id)]
loglike <- sum(loglike_df$loglike)
if(return_neg) loglike <- -loglike
return(loglike)
}
# cluster contains column datecoll which is the date of collection
naive_cluster_loglike <- function(prob_inf, datecoll){
last_ind <- which.max(datecoll)
probs <- prob_inf[-last_ind]
loglike <- sum(log(probs)) + sum(log(1 - prob_inf))
return(loglike)
}
naive_loglike_wrapper <- function(x, inf_params, dt,
covariate_names,
cov_mat,
beta_index,
max_loglike,
chi_stat = 1.92){
inf_params[beta_index] <- x
naive_loglike(inf_params, dt,
covariate_names, cov_mat,
return_neg = FALSE) + chi_stat - max_loglike
}
######################3
## PERMUTE relative time within clusters
set.seed(7272020)
beta1_vec <- numeric(10)
for(jj in 1:10){
tb_sub <- tb_orig %>%
group_by(cluster_id) %>%
mutate(rel_time = sample(rel_time)) %>%
ungroup()
covariate_names <- c("smear", "hiv_neg_pos", "hiv_unk_pos")
cov_mat <- covariate_df_to_mat(tb_sub,
cov_names = covariate_names)
init_params <- rep(0, length(covariate_names) + 1)
bds <- rep(-5, length(covariate_names) + 1)
t1 <- proc.time()[3]
best_params <- optim(par = init_params,
fn = naive_loglike,
dt = data.table::as.data.table(tb_sub),
return_neg = TRUE,
cov_mat = cov_mat,
covariate_names = covariate_names,
method = "L-BFGS-B",
lower = bds,
upper = -bds
)
t2 <- proc.time()[3] - t1
print(paste("Optimization time:", round( t2 / 60, 3),
"min"))
## Likelihood profile
print("Likelihood profile")
ci_mat <- matrix(0, ncol = 2,
nrow = length(best_params$par))
ci_mat <- matrix(0, ncol = 2,
nrow = length(best_params$par))
for(ii in 1:nrow(ci_mat)){
max_loglike <- -best_params$value
best_pars <- best_params$par
lower <- uniroot(f = naive_loglike_wrapper, c(best_pars[ii] - 3, best_pars[ii]),
max_loglike = max_loglike,
inf_params = best_pars,
beta_index = ii,
dt = data.table::as.data.table(tb_sub),
covariate_names = covariate_names,
cov_mat = cov_mat,
chi_stat = 1.92)
##
upper <- uniroot(f = naive_loglike_wrapper, c(best_pars[ii], best_pars[ii] + 3),
max_loglike = max_loglike,
inf_params = best_pars,
beta_index = ii,
dt = data.table::as.data.table(tb_sub),
covariate_names = covariate_names,
cov_mat = cov_mat,
chi_stat = 1.92)
ci_mat[ii, 1] <- lower$root
ci_mat[ii, 2] <- upper$root
}
est_pars <- cbind(best_pars, ci_mat)
rownames(est_pars) <- c("Intercept",
covariate_names)
colnames(est_pars) <- c("Mean", "Lower95", "Uppper95")
beta1_vec[jj] <- est_pars[2,1]
print(-best_params$value)
print(est_pars, digits = 2)
}
#######################################3
######################3
## ORIGINAL
set.seed(7272020)
beta1_vec <- numeric(10)
for(jj in 1:1){
tb_sub <- tb_orig
covariate_names <- c("smear", "hiv_neg_pos", "hiv_unk_pos", "rel_time")
cov_mat <- covariate_df_to_mat(tb_sub,
cov_names = covariate_names)
init_params <- rep(0, length(covariate_names) + 1)
bds <- rep(-5, length(covariate_names) + 1)
t1 <- proc.time()[3]
best_params <- optim(par = init_params,
fn = naive_loglike,
dt = data.table::as.data.table(tb_sub),
return_neg = TRUE,
cov_mat = cov_mat,
covariate_names = covariate_names,
method = "L-BFGS-B",
lower = bds,
upper = -bds,
hessian = TRUE
)
t2 <- proc.time()[3] - t1
print(paste("Optimization time:", round( t2 / 60, 3),
"min"))
se <- sqrt(diag(solve(best_params$hessian)))
ci_mat <- matrix(0, ncol = 2,
nrow = length(best_params$par))
ci_mat <- matrix(0, ncol = 2,
nrow = length(best_params$par))
ci_mat[,1] <- best_params$par - 1.96 * se
ci_mat[,2] <- best_params$par + 1.96 * se
est_pars <- cbind(best_pars, ci_mat, se)
rownames(est_pars) <- c("Intercept",
covariate_names)
colnames(est_pars) <- c("Mean", "Lower95", "Uppper95", "SE")
beta1_vec[jj] <- est_pars[2,1]
print(-best_params$value)
print(est_pars, digits = 2)
}
saveRDS(est_pars, "naive-tab.RDS")
data-naive-boot.R
Expected run-time: ~15 minutes
## SKG
## Testing out model where transmission tree is determined by infection order
## Updated June 23, 2021 to add a bootstrap SE
## Load libraries and data
rep_from_lib <- TRUE
if(rep_from_lib){
library(InfectionTrees)
} else {
devtools::load_all()
}
library(tidyverse)
data(tb_clean)
my_seed <- 6172021
set.seed(my_seed)
t0 <- proc.time()[3]
## Format the data for all the variables
## I'm filling in the Unknown with a negative smear
## Should later check to see if results are changed with positive
tb_sub <- tb_clean %>%
dplyr::mutate(smear = ifelse(spsmear == "Positive",
1, 0),
cluster_id = group,
hiv_f = ifelse(hivstatus == "Negative", "neg",
ifelse(hivstatus == "Positive", "pos",
"unk"))) %>%
dplyr::mutate(hiv_neg_pos = ifelse(hiv_f == "neg", 1, 0),
hiv_unk_pos = ifelse(hiv_f == "unk", 1, 0)) %>%
dplyr::group_by(cluster_id) %>%
dplyr::mutate(rel_time = as.numeric(rel_time / 365)) %>%
dplyr::mutate(cluster_size = dplyr::n()) %>%
dplyr::ungroup() %>%
mutate(race_f = fct_collapse(race,
white = "White",
black = "Black or African American",
asian = "Asian")) %>%
mutate(race_asian_white = ifelse(race_f == "asian", 1, 0),
race_black_white = ifelse(race_f == "black", 1, 0)) %>%
select(cluster_id, smear,
hiv_neg_pos,
hiv_unk_pos,
rel_time,
race_asian_white,
race_black_white,
cluster_size)
tb_orig <- tb_sub
naive_loglike <- function(inf_params, dt,
covariate_names, cov_mat,
return_neg = TRUE){
dt <- dt[, prob_inf :=
1 / (1 + exp(-(cov_mat %*% inf_params)))]
## Trying out data.table
loglike_df <- dt[,
.(loglike = naive_cluster_loglike(prob_inf, rel_time)),
by = .(cluster_id)]
loglike <- sum(loglike_df$loglike)
if(return_neg) loglike <- -loglike
return(loglike)
}
# cluster contains column datecoll which is the date of collection
naive_cluster_loglike <- function(prob_inf, datecoll){
last_ind <- which.max(datecoll)
probs <- prob_inf[-last_ind]
loglike <- sum(log(probs)) + sum(log(1 - prob_inf))
return(loglike)
}
######################3
## ORIGINAL
set.seed(7272021)
beta1_vec <- numeric(10)
B <- 100
beta_mat <- matrix(0, nrow = B, ncol = 5)
cov_mat <- matrix(0, nrow = B, ncol = 5)
for(jj in 1:B){
## boot strap orig data
tb_sub <- bootstrap_clusters(tb_orig)
covariate_names <- c("smear", "hiv_neg_pos", "hiv_unk_pos", "rel_time")
cov_mat <- covariate_df_to_mat(tb_sub,
cov_names = covariate_names)
init_params <- rep(0, length(covariate_names) + 1)
bds <- rep(-5, length(covariate_names) + 1)
t1 <- proc.time()[3]
best_params <- optim(par = init_params,
fn = naive_loglike,
dt = data.table::as.data.table(tb_sub),
return_neg = TRUE,
cov_mat = cov_mat,
covariate_names = covariate_names,
method = "L-BFGS-B",
lower = bds,
upper = -bds,
hessian = TRUE
)
t2 <- proc.time()[3] - t1
print(paste("Optimization time:", round( t2 / 60, 3),
"min"))
se <- sqrt(diag(solve(best_params$hessian)))
beta_mat[jj,] <- best_params$par
lower <- best_params$par - 1.96 * se
upper <- best_params$par + 1.96 * se
cov_mat[jj,] <- ifelse(best_params$par >= lower &
best_params$par <= upper, 1, 0)
print(-best_params$value)
print(best_params$par, digits = 2)
}
se_boot <- apply(beta_mat, 2, sd)
saveRDS(list(beta_mat = beta_mat,
se_boot = se_boot), "naive-tab-boot.RDS")
analyze-data-big-table.R
Expected run-time: ~seconds, given outputs from above
## SKG
## August 14, 2020
## Combining the tables together from three model types
## Updated June 22, 2021
rep_from_lib <- TRUE
if(rep_from_lib){
library(InfectionTrees)
} else {
devtools::load_all()
}
library(knitr)
library(tidyverse)
library(kableExtra)
## loglike table BASE MODEL
files <- list.files()
fn_rds <- grep("RDS", files, value = TRUE)
fn <- grep("base", fn_rds, value = TRUE)
data_list_base <- readRDS(fn)
loglike_df_base <- data_list_base$loglike_df
loglike_df_base$vars <- c("Null",
"Smear",
"Smear/HIV",
"Smear/HIV/rel. time",
"Smear/HIV/rel. time/race")
loglike_df_base$type <- "Base"
############################################
## OUTSIDER MODEL
files_o <- list.files()
fn_rds_o <- grep("RDS", files_o, value = TRUE)
fn_rds2_o <- grep("mot", fn_rds_o, value = TRUE)
data_list_o <- readRDS(fn_rds2_o)
loglike_df_o <- data_list_o$loglike_df
loglike_df_o$vars <- c("Null",
"Smear",
"Smear/HIV",
"Smear/HIV/rel. time",
"Smear/HIV/rel. time/race")
loglike_df_o$type <- "Multiple outside"
#########################################################################
## TABLE 5 - loglike for models
#####################################################
combined_loglike_df <- dplyr::bind_rows(loglike_df_base,
loglike_df_o) %>%
select(type, everything())
combined_loglike_df %>%
select(type, model, vars, n_params, loglike, aic) %>%
kable(format = "latex", digits = 2,
caption = "Reported log likelihood for the Maryland TB data for the five different models described in Eq. \\eqref{eq:data-models} for both the base and multiple outside transmissions model types.",
label = "data-models",
col.names = c("Type", "Model #", "Covariates", "# parameters", "Log like", "AIC"),
booktabs = TRUE,
linesep = "") %>%
kable_styling(latex_options = "striped", position = "center",
stripe_index = c(1, 3, 5, 7)) %>%
row_spec(c(4), background = "yellow") %>%
row_spec(c(9), background = "yellow") %>%
row_spec(c(5), extra_latex_after = "\\midrule")
#########################################################
###############################################################
## TABLE 6 - best model coefficients
## BEST MODELS
## smear/HIV/rel time
## BASE
best_mod_base2 <- data_list_base$beta_list[[4]][,c(1, 4)]
coeff_names <- c("Intercept", "Smear pos.",
"HIV neg.: HIV pos.",
"HIV unk.: HIV pos.",
"Relative time")
rownames(best_mod_base2) <- NULL
best_mod_base <- data.frame(Type = "Base",
Coeff = coeff_names,
best_mod_base2)
## add boot strap SE
base_boot <- readRDS("biowulf-results/bootstrap_sims_base_2021-06-23.RDS")
best_mod_base$se_boot <- base_boot$se_vec
## OUTSIDER
best_mod_o2 <- data_list_o$beta_list[[4]][,c(1,4)]
coeff_names <- c("Intercept", "Smear pos.",
"HIV neg.: HIV pos.",
"HIV unk.: HIV pos.",
"Relative time")
rownames(best_mod_o2) <- NULL
best_mod_o <- data.frame(Type = "Multiple outside",
Coeff = coeff_names,
best_mod_o2)
## add boot strap SE
mot_boot <- readRDS("biowulf-results/bootstrap_sims_mot_2021-06-24.RDS")
best_mod_o$se_boot <- mot_boot$se_vec
## NAIVE MODEL
best_mod_naive2 <- data.frame(readRDS("naive-tab.RDS"))
coeff_names <- c("Intercept", "Smear pos.",
"HIV neg.: HIV pos.",
"HIV unk.: HIV pos.",
"Relative time")
rownames(best_mod_naive2) <- NULL
best_mod_naive <- data.frame(Type = "Naive",
Coeff = coeff_names,
best_mod_naive2[, c(1, 4)]) %>%
rename("Est." = Mean)
## Se boot
naive_boot <- readRDS("naive-tab-boot.RDS")
best_mod_naive$se_boot <- naive_boot$se_boot
########
## Putting it all together
combined_best_mod <- rbind(best_mod_base,
best_mod_o,
best_mod_naive) %>%
mutate(or = exp(`Est.`),
lower = exp(`Est.`- 1.96 * se_boot),
upper = exp(`Est.` + 1.96 * se_boot)) %>%
mutate(pretty_or = sprintf("%.2f [%.2f, %.2f]", or, lower, upper)) %>%
mutate(combined_se = sprintf("(%.2f, %.2f)", SE, se_boot))
combined_best_mod %>%
select(c("Type", "Coeff", "Est.", "combined_se", "pretty_or")) %>%
kable(format = "latex", booktabs = TRUE,
col.names = c("Type", "Coeff.",
"$\\hat{\\beta}$", "$\\widehat{SE}_F(\\hat{\\beta}), \\widehat{SE}_B(\\hat{\\beta})$",
"Odds Ratio [95\\% CI]"),
caption = paste("Coefficient estimates for the best selected model.",
"We report the mean estimate and two estimates of the standard error of the $\\beta$ values",
"in addition to the odds ratio and 95\\% CI. $SE_F$ represents standard error",
"derived from the Fisher information, and $SE_B$ represents the bootstrap standard error.",
"The 95\\% CI is estimated using the bootstrap SE."),
label = "best-model-ests",
digits = 2,
linesep = "",
escape = FALSE,
align = c("l", "l", "c", "c", "c")) %>%
kable_styling(latex_options = "striped", position = "center",
stripe_index = c(1, 7, 11)) %>%
row_spec(c(3:5, 9,10, 13:15), background = "yellow") %>%
row_spec(c(5, 10), extra_latex_after = "\\midrule")
fig4-most-likely-trees.R
Expected run-time: ~seconds, given outputs from above
## SKG
## June 22, 2021
## JCGS revision to reproduce code for most likely trees
#################################################
## A couple helper functions
#' Assign x-coordinates based on generation and index for a single cluster
#'
#' @param gen generation number
#' @return generation number
assign_coords_x <- function(gen){
return(gen)
}
#' Assign x-coordinates based on generation and index for a single cluster
#'
#' @param index index ONLY for those in a single generation
#' @param ymin default is -1
#' @param ymax default is 1
#' @return index space, evenly spaced by number in generation
assign_coords_y <- function(index,
ymin = -1, ymax = 1){
gen_ranks <- rank(index)
n_in_gen <- length(gen_ranks)
if(n_in_gen == 1){
return(0)
}
y <- -1 + (gen_ranks - 1) /(.5 * (length(gen_ranks)-1))
return(y)
}
###########################################################
rep_from_lib <- TRUE
if(rep_from_lib){
library(InfectionTrees)
} else {
devtools::load_all()
}
library(ggplot2)
library(data.table)
library(dplyr)
library(stringr)
## Subset data to cluster 27, it's cool
tb_ex <- tb_clean %>%
filter(group == 27) %>%
mutate(rel_time = as.numeric(rel_time) / 365) %>%
arrange(rel_time) %>%
mutate(order = 1:n()) %>%
dplyr::mutate(smear = ifelse(spsmear == "Positive",
1, 0),
cluster_id = group,
hiv_f = ifelse(hivstatus == "Negative", "neg",
ifelse(hivstatus == "Positive", "pos",
"unk"))) %>%
dplyr::mutate(hiv_neg_pos = ifelse(hiv_f == "neg", 1, 0),
hiv_unk_pos = ifelse(hiv_f == "unk", 1, 0)) %>%
ungroup()
## Load in best results from base model
fns <- list.files()
fns_rds <- grep(".RDS", fns, value = TRUE)
base_model_results <- readRDS(grep("base", fns_rds, value = TRUE))
inf_params <- base_model_results$beta_list[[4]][,1]
## Sample a bunch of MC trees
B <- 100000
set.seed(622021)
covariate_names <- c("smear",
"hiv_neg_pos", "hiv_unk_pos",
"rel_time")
mc_trees <- sample_mc_trees(tb_ex,
B = B,
multiple_outside_transmissions = FALSE,
covariate_names = c(covariate_names,
"order"))
## get the covariate matrix
cov_mat <- covariate_df_to_mat(mc_trees,
cov_names = covariate_names)
mc_trees$prob_inf <- 1 / (1 + exp(-(cov_mat %*% inf_params)))
cluster_id <- n_inf <- orig_id <- prob_inf <- NULL
my_prob_inf <- 1 / (1 + exp(-(cov_mat %*% inf_params)))
## Turning mc_trees to a data table
mc_trees.dt <- data.table::as.data.table(mc_trees)
mc_trees.dt <- mc_trees.dt[, prob_inf := my_prob_inf]
cluster_id <- NULL
## Getting the likelihood for each sampled cluster
like_df <- mc_trees.dt[,
.(like = general_cluster_like.dt(prob_inf, n_inf)),
by = .(orig_id, cluster_id)]
mc_trees_like <- left_join(mc_trees, like_df,
by = c(orig_id, cluster_id)) %>%
mutate(prob = like / sum(like))
df <- mc_trees_like %>% group_by(cluster_id) %>%
mutate(x = assign_coords_x(gen)) %>%
group_by(cluster_id, gen) %>%
mutate(y = assign_coords_y(n_in_gen)) %>%
ungroup() %>%
group_by(cluster_id) %>%
mutate(gen_size = max(gen)) %>%
arrange(desc(prob))
top_groups_by_gen <- df %>%
ungroup() %>%
group_by(gen_size, cluster_id) %>%
summarize(prob = min(prob),
.groups = "drop_last") %>%
slice_max(order_by = prob, n = 3, with_ties = FALSE)
top_clusts <- top_groups_by_gen$cluster_id
df_sub <- df %>% filter(cluster_id %in% top_clusts)
df_sub <- df_sub %>%
mutate(facet = paste(gen_size, cluster_id, sep = "-"))
inf_df <- df_sub %>%
select(id, cluster_id, x, y) %>%
rename(x_to = x, y_to = y,
inf_id = id)
jsub <- left_join(df_sub, inf_df, by = c("cluster_id", "inf_id"))
fctr_sub <- df_sub %>% group_by(cluster_id) %>%
summarize(like = prob[1])
jsub$factor_id <- factor(jsub$cluster_id,
labels = paste0("Sample ",
fctr_sub$cluster_id,
": P(C) = ",
formatC(fctr_sub$like,
format = "e",
digits = 2)))
jsub$factor_gen <- factor(jsub$gen_size,
levels = 1:max(jsub$gen_size),
labels = paste0("Gen. ", 1:max(jsub$gen_size)))
fctr_sub2 <- df_sub %>% group_by(facet, gen_size, cluster_id) %>%
summarize(like = prob[1]) %>%
mutate(id = stringr::str_sub(cluster_id, start = -3))
jsub$facet_lab <- factor(jsub$facet,
levels = unique(fctr_sub2$facet),
labels = paste0("# Gens: ",
fctr_sub2$gen_size,
"; ID: ",
fctr_sub2$id,
"; P(C) = ",
formatC(fctr_sub2$like,
format = "e",
digits = 2)))
my_orig_id <- 27
ggplot(data = jsub,
aes(x = x, y = y,
group = cluster_id,
col = factor(order),
shape = factor(hiv_neg_pos)
)) +
geom_curve(aes(xend = x_to, yend = y_to),
size = 2, curvature = -0,
col = "black") +
geom_point(size = 6, stroke = 4, col = "darkgray") +
geom_point(size = 5, stroke = 4) +
facet_wrap(~facet_lab, ncol = 3) +
scale_color_brewer(palette = "Set1", name = "Detection Order") +
scale_shape_manual(values = c(3, 16),
labels = c("Pos./Unk.", "Neg."),
name = "HIV Status") +
xlim(.8, 7.2) +
ylim(-1.2, 1.2) +
theme_bw(base_size = 18) +
theme(
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
labs(x = "Generation in transmission tree",
y = latex2exp::TeX("Order of infection in gen. among 'siblings' $\\rightarrow$"),
title = "Most likely sampled trees by number of generations",
subtitle = paste0("Cluster ", my_orig_id)) +
theme(legend.position = "bottom", legend.box = "horizontal") +
guides(color = guide_legend(nrow = 2, byrow = TRUE))
ggsave("trees-7.pdf", height = 15, width = 13)
enough-MC-trees/sampling-enough-mc-trees.R
Expected run-time: ~2 days
## ----setup, include=FALSE--------------------------------------------------------------------------------------
rep_from_lib <- TRUE
if(rep_from_lib){
library(InfectionTrees)
} else {
devtools::load_all()
}
library(InfectionTrees)
library(tidyr)
library(dplyr)
library(kableExtra)
library(ggplot2)
theme_set(theme_bw() + theme(axis.title = element_text()))
## --------------------------------------------------------------------------------------------------------------
set.seed(2020)
## Generate possible covariates
sample_covariates_df <- data.frame(x = c(1,0))
beta0 <- -2.5
beta1 <- .5
M <- 1000
data <- simulate_bp(K = M,
inf_params = c(beta0, beta1),
sample_covariates_df = sample_covariates_df,
covariate_names = "x",
max_size = 50)
data %>% group_by(cluster_id) %>%
summarize(cluster_size = n()) %>%
ungroup() %>%
group_by(cluster_size) %>%
summarize(freq = n()) %>%
ungroup() %>%
kable(type = "html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F,
position = "center")
## --------------------------------------------------------------------------------------------------------------
## Summarize data set into a table
binary_cluster_summaries <- summarize_binary_clusters(data)
B <- 5 ## Should change to 30+
K <- 5000
beta_mat <- matrix(0, nrow = B, ncol = 2)
var_mat <- matrix(0, nrow = B, ncol = 2)
seed <- 9152020
for(bb in 1:B){
## Get different seeds for MC sets
seed <- seed + bb * 10
set.seed(seed)
## Sample MC trees
mc_trees <- sample_mc_binary_cov(K,
observed_cluster_summaries = binary_cluster_summaries) %>%
dplyr::select(-freq)
## Fit model
inf_params_0 <- c(0,0)
best_params <- optim(inf_params_0, fn = bp_loglike_binary_cov,
obs_data_summary = binary_cluster_summaries,
mc_samples_summary = mc_trees,
return_neg = TRUE,
lower = c(-5, -5),
upper = c(0, 5),
method = "L-BFGS-B",
hessian = TRUE)
beta_mat[bb,] <- best_params$par
var_mat[bb,] <- diag(solve(best_params$hessian)) ## Var from obs. fisher info
}
## --------------------------------------------------------------------------------------------------------------
wald_scores_fisher <- beta_mat / sqrt(var_mat)
wald_se_fisher <- apply(wald_scores_fisher, 2, sd)
se_mc <- apply(beta_mat, 2, sd)
wald_scores_mc <- t(t(beta_mat) / se_mc)
wald_se_mc <- apply(wald_scores_mc, 2, sd)
wald_se_mat <- rbind(wald_se_fisher, wald_se_mc)
rownames(wald_se_mat) <- c("Fisher", "MC")
kable(wald_se_mat, digits = 2) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))