## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 8,
  fig.height = 6,
  warning = FALSE,
  message = FALSE
)

## ----install, eval=FALSE------------------------------------------------------
# # From Bioconductor (when available)
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install("ClonalSim")
# 
# # From GitHub (development version)
# BiocManager::install("gbucci/ClonalSim")

## ----library------------------------------------------------------------------
library(ClonalSim)

## ----basic_sim----------------------------------------------------------------
# Set seed for reproducibility
set.seed(123)
sim <- simulateTumor(
  subclone_freqs = c(0.15, 0.25, 0.30, 0.30),
  n_mut_per_clone = c(20, 25, 30, 15),
  n_mut_founder = 10
)

# Display summary
sim

## ----access_results-----------------------------------------------------------
# Get mutation data
mutations <- getMutations(sim)
head(mutations)

# Get simulation parameters
params <- getSimParams(sim)
params$subclone_freqs

# Get true vs observed VAF
true_vaf <- getTrueVAF(sim)
obs_vaf <- getObservedVAF(sim)

# Compare
summary(true_vaf)
summary(obs_vaf)

## ----plot_vaf_density, fig.cap="VAF density plot showing mixed tumor sample"----
# ggplot2 is already loaded above
plot(sim, type = "vaf_density")

## ----plot_vaf_scatter, fig.cap="VAF scatter plot colored by mutation type"----
plot(sim, type = "vaf_scatter")

## ----plot_depth, fig.cap="Sequencing depth distribution"----------------------
plot(sim, type = "depth_histogram")

## ----plot_matrix, fig.cap="Clonal matrix showing mutation presence"-----------
plot(sim, type = "clone_matrix")

## ----bio_noise_demo, fig.cap="Effect of biological noise concentration parameter"----
# High heterogeneity (low concentration)
set.seed(123)
sim_high_het <- simulateTumor(
  subclone_freqs = c(0.3, 0.4, 0.3),
  n_mut_per_clone = c(30, 40, 30),
  biological_noise = list(enabled = TRUE, concentration = 20)
)

# Low heterogeneity (high concentration)
set.seed(123)
sim_low_het <- simulateTumor(
  subclone_freqs = c(0.3, 0.4, 0.3),
  n_mut_per_clone = c(30, 40, 30),
  biological_noise = list(enabled = TRUE, concentration = 100)
)

# Compare VAF spread
par(mfrow = c(1, 2))
hist(getTrueVAF(sim_high_het), main = "High Heterogeneity\n(concentration = 20)",
     xlab = "True VAF", breaks = 30, col = "skyblue")
hist(getTrueVAF(sim_low_het), main = "Low Heterogeneity\n(concentration = 100)",
     xlab = "True VAF", breaks = 30, col = "lightcoral")

## ----depth_demo, fig.cap="Depth distributions: negative binomial vs Poisson"----
# Realistic (negative binomial)
set.seed(123)
sim_nb <- simulateTumor(
  sequencing_noise = list(
    mean_depth = 100,
    depth_variation = "negative_binomial",
    depth_dispersion = 20
  )
)

# Unrealistic (Poisson)
set.seed(124)
sim_poisson <- simulateTumor(
  sequencing_noise = list(
    mean_depth = 100,
    depth_variation = "poisson"
  )
)

par(mfrow = c(1, 2))
hist(getMutations(sim_nb)$Depth, main = "Negative Binomial\n(realistic)",
     xlab = "Depth", breaks = 30, col = "skyblue")
hist(getMutations(sim_poisson)$Depth, main = "Poisson\n(unrealistic)",
     xlab = "Depth", breaks = 30, col = "lightcoral")

## ----binomial_demo, fig.cap="Stochastic variation from binomial sampling"-----
# With binomial sampling (realistic)
set.seed(123)
sim_binom <- simulateTumor(
  sequencing_noise = list(binomial_sampling = TRUE)
)

# Without binomial sampling (deterministic)
set.seed(123)
sim_det <- simulateTumor(
  sequencing_noise = list(binomial_sampling = FALSE)
)

# Compare True_VAF vs observed VAF
par(mfrow = c(1, 2))
with(getMutations(sim_binom), {
  plot(True_VAF, VAF, main = "With Binomial Sampling",
       xlab = "True VAF", ylab = "Observed VAF", pch = 16, col = rgb(0,0,1,0.3))
  abline(0, 1, col = "red", lty = 2)
})
with(getMutations(sim_det), {
  plot(True_VAF, VAF, main = "Without Binomial Sampling",
       xlab = "True VAF", ylab = "Observed VAF", pch = 16, col = rgb(0,0,1,0.3))
  abline(0, 1, col = "red", lty = 2)
})

## ----low_purity---------------------------------------------------------------
set.seed(123)
sim_low_purity <- simulateTumor(
  subclone_freqs = c(0.05, 0.10, 0.15),  # Sum = 0.30 (30% tumor)
  n_mut_per_clone = c(20, 25, 30)
)

cat("Tumor purity:", sum(getSimParams(sim_low_purity)$subclone_freqs), "\n")
plot(sim_low_purity, type = "vaf_density")

## ----high_coverage------------------------------------------------------------
set.seed(123)
sim_deep <- simulateTumor(
  sequencing_noise = list(
    enabled = TRUE,
    mean_depth = 500,
    depth_dispersion = 100,  # More uniform
    error_rate = 0.0005
  )
)

summary(getMutations(sim_deep)$Depth)
plot(sim_deep, type = "depth_histogram")

## ----low_coverage-------------------------------------------------------------
set.seed(123)
sim_exome <- simulateTumor(
  sequencing_noise = list(
    mean_depth = 50,
    depth_dispersion = 10,  # More variable
    error_rate = 0.002
  )
)

summary(getMutations(sim_exome)$Depth)

## ----no_noise-----------------------------------------------------------------
set.seed(123)
sim_ideal <- simulateTumor(
  biological_noise = list(enabled = FALSE),
  sequencing_noise = list(enabled = FALSE)
)

# VAF exactly matches expected frequencies
head(getMutations(sim_ideal)[, c("Clone", "True_VAF", "VAF")])

## ----germline_variants, fig.cap="VAF distribution showing germline peak at 0.5"----
# 70% purity tumor with germline variants
set.seed(789)
sim_germline <- simulateTumor(
  subclone_freqs = c(0.3, 0.4),  # 70% tumor purity
  n_mut_per_clone = c(40, 50),
  germline_variants = list(
    enabled = TRUE,
    n_variants = 150,  # Number of germline SNPs
    vaf_expected = 0.5  # Heterozygous diploid
  )
)

# Check mutation types
table(getMutations(sim_germline)$Type)

# Compare VAFs
germline_vafs <- getMutations(sim_germline)[getMutations(sim_germline)$Type == "germline", "VAF"]
somatic_vafs <- getMutations(sim_germline)[getMutations(sim_germline)$Type != "germline", "VAF"]

cat("Germline VAF (expected ~0.5):", round(mean(germline_vafs), 3), "\n")
cat("Somatic VAF mean:", round(mean(somatic_vafs), 3), "\n")

# Plot showing germline peak
plot(sim_germline, type = "vaf_density")

## ----granges------------------------------------------------------------------
library(GenomicRanges)

gr <- toGRanges(sim)
gr

# Access metadata
mcols(gr)[1:5, ]

# Subset by chromosome
gr_chr1 <- gr[seqnames(gr) == "chr1"]
length(gr_chr1)

## ----vcf----------------------------------------------------------------------
# Get VRanges object
vr <- suppressWarnings(toVCF(sim, sample_name = "TumorSample"))
vr

# Write to VCF file (using tempdir to avoid polluting user's filesystem)
vcf_file <- tempfile(fileext = ".vcf")
suppressWarnings(toVCF(sim, output_file = vcf_file))
file.exists(vcf_file)

## ----pyclone------------------------------------------------------------------
pyclone_file <- tempfile(fileext = ".tsv")
toPyClone(sim, file = pyclone_file, sample_id = "sample1")
head(read.table(pyclone_file, header = TRUE, sep = "\t"))

## ----sciclone-----------------------------------------------------------------
sciclone_file <- tempfile(fileext = ".tsv")
toSciClone(sim, file = sciclone_file)
head(read.table(sciclone_file, header = TRUE, sep = "\t"))

## ----csv----------------------------------------------------------------------
df <- toDataFrame(sim)
csv_file <- tempfile(fileext = ".csv")
write.csv(df, csv_file, row.names = FALSE)
head(read.csv(csv_file))

## ----benchmark_workflow, eval=FALSE-------------------------------------------
# # 1. Generate ground truth
# set.seed(42)
# sim <- simulateTumor(
#   subclone_freqs = c(0.2, 0.3, 0.5),
#   n_mut_per_clone = c(50, 75, 50)
# )
# 
# # 2. Export to VCF (ground truth)
# toVCF(sim, output_file = "ground_truth.vcf")
# 
# # 3. Get true positive set
# true_mutations <- getMutations(sim)
# 
# # 4. Run your variant caller on simulated data
# # (your variant caller code here)
# 
# # 5. Compare results
# # - Sensitivity: TP / (TP + FN)
# # - Precision: TP / (TP + FP)
# # - F1 score: 2 * (Precision * Sensitivity) / (Precision + Sensitivity)
# 
# # 6. Evaluate VAF estimation accuracy
# # cor(true_mutations$VAF, called_vaf)

## ----complex_hierarchy--------------------------------------------------------
set.seed(123)
sim_complex <- simulateTumor(
  subclone_freqs = c(0.1, 0.15, 0.2, 0.25, 0.3),
  n_mut_per_clone = c(30, 40, 50, 40, 30),
  n_mut_shared = list(
    "2 3 4 5" = 20,  # Shared by clones 2,3,4,5
    "3 4 5" = 15,    # Shared by clones 3,4,5
    "4 5" = 10,      # Shared by clones 4,5
    "1 2" = 8        # Shared by clones 1,2
  )
)

plot(sim_complex, type = "vaf_density")

## ----clonal_structure---------------------------------------------------------
structure <- getClonalStructure(sim_complex)
print(structure)

## ----sessioninfo--------------------------------------------------------------
sessionInfo()

