## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  message = FALSE,
  warning = FALSE,
  fig.align = "center",
  crop = NULL
)

## ----eval = FALSE-------------------------------------------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("CrcBiomeScreen")

## ----eval = FALSE-------------------------------------------------------------
# install.packages(c("remotes"))
# remotes::install_github("iChronostasis/CrcBiomeScreen", force = TRUE)

## ----eval = TRUE--------------------------------------------------------------
library(CrcBiomeScreen)

## ----eval = TRUE--------------------------------------------------------------
# Set working directory
# setwd("/home/CRCscreening/CRCscreening-Workflow/")
# List available datasets in the package
# data(package = "CrcBiomeScreen")

# Load the toy dataset from curatedMetagenomicData
data(Thomas_2018_RelativeAbundance, package = "CrcBiomeScreen")

# Create the CrcBiomeScreenObject
## Direct creation from TreeSummarizedExperiment

# library(curatedMetagenomicData)
# tse <- curatedMetagenomicData("ThomasAM_2018a.relative_abundance",
#                               dryrun = FALSE, rownames = "short")[[1]]

tse <- Thomas_2018_RelativeAbundance$`2021-03-31.ThomasAM_2018a.relative_abundance`
CrcBiomeScreenObject <- CreateCrcBiomeScreenObjectFromTSE(tse)

# Load the simple toy dataset from curatedMetagenomicData
data(NHSBCSP_screeningData, package = "CrcBiomeScreen")

# Create the CrcBiomeScreenObject for simple toy data
CrcBiomeScreenObject_NHSBCSP_screeningData <- CreateCrcBiomeScreenObject(
  AbsoluteAbundance = NHSBCSP_screeningData[,-c(1:2,646:647)],
  TaxaData = colnames(NHSBCSP_screeningData[,-c(1:2,646:647)]),
  SampleData = NHSBCSP_screeningData[,c(1:2,646:647)]
)

## Data exploration
head(getTaxaData(CrcBiomeScreenObject))
dim(getRelativeAbundance(CrcBiomeScreenObject))
summary(getSampleData(CrcBiomeScreenObject)$age)

## ----eval = TRUE--------------------------------------------------------------
# Split the taxa into multiple levels[If not done already]
# CrcBiomeScreenObject <- SplitTaxas(CrcBiomeScreenObject)

head(getTaxaData(CrcBiomeScreenObject))

taxa <- getTaxaData(CrcBiomeScreenObject)
colnames(taxa) <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species")

setTaxaData(CrcBiomeScreenObject) <- taxa

colnames(getTaxaData(CrcBiomeScreenObject))

# Keep only the genus level data
CrcBiomeScreenObject <- KeepTaxonomicLevel(CrcBiomeScreenObject,level = "Genus")

# Inspect processed data
head(getTaxaData(CrcBiomeScreenObject))
dim(getTaxaData(CrcBiomeScreenObject))

## ----eval = TRUE--------------------------------------------------------------
# Normalize using GMPR (Geometric Mean of Pairwise Ratios)
CrcBiomeScreenObject <- NormalizeData(CrcBiomeScreenObject, method = "GMPR", level = "Genus")

# Normalize using TSS (Total Sum Scaling)
CrcBiomeScreenObject <- NormalizeData(CrcBiomeScreenObject, method = "TSS", level = "Genus")

## ----eval = TRUE--------------------------------------------------------------
set.seed(123)
# -------------------------
# Toy taxonomy
# -------------------------
toy_taxa_strings <- c(
  "D_0__Bacteria|D_1__Firmicutes|D_2__Clostridia|D_3__OrderA|D_4__FamilyA|D_5__GenusA",
  "D_0__Bacteria|D_1__Firmicutes|D_2__Clostridia|D_3__OrderB|D_4__FamilyB|D_5__GenusB",
  "D_0__Bacteria|D_1__Firmicutes|D_2__Clostridia|D_3__OrderC|D_4__FamilyC|D_5__GenusC",
  "D_0__Bacteria|D_1__Bacteroidetes|D_2__Bacteroidia|D_3__OrderD|D_4__FamilyD|D_5__GenusD",
  "D_0__Bacteria|D_1__Bacteroidetes|D_2__Bacteroidia|D_3__OrderE|D_4__FamilyE|D_5__GenusE",
  "D_0__Bacteria|D_1__Proteobacteria|D_2__Gammaproteobacteria|D_3__OrderF|D_4__FamilyF|D_5__GenusF"
)

toy_taxa <- data.frame(
  Taxa = toy_taxa_strings,
  stringsAsFactors = FALSE
)

# -------------------------
# Toy training abundance
# 6 taxa x 12 samples
# -------------------------
train_samples <- paste0("S", 1:12)

toy_abs <- matrix(
  c(
    rpois(6 * 6, lambda = 54.8887777),   # controls
    rpois(6 * 6, lambda = 55)    # CRC
  ),
  nrow = 6,
  ncol = 12
)

rownames(toy_abs) <- toy_taxa_strings
colnames(toy_abs) <- train_samples
toy_abs <- as.data.frame(toy_abs)

toy_sample <- data.frame(
  number_reads = rep(10000, 12),
  study_condition = c(rep("control", 6), rep("CRC", 6)),
  row.names = train_samples,
  stringsAsFactors = FALSE
)

# -------------------------
# 1. Create toy training data
# -------------------------

toy_obj <- CreateCrcBiomeScreenObject(
  AbsoluteAbundance = toy_abs,
  TaxaData = toy_taxa,
  SampleData = toy_sample
)

# -------------------------
# 2. Keep one taxonomic level and normalize
# -------------------------

toy_obj <- SplitTaxas(toy_obj)
toy_obj <- KeepTaxonomicLevel(toy_obj, level = "Genus")
toy_obj <- NormalizeData(toy_obj, method = "TSS", level = "Genus")

# -------------------------
# 3. Split dataset
# -------------------------
toy_obj <- SplitDataSet(
  toy_obj,
  label = c("control", "CRC"),
  partition = 0.7
)

# Perform quality control using cmdscale
toy_obj <- qcByCmdscale(
  toy_obj,
  TaskName = "toy_obj",
  normalize_method = "TSS",
  plot = FALSE
)

# Check class balance in the training data
checkClassBalance(getModelData(toy_obj)$TrainLabel)

## ----eval = TRUE--------------------------------------------------------------
# -------------------------------
# 4. Train and Evaluate RF model
# -------------------------------
toy_obj <- TrainModels(
  toy_obj,
  model_type = "RF",
  TaskName = "toy_rf",
  ClassWeights = FALSE,
  TrueLabel = "CRC",
  num_cores = 1,
  n_cv = 2
)

toy_obj <- EvaluateModel(
  toy_obj,
  model_type = "RF",
  TaskName = "ToyData_RF_Test",
  TrueLabel = "CRC",
  PlotAUC = FALSE
)

## ----eval = TRUE--------------------------------------------------------------
# -------------------------
# 5. Create toy validation data
# -------------------------
val_taxa_strings <- c(
  "D_0__Bacteria|D_1__Firmicutes|D_2__Clostridia|D_3__OrderA|D_4__FamilyA|D_5__GenusA",
  "D_0__Bacteria|D_1__Firmicutes|D_2__Clostridia|D_3__OrderB|D_4__FamilyB|D_5__GenusB",
  "D_0__Bacteria|D_1__Firmicutes|D_2__Clostridia|D_3__OrderC|D_4__FamilyC|D_5__GenusC",
  "D_0__Bacteria|D_1__Bacteroidetes|D_2__Bacteroidia|D_3__OrderD|D_4__FamilyD|D_5__GenusD",
  "D_0__Bacteria|D_1__Bacteroidetes|D_2__Bacteroidia|D_3__OrderE|D_4__FamilyE|D_5__GenusE",
  "D_0__Bacteria|D_1__Proteobacteria|D_2__Gammaproteobacteria|D_3__OrderF|D_4__FamilyF|D_5__GenusF"
)

val_taxa <- data.frame(
  Taxa = val_taxa_strings,
  stringsAsFactors = FALSE
)

val_samples <- paste0("V", 1:8)

val_abund <- matrix(
  c(
    rpois(6 * 4, lambda = 38),
    rpois(6 * 4, lambda = 48)
  ),
  nrow = 6,
  ncol = 8
)

rownames(val_abund) <- val_taxa_strings
colnames(val_abund) <- val_samples

val_abund <- as.data.frame(val_abund)

val_sample <- data.frame(
  number_reads = rep(10000, 8),
  study_condition = c(rep("control", 4), rep("CRC", 4)),
  condition = c(rep("control", 4), rep("CRC", 4)),
  row.names = val_samples,
  stringsAsFactors = FALSE
)

val_obj <- CreateCrcBiomeScreenObject(
  AbsoluteAbundance = val_abund,
  TaxaData = val_taxa,
  SampleData = val_sample
)

val_obj <- SplitTaxas(val_obj)
val_obj <- KeepTaxonomicLevel(val_obj, level = "Genus")
val_obj <- NormalizeData(val_obj, method = "TSS", level = "Genus")

# -------------------------
# 6. Align features
# -------------------------
train_norm <- getNormalizedData(toy_obj)
val_norm <- getNormalizedData(val_obj)

common_features <- intersect(colnames(train_norm), colnames(val_norm))

setNormalizedData(toy_obj) <- train_norm[, common_features, drop = FALSE]
setNormalizedData(val_obj) <- val_norm[, common_features, drop = FALSE]

# -------------------------
# 7. Validate model
# -------------------------
validated_obj <- ValidateModelOnData(
  toy_obj,
  ValidationData = val_obj,
  model_type = "RF",
  TaskName = "toy_validation",
  TrueLabel = "CRC",
  PlotAUC = FALSE
)

## ----eval = TRUE--------------------------------------------------------------
# ----------------------------------
# 8. Minimal end-to-end RunScreening
# ----------------------------------
toy_obj2 <- CreateCrcBiomeScreenObject(
  AbsoluteAbundance = toy_abs,
  TaxaData = toy_taxa,
  SampleData = toy_sample
)

toy_obj2 <- SplitTaxas(toy_obj2)
toy_obj2 <- KeepTaxonomicLevel(toy_obj2, level = "Genus")
toy_obj2 <- NormalizeData(toy_obj2, method = "TSS", level = "Genus")

toy_obj2 <- RunScreening(
  obj = toy_obj2,
  model_type = "RF",
  partition = 0.7,
  split.requirement = list(
    label = c("control", "CRC"),
    condition_col = "study_condition"
  ),
  ClassWeights = FALSE,
  n_cv = 2,
  num_cores = 1,
  TaskName = "RF_TSS_toydata",
  ValidationData = val_obj,
  TrueLabel = "CRC"
)

## ----eval = TRUE--------------------------------------------------------------
# -------------------------
# 9. Prediction on new data
# -------------------------
toy_obj <- PredictCrcBiomeScreen(
  toy_obj,
  newdata = getNormalizedData(val_obj),
  model_type = "RF"
)

# Inspect the predictions
head(getPredictResult(toy_obj)$RF)
#>      control       CRC
#> V1 0.5122222 0.4877778
#> V2 0.6066667 0.3933333
#> V3 0.4205556 0.5794444
#> V4 0.5255556 0.4744444
#> V5 0.4772222 0.5227778
#> V6 0.4677778 0.5322222
predictions <- getPredictResult(toy_obj)$RF

# -------------------------
# 10. Evaluate predictions
# -------------------------

evaluation_results <- EvaluateCrcBiomeScreen(
  predictions = predictions,
  true_labels = getSampleData(val_obj)$study_condition,
  TrueLabel = "CRC",
  TaskName = "Toy_Prediction_Eval"
)


## ----eval = TRUE--------------------------------------------------------------
# Check the available labels in the data
table(getSampleData(CrcBiomeScreenObject)$study_condition)

# Filter data to keep only CRC and control samples
CrcBiomeScreenObject <- FilterDataSet(
  CrcBiomeScreenObject,
  label = c("CRC", "control"),
  condition_col = "study_condition"
)

# Verify the filtering results
table(getSampleData(CrcBiomeScreenObject)$study_condition)

# Split data into training (70%) and test (30%) sets
CrcBiomeScreenObject <- SplitDataSet(
  CrcBiomeScreenObject, 
  label = c("control", "CRC"), 
  partition = 0.7
)

## ----eval = TRUE--------------------------------------------------------------
# Perform quality control using cmdscale
CrcBiomeScreenObject <- qcByCmdscale(
  CrcBiomeScreenObject,
  TaskName = "Normalize_ToyData_filtered_qc",
  normalize_method = "GMPR",
  plot = FALSE
)

# Check class balance in the training data
checkClassBalance(getModelData(CrcBiomeScreenObject)$TrainLabel)

## ----eval = FALSE-------------------------------------------------------------
# # Assuming 'CrcBiomeScreenObject' contains a trained XGBoost model
# # and 'MyNewData' is your new dataset (e.g., another CrcBiomeScreenObject or a data frame).
# 
# # Generate predictions for the new dataset
# ValidationData_filtered_qc <- PredictCrcBiomeScreen(
#     CrcBiomeScreenObject,
#     newdata = getNormalizedData(ValidationData_filtered_qc), # Use the appropriate data slot from your new data
#     model_type = "RF"
# )
# # Inspect the predictions
# head(getPredictResult(ValidationData_filtered_qc)$RF)
# predictions <- getPredictResult(ValidationData_filtered_qc)$RF

## ----eval = FALSE-------------------------------------------------------------
# # Assuming 'MyNewData' has a 'study_condition' column in its SampleData
# # and the true positive label is "CRC".
# 
# # Evaluate the predictions
# evaluation_results <- EvaluateCrcBiomeScreen(
#     predictions = predictions,
#     true_labels = getSampleData(ValidationData_filtered_qc)$study_condition, # Provide the actual true labels
#     TrueLabel = "CRC", # Specify the positive class label
#     TaskName = "MyNewDatasetEvaluation" # A name for the output plot/files
# )
# 
# # View the evaluation results
# print(evaluation_results$AUC)

## ----full_pipeline, eval=FALSE------------------------------------------------
# # Load the toy dataset from curatedMetagenomicData
# data(Thomas_2018_RelativeAbundance, package = "CrcBiomeScreen")
# 
# # Create the CrcBiomeScreenObject
# tse <- Thomas_2018_RelativeAbundance$`2021-03-31.ThomasAM_2018a.relative_abundance`
# CrcBiomeScreenObject <- CreateCrcBiomeScreenObjectFromTSE(tse)
# 
# taxa <- getTaxaData(CrcBiomeScreenObject)
# colnames(taxa) <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species")
# 
# setTaxaData(CrcBiomeScreenObject) <- taxa
# 
# # Keep only the genus level data
# CrcBiomeScreenObject <- KeepTaxonomicLevel(CrcBiomeScreenObject,level = "Genus")
# 
# # Inspect processed data
# head(getTaxaData(CrcBiomeScreenObject))
# 
# # Normalize using GMPR (Geometric Mean of Pairwise Ratios)
# CrcBiomeScreenObject <- NormalizeData(CrcBiomeScreenObject, method = "GMPR", level = "Genus")
# 
# # Filter data to keep only CRC and control samples
# CrcBiomeScreenObject <- FilterDataSet(
#   CrcBiomeScreenObject,
#   label = c("CRC", "control"),
#   condition_col = "study_condition"
# )
# 
# # Perform quality control using cmdscale
# CrcBiomeScreenObject <- qcByCmdscale(
#   CrcBiomeScreenObject,
#   TaskName = "Normalize_ToyData_filtered_qc",
#   normalize_method = "GMPR",
#   plot = FALSE
# )
# 
# # Verify the filtering results
# table(getSampleData(CrcBiomeScreenObject)$study_condition)
# 
# # Split data into training (70%) and test (30%) sets
# CrcBiomeScreenObject <- SplitDataSet(
#   CrcBiomeScreenObject,
#   label = c("control", "CRC"),
#   partition = 0.7
# )
# 
# # Check class balance in the training data
# checkClassBalance(getModelData(CrcBiomeScreenObject)$TrainLabel)
# 
# # Load validation data
# data("ZellerG_2014_RelativeAbundance")
# 
# ValidationData_tse <- ZellerG_2014_RelativeAbundance$`2021-03-31.ZellerG_2014.relative_abundance`
# # Create CrcBiomeScreenObject for validation data
# ValidationData <- CreateCrcBiomeScreenObjectFromTSE(ValidationData_tse)
# 
# # Process validation data similarly to training data
# # ValidationData <- SplitTaxas(ValidationData)
# taxa <- getTaxaData(ValidationData)
# colnames(taxa) <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species")
# 
# setTaxaData(ValidationData) <- taxa
# 
# ValidationData <- KeepTaxonomicLevel(ValidationData,level = "Genus")
# ValidationData <- NormalizeData(ValidationData, method = "GMPR", level = "Genus")
# 
# # Align features between training and validation data
# val_norm <- getNormalizedData(ValidationData)
# crc_norm <- getNormalizedData(CrcBiomeScreenObject)
# # Ensure consistent features between training and validation data
# val_norm <- val_norm[, colnames(val_norm) %in% colnames(crc_norm)]
# crc_norm <- crc_norm[, colnames(crc_norm) %in% colnames(val_norm)]
# # Set back the normalized data
# setNormalizedData(ValidationData) <- val_norm
# setNormalizedData(CrcBiomeScreenObject) <- crc_norm
# 
# # Filter validation data
# ValidationData_filtered <- FilterDataSet(
#   ValidationData,
#   label = c("CRC", "control"),
#   condition_col = "study_condition"
# )
# 
# # Quality control for validation data
# ValidationData_filtered_qc <- qcByCmdscale(
#   ValidationData_filtered,
#   TaskName = "Normalize_ValidationData_filtered_qc",
#   normalize_method = "GMPR",
#   plot = FALSE
# )
# 
# # Run the complete screening workflow
# ValidationData_filtered_qc <- RunScreening(
#   # Input data
#   obj = CrcBiomeScreenObject,
#   # Model and data splitting
#   model_type = "RF",
#   partition = 0.7,
#   split.requirement = list(
#     label = c("control", "CRC"),
#     condition_col = "study_condition"
#   ),
#   ClassWeights = FALSE,
# 
#   # Cross-validation and parallelization
#   n_cv = 10,
#   num_cores = 10,
# 
#   # Task and output naming
#   TaskName = "RF_GMPR_toydata",
# 
#   # External validation
#   ValidationData = ValidationData_filtered_qc,
#   TrueLabel = "CRC"
# )
# 
# # Example with XGBoost model
# ValidationData_filtered_qc <- RunScreening(
#   CrcBiomeScreenObject,
#   model = "RF",
#   partition = 0.7,
#   split.requirement = list(
#     label = c("control", "CRC"),
#     condition_col = "study_condition"
#   ),
#   ClassWeights = TRUE,
#   n_cv = 10,
#   num_cores = 10,
#   TaskName = "RF_GMPR_toydata",
#   ValidationData = ValidationData_filtered_qc,
#   TrueLabel = "CRC"
# )

## ----sessioninfo--------------------------------------------------------------
sessionInfo()