## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  warning = FALSE,
  message = FALSE
)

## ----load_packages------------------------------------------------------------
library(scTypeEval)
library(Matrix)

## ----generate_data------------------------------------------------------------
set.seed(22)

# Number of genes and cells
n_genes <- 5000
n_cells_per_sample <- 500
n_samples <- 6
cell_types <- c("CD4_T", "CD8_T", "B_cells", "NK_cells")

# Generate gene names
gene_names <- paste0("Gene_", seq_len(n_genes))

# Total cells
total_cells <- n_cells_per_sample * n_samples

# Generate metadata: samples and cell types
samples <- rep(paste0("Sample", seq_len(n_samples)), each = n_cells_per_sample)

# Assign cell types evenly within each sample
n_cell_types <- length(cell_types)
cells_per_type <- n_cells_per_sample %/% n_cell_types
celltypes <- rep(rep(cell_types, each = cells_per_type), times = n_samples)

# Initialize count matrix
count_matrix <- matrix(0, nrow = n_genes, ncol = total_cells)
rownames(count_matrix) <- gene_names
colnames(count_matrix) <- paste0("Cell_", seq_len(total_cells))

# Add cell-type-specific expression patterns
for (i in seq_along(cell_types)) {
  ct <- cell_types[i]
  ct_indices <- which(celltypes == ct)
  
  # Marker genes for this cell type (50 genes per type)
  marker_start <- ((i - 1) * 50 + 1)
  marker_end <- min(marker_start + 49, n_genes)
  marker_genes <- marker_start:marker_end
  
  # High expression in marker genes (lambda = 50)
  count_matrix[marker_genes, ct_indices] <- matrix(
    rpois(length(marker_genes) * length(ct_indices), lambda = 50),
    nrow = length(marker_genes)
  )
  
  # Background expression in other genes (lambda = 5)
  other_genes <- setdiff(seq_len(n_genes), marker_genes)
  count_matrix[other_genes, ct_indices] <- matrix(
    rpois(length(other_genes) * length(ct_indices), lambda = 5),
    nrow = length(other_genes)
  )
}

# Add sample-specific batch effects
for (s in seq_len(n_samples)) {
  sample_cells <- which(samples == paste0("Sample", s))
  batch_effect <- rnorm(1, mean = 1, sd = 0.1)
  count_matrix[, sample_cells] <- round(
    count_matrix[, sample_cells] * batch_effect
  )
}

# Convert to sparse matrix
count_matrix <- Matrix(count_matrix, sparse = TRUE)

# Create metadata dataframe
metadata <- data.frame(
  celltype = celltypes,
  sample = samples,
  batch = rep(
    c("Batch1", "Batch2"),
    each = n_cells_per_sample * (n_samples/2)
  ),
  row.names = colnames(count_matrix),
  stringsAsFactors = FALSE
)

# Preview the data
head(metadata)
dim(count_matrix)

## ----create_object------------------------------------------------------------
# Create scTypeEval object from matrix and metadata
sceval <- create_scTypeEval(
  matrix = count_matrix,
  metadata = metadata
)


## ----process_data-------------------------------------------------------------
# Process data with filtering criteria
sceval <- run_processing_data(
  scTypeEval = sceval,
  ident = "celltype",          # Column defining cell type identities
  sample = "sample",           # Column defining sample IDs
  min_samples = 3,             # Minimum samples required per cell type
  min_cells = 10               # Minimum cells per sample-celltype combination
)


## ----hvg----------------------------------------------------------------------
# Identify HVGs using the basic method
sceval <- run_hvg(
  scTypeEval = sceval,
  var_method = "basic", # Method for variance modeling: "basic" or "scran"
  ngenes = 1000,        # Number of HVGs to retain
  sample = TRUE         # Perform sample-level blocking
)

# View stored gene lists
gene_lists <- sceval@gene_lists
names(gene_lists)
if ("HVG" %in% names(gene_lists)) {
  length(gene_lists$HVG)
}

## ----markers------------------------------------------------------------------
# Identify marker genes per cell type
sceval <- run_gene_markers(
  scTypeEval = sceval,
  method = "scran.findMarkers",  # Method for finding markers
  ngenes_celltype = 50           # Maximum markers per cell type
)

# View marker genes
gene_lists <- sceval@gene_lists
if (!is.null(gene_lists$scran.findMarkers)) {
  head(gene_lists$scran.findMarkers, 20)
}

## ----custom_genes-------------------------------------------------------------
# Example: Add a custom list of immune response genes
custom_genes <- c("Gene_1", "Gene_50", "Gene_100", "Gene_150")

sceval <- add_gene_list(
  scTypeEval = sceval,
  gene_list = list("custom_genes" = custom_genes)
)

## ----explore_gene_lists-------------------------------------------------------
gene_lists <- sceval@gene_lists
names(gene_lists)

## ----pca----------------------------------------------------------------------
# Run PCA on processed data
sceval <- run_pca(
  scTypeEval = sceval,
  ndim = 20,         # Number of principal components
  gene_list = "HVG" # specify gene list to subset if multiple were added
)

# View available reductions
reductions <- sceval@reductions
if (length(reductions) > 0) {
  names(reductions)
}

## ----dissim_pseudobulk--------------------------------------------------------
# Euclidean distance on pseudobulk profiles
sceval <- run_dissimilarity(
  sceval,
  method = "Pseudobulk:Euclidean",
  reduction = FALSE # Use processed expression data of selected features
)

# Cosine distance on PCA embeddings
sceval <- run_dissimilarity(
  sceval,
  method = "Pseudobulk:Cosine",
  reduction = TRUE               # Use PCA space
)

## ----dissim_wasserstein-------------------------------------------------------
# Wasserstein distance on PCA embeddings (faster)
sceval <- run_dissimilarity(
  sceval,
  method = "WasserStein",
  reduction = TRUE
)

## ----dissim_reciprocal--------------------------------------------------------
# Reciprocal classification with SingleR
sceval <- run_dissimilarity(
  sceval,
  method = "recip_classif:Match",
  reciprocal_classifier = "SingleR",
  # usage of dimensional reduction not supported
  # for this dissimilarity approach
  reduction = FALSE 
)

## ----view_dissim--------------------------------------------------------------
# List all computed dissimilarity matrices
dissim <- sceval@dissimilarity
if (length(dissim) > 0) {
  names(dissim)
}

## ----consistency_silhouette---------------------------------------------------
# Compute silhouette consistency on Euclidean dissimilarity
consistency_sil <- get_consistency(
  sceval,
  dissimilarity_slot = "Pseudobulk:Euclidean",
  consistency_metric = "silhouette"
)

# View results
print(consistency_sil)

## ----consistency_neighborhood-------------------------------------------------
# Compute neighborhood purity on Wasserstein dissimilarity
consistency_nbhd <- get_consistency(
  sceval,
  dissimilarity_slot = "WasserStein",
  consistency_metric = "NeighborhoodPurity"
)

# View results
print(consistency_nbhd)

## ----compare_metrics----------------------------------------------------------
consistency_all <- 
  get_consistency(
    sceval,
    dissimilarity_slot = c("Pseudobulk:Euclidean",
                           "WasserStein"), # allow multiple arguments
    consistency_metric = c("silhouette",
                           "NeighborhoodPurity",
                           "Average_similarity") # allow multiple arguments
  )

print(consistency_all)

## ----plot_heatmap, fig.width=8, fig.height=7----------------------------------
# Plot dissimilarity heatmap with consistency-based ordering
plot_heatmap(
  sceval,
  dissimilarity_slot = "Pseudobulk:Euclidean",
  sort_consistency = "silhouette",      # Order by silhouette scores
  sort_similarity = NULL                 # Or order by similarity
)

## ----pca_pseudobulk-----------------------------------------------------------
plot_pca(
  sceval,
  reduction_slot = "pseudobulk"
)

## ----session_info-------------------------------------------------------------
sessionInfo()

