## ----setup, include=FALSE-----------------------------------------------------
# Solution for the "magick package is required to crop" error:
knitr::opts_chunk$set(
  echo = TRUE, # keep showing code chunks
  message = FALSE, 
  warning = FALSE,
  # This is the line that fixes the cropping issue:
  crop = NULL
)

## ----Install the scECODA package, eval=FALSE----------------------------------
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("scECODA")

## ----Load scECODA-------------------------------------------------------------
library(scECODA)

## ----Load example SingleCellExperiment dataset, message=FALSE, warning=FALSE----
# Loading an example SingleCellExperiment dataset
library(scRNAseq)

all.ds <- as.data.frame(surveyDatasets())
ds <- 1
sce_object <- fetchDataset(
  all.ds[["name"]][ds],
  all.ds[["version"]][ds]
)

names(colData(sce_object))

## ----Create ECODA object------------------------------------------------------
# Create ECODA object
se <- ecoda(
  sce_object,
  sample_col = "sample",
  celltype_col = "cluster"
)

## ----Show example of CLR df---------------------------------------------------
assay(se, "clr")[1:5,1:5]

## ----Create ECODA object from Seurat object-----------------------------------
# Commented out because Seurat depends on igraph which has not been updated
# to work with R 4.6 yet.

# library(Seurat)
# 
# counts <- counts(sce_object)
# metadata <- as.data.frame(SummarizedExperiment::colData(sce_object))
# 
# seurat_object <- CreateSeuratObject(counts = counts, meta.data = metadata)
# 
# se <- ecoda(
#   seurat_object,
#   sample_col = "sample",
#   celltype_col = "cluster"
# )

## ----Load example datasets----------------------------------------------------
# Load example datasets
data(example_data)

Zhang <- example_data$Zhang
counts <- Zhang$cell_counts_lowresolution
freq <- calc_freq(counts)
metadata <- Zhang$metadata

se <- ecoda(data = counts, metadata = metadata)

se <- ecoda(data = freq, metadata = metadata)

## ----Show PCA based on cell type composition, message=FALSE, warning=FALSE----
se <- ecoda(
  data = example_data$Zhang$cell_counts_lowresolution,
  metadata = example_data$Zhang$metadata,
)

plot_pca(
  se,
  label_col = "Tissue",
  title = "PCA based on cell type composition",
  n_hv_feat_show = 5 # Shows the most highly variable features (cell types)
)

## ----PCA based on highly variable cell types----------------------------------
plot_pca(
  se,
  assay = "clr_hvc",
  label_col = "Tissue",
  title = "PCA based on highly variable cell types",
  n_hv_feat_show = nrow(metadata(se)$clr_hvc)
)

## ----3D PCA-------------------------------------------------------------------
plot_pca3d(se, label_col = "Tissue")

## ----Quantifying group separation, message=FALSE, warning=FALSE---------------
# You can get the separation scores like this:
dist_mat <- dist(assay(se, "clr"))
# Or use the pre-calculated distance matrix based on the CLR abundances:
dist_mat <- metadata(se)$sample_distances

labels <- colData(se)$Tissue

anosim_r_score <- calc_anosim(dist_mat, labels)
ari_score <- calc_ari(dist_mat, labels)
# modularity_score <- calc_modularity(dist_mat, labels)
silhouette_score <- calc_sil(dist_mat, labels)

print(paste("ANOSIM R:", anosim_r_score))
print(paste("ARI:", ari_score))
# print(paste("Modularity:", modularity_score))
print(paste("Silhouette:", silhouette_score))

## ----Box plot-----------------------------------------------------------------
plot_boxplot(se, title = "CLR Abundance by Cell Type")

## ----Grouped box plot---------------------------------------------------------
plot_boxplot(
  se,
  label_col = "Tissue",
  title = "CLR Abundance by Cell Type (with Wilcoxon Test)"
)

## ----Bar plot by group--------------------------------------------------------
# Plotting average cell type abundance by experimental group
plot_barplot(
  se,
  label_col = "Tissue",
  plot_by = "group",
  title = "Mean Relative Abundance by Condition"
)

## ----Bar plot by sample-------------------------------------------------------
# Plotting cell type abundance for each sample separately
plot_barplot(
  se,
  label_col = "Tissue",
  plot_by = "sample",
  title = "Relative Abundance for Each Sample"
)

## ----Plot heatmap example 1---------------------------------------------------
plot_heatmap(
  se,
  label_col = c("Clinical.efficacy.", "Tissue"),
  cutree_rows = 3,
  cutree_cols = 3
)

## ----Plot heatmap example 2, fig.height=16, fig.width=12----------------------
se <- ecoda(
  data = example_data$GongSharma_full$cell_counts_highresolution,
  metadata = example_data$GongSharma_full$metadata
)

plot_heatmap(
  se,
  label_col = c("subject.cmv", "age_group"),
  # Additional arguments for pheatmap:
  cutree_rows = 3,
  cutree_cols = 5,
  show_colnames = FALSE
)

## ----Plot heatmap example 2 only HVCs-----------------------------------------
plot_heatmap(
  se,
  assay = "clr_hvc",
  label_col = c("subject.cmv", "age_group"),
  # Additional arguments for pheatmap:
  cutree_rows = 3,
  cutree_cols = 4,
  show_colnames = FALSE
)

## ----Correlation plot, fig.height=16, fig.width=16----------------------------
plot_corr(se)

## ----Correlation plot only HVCs, fig.height=16, fig.width=16------------------
plot_corr(se, assay = "clr_hvc")

## ----Mean-variance plot 50 percent--------------------------------------------
library(ggplot2)
plot_varmean(se, labels = "none") +
  ggtitle("Default - cell types explaining 50% variance")

## ----Mean-variance plot 80 percent--------------------------------------------
se_new <- find_hvcs(se, variance_explained = 0.8)
plot_varmean(se_new) +
  ggtitle("Cell types explaining 80% variance")

## ----Mean-variance plot top 5 HVCs--------------------------------------------
se_new <- find_hvcs(se, top_n_hvcs = 5)
plot_varmean(se_new) +
  ggtitle("Top 5 HVCs")

## ----Load dataset including pseudobulk calculation, message=FALSE, warning=FALSE----
# Loading an example SingleCellExperiment dataset
library(scRNAseq)

all.ds <- as.data.frame(surveyDatasets())
ds <- 1
sce_object <- fetchDataset(
  all.ds[["name"]][ds],
  all.ds[["version"]][ds]
)

se <- ecoda(
  data = sce_object,
  sample_col = "sample",
  celltype_col = "cluster",
  get_pb = TRUE
)

## ----PCA plot showing top 5 features------------------------------------------
plot_pca(
  se,
  label_col = "Condition",
  title = "PCA based on cell type composition",
  n_hv_feat_show = 5
)

## ----PCA plot based on pseudobulk---------------------------------------------
plot_pca(
  se,
  assay = "pb",
  label_col = "Condition",
  title = "PCA based on pseudobulk gene expression"
  #, n_hv_feat_show = 5 # optionally, show top n most highly variable genes
)

## ----Removing cell types, eval=FALSE------------------------------------------
# # Load example datasets
# data(example_data)
# Stephenson <- example_data$Stephenson
# counts <- Stephenson$cell_counts_lowresolution
# metadata <- Stephenson$metadata
# se <- ecoda(data = counts, metadata = metadata)
# 
# # Specify cell types to drop
# cts_to_drop <- c("RBC", "Platelets", "Neutrophils")
# keep <- !rownames(assay(se, "counts")) %in% cts_to_drop
# counts_subset <- assay(se, "counts")[, keep]
# colData_subset <- as.data.frame(colData(se))[keep, ]
# 
# # Re-initialize a new subset object
# # This ensures all transformations (CLR, HVCs, etc.) are recalculated correctly
# se_subset <- ecoda(data = counts_subset, metadata = colData_subset)

## ----sessionInfo--------------------------------------------------------------
sessionInfo()