## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    eval = TRUE,
    warning = FALSE,
    message = FALSE,
    fig.align = "center",
    out.width = "80%",
    dev = "png"
)

options(bitmapType = "cairo")
set.seed(1)

## ----logo, echo=FALSE, eval=TRUE, out.width='40%'-----------------------------
knitr::include_graphics("../man/figures/clamp.png", dpi = 800)

## ----install, eval=FALSE------------------------------------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE)) {
#     install.packages("BiocManager")
# }
# BiocManager::install("CLAMP")

## ----install-dev, eval=FALSE--------------------------------------------------
# BiocManager::install("chikinalab/CLAMP")

## ----load-packages------------------------------------------------------------
library(CLAMP)
library(CLAMPData)
library(dplyr)
library(rsvd)
library(glmnet)
library(Matrix)
library(rhdf5)
library(data.table)
library(bigstatsr)
library(here)
library(AnnotationDbi)
library(org.Hs.eg.db)
library(DT)
library(DiagrammeR)

## ----CLAMP-workflow, echo=FALSE, warning=FALSE, message=FALSE, fig.width=12, fig.height=9, out.width='100%'----
grViz('
 digraph CLAMP_pipeline {
   graph [
     layout=dot, rankdir=TB, splines=ortho,
     ranksep=0.6, nodesep=0.45, margin=0.03
   ]
   node [fontname=Helvetica, fontsize=16, margin="0.10,0.05", penwidth=1.2]
   edge [arrowsize=0.7, penwidth=1.1]

   // ---- Inputs & Priors (gray) ----
   DF     [shape=folder,   style=filled, fillcolor=Gray90, label="data.frame"]
   H5     [shape=folder,   style=filled, fillcolor=Gray90, label="H5 file"]
   GO [shape=cylinder, style=filled, fillcolor=Gray90,
       label="GO Biological Process"]
   MSIGDB [shape=cylinder, style=filled, fillcolor=Gray90,
           label="MSigDB Hallmark"]
   GTEx   [shape=cylinder, style=filled, fillcolor=Gray90, label="GTEx Tissues"]
   GMTFILE [shape=folder,  style=filled, fillcolor=Gray90, label="GMT file"]

   // ---- Functions (white) ----
   node [shape=rectangle, style=filled, fillcolor=White]
   cpmDF      [label="cpmCLAMP()"]
   preDF      [label="preprocessCLAMP()"]
   zDF        [label="zscoreCLAMP()"]
   selKMem    [label="select_svd_k()"]
   svdMem     [label="compute_svd()"]
   cpmFBM     [label="cpmCLAMPFBM()"]
   preFBM     [label="preprocessCLAMPFBM()"]
   zFBM       [label="zscoreCLAMPFBM()"]
   selKFBM    [label="select_svd_k()"]
   svdFBM     [label="compute_svd()"]
   numPC      [label="select_clamp_k()"]
   getGMTfunc [label="getGMT()"]
   readGMT    [label="read_gmt()"]
   gmtList    [label="gmtListToSparseMat()"]
   matchPaths [label="getMatchedPathwayMat()"]
   base       [label="CLAMPbase()"]
   full       [label="CLAMPfull()"]

   // ---- Outputs (light-blue tabs) ----
   node [shape=tab, style=filled, fillcolor=LightBlue]
   cpmOut       [label="Y_cpm"]
   filtered     [label="Y_filtered, rowStats"]
   zscored      [label="Y_zscored"]
   svdK         [label="svd_k"]
   svdRes       [label="svdres"]
   clampK       [label="clamp_k"]
   cpmFBMOut    [label="FBM_cpm"]
   filteredFBM  [label="FBM_filtered, rowStats"]
   zscoredFBM   [label="FBM_zscored"]
   svdKFBM      [label="svd_k"]
   svdResFBM    [label="svdres"]
   pathMat      [label="pathMat"]
   matchedPaths [label="matchedPathways"]
   baseResult   [label="CLAMPbase.result"]
   Zmat         [label="Z"]
   Bmat         [label="B"]
   summaryTbl   [label="summary"]
   Umat         [label="U"]

   // ---- In-memory branch ----
   DF -> cpmDF -> cpmOut -> preDF -> filtered -> zDF -> 
        zscored -> selKMem -> svdK
   svdK -> svdMem
   zscored -> svdMem -> svdRes
   svdRes -> numPC
   svdK -> numPC
   numPC -> clampK
   svdRes -> base
   clampK -> base
   zscored -> base
   base -> baseResult -> full

   // ---- On-disk branch ----
   H5 -> cpmFBM -> cpmFBMOut -> preFBM -> filteredFBM -> 
        zFBM -> zscoredFBM -> selKFBM -> svdKFBM
   svdKFBM -> svdFBM
   zscoredFBM -> svdFBM -> svdResFBM
   svdResFBM -> numPC
   svdKFBM -> numPC
   svdResFBM -> base
   zscoredFBM -> base

   // ---- Prior-knowledge branch ----
   GO -> getGMTfunc
   MSIGDB -> getGMTfunc
   GTEx -> getGMTfunc
   GMTFILE -> readGMT
   getGMTfunc -> gmtList
   readGMT -> gmtList
   gmtList -> pathMat -> matchPaths -> matchedPaths
   matchedPaths -> full

   // ---- CLAMPfull outputs ----
   full -> Zmat
   full -> Bmat
   full -> summaryTbl
   full -> Umat
 }',
    width = "100%",
    height = "900px"
)

## ----list-clamp-data----------------------------------------------------------
list_clamp_data()

## ----load-data, message=FALSE-------------------------------------------------
data("dataWholeBlood") # expression matrix
dim(dataWholeBlood) # genes x samples
dataWholeBlood[1:6, 1:6] # genes x samples

## ----preprocess, message=TRUE-------------------------------------------------
#  CPM normalization
dataWholeBlood_cpm <- cpmCLAMP(dataWholeBlood)

# Filter and compute row statistics
prep_wb <- preprocessCLAMP(
    Y = dataWholeBlood_cpm,
    mean_cutoff = 0.5,
    var_cutoff = 0.1
)

# Extract filtered matrix and rowStats
wb_Y_filtered <- prep_wb$Y_filtered
wb_rowStats <- prep_wb$rowStats


# Z-score normalization
wb_Y_z <- zscoreCLAMP(
    Y_filtered = wb_Y_filtered,
    rowStats = wb_rowStats
)

## ----wb-svd, message=FALSE----------------------------------------------------
# Select SVD rank and compute SVD
wb_svd_k   <- select_svd_k(wb_Y_z)
wb_svd     <- compute_svd(wb_Y_z, k = wb_svd_k)

# Select clamp_k (elbow method by default)
wb_clamp_k <- select_clamp_k(wb_svd, n_samples = ncol(wb_Y_z), svd_k = wb_svd_k)
wb_clamp_k

## ----CLAMPbase, message=FALSE-------------------------------------------------
wb_baseRes <- CLAMPbase(
    Y = wb_Y_z,
    svdres = wb_svd,
    clamp_k = wb_clamp_k
)

## ----priors-download, eval=FALSE----------------------------------------------
# # How to download pathway and cell marker libraries from Enrichr.
# # Not run during vignette build to avoid network calls; pre-fetched
# # .rds files are loaded in the next chunk instead.
# enrichr_url <- "https://maayanlab.cloud/Enrichr/geneSetLibrary"
# gmtList <- list(
#     CellMarkers = getGMT(
#         paste0(enrichr_url, "?mode=text&libraryName=CellMarker_2024"),
#         "CellMarker_2024"
#     ),
#     KEGG = getGMT(
#         paste0(enrichr_url, "?mode=text&libraryName=KEGG_2021_Human"),
#         "KEGG_2021_Human"
#     )
# )

## ----priors, message=FALSE----------------------------------------------------
# Load pre-fetched gene set libraries bundled with the package
gmtList <- list(
    CellMarkers = readRDS(
        system.file("extdata", "CellMarker_2024.rds", package = "CLAMP")
    ),
    KEGG = readRDS(
        system.file("extdata", "KEGG_2021_Human.rds", package = "CLAMP")
    )
)

# Combine into a single sparse matrix
pathMatCell <- gmtListToSparseMat(gmtList)

# Load additional xCell reference matrix
data("xCell")

# Match pathways to the gene space of whole blood
matchedPathsWB <- getMatchedPathwayMatList(
    pathMatCell, xCell,
    new.genes = rownames(dataWholeBlood),
    min.genes = 2
)

## ----CLAMPfull, message=FALSE-------------------------------------------------
wb_fullRes <- CLAMPfull(
    wb_Y_z,
    priorMat = matchedPathsWB,
    clamp.base.result = wb_baseRes,
    svdres = wb_svd,
    clamp_k = wb_clamp_k,
    use_cpp = TRUE
)

## ----summary-table------------------------------------------------------------
# Display significant latent variables
wb_summary_df <- as.data.frame(wb_fullRes$summary) %>%
    dplyr::filter(FDR < 0.05 & AUC > 0.7) %>%
    dplyr::arrange(FDR) %>%
    dplyr::select(LV, pathway, FDR, AUC)

datatable(
    wb_summary_df,
    filter = "top",
    options = list(
        pageLength = 10,
        autoWidth  = TRUE
    ),
    rownames = FALSE,
    class = "stripe hover compact"
) %>%
    formatSignif(c("AUC", "FDR"), 3)

## ----fbm-cleanup, message=FALSE-----------------------------------------------
output_dir <- here("output", "alzFBM")
fbm_base <- file.path(output_dir, "FBMalz")
bk_paths <- paste0(fbm_base, c(".bk", "_preproc.bk", "_preproc_filtered.bk"))
file.remove(bk_paths[file.exists(bk_paths)])

## ----alz-schema---------------------------------------------------------------
clamp_h5_schema()

## ----hdf5-load, message=FALSE-------------------------------------------------
expr_mat <- read_clamp_alz_expression()
genes <- rownames(expr_mat)

## ----fbm-construct, message=FALSE---------------------------------------------
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
alzFBM <- FBM(
    nrow = nrow(expr_mat), ncol = ncol(expr_mat),
    backingfile = fbm_base
)
blk <- 1000

for (i in seq_len(ceiling(nrow(expr_mat) / blk))) {
    rows <- ((i - 1) * blk + 1):min(i * blk, nrow(expr_mat))
    alzFBM[rows, ] <- expr_mat[rows, , drop = FALSE]
}

## ----fbm-preprocess, message=FALSE--------------------------------------------
prep_alz <- preprocessCLAMPFBM(
    fbm = alzFBM,
    mean_cutoff = 0.5,
    var_cutoff = 0.1
)

alz_fbm_filt <- prep_alz$fbm_filtered
alz_rowStats <- prep_alz$rowStats
zscoreCLAMPFBM(alz_fbm_filt, alz_rowStats)
alz_genes <- genes[prep_alz$kept_rows]

## ----fbm-svd, message=FALSE---------------------------------------------------
# Select SVD rank and compute SVD (dispatches to bigstatsr for FBM)
alz_svd_k   <- select_svd_k(alz_fbm_filt)
alz_svd     <- compute_svd(alz_fbm_filt, k = alz_svd_k)

# Select clamp_k (elbow method by default)
alz_clamp_k <- select_clamp_k(alz_svd, n_samples = ncol(alz_fbm_filt),
                              svd_k = alz_svd_k)
alz_clamp_k

## ----fbm-CLAMPbase, message=FALSE---------------------------------------------
alz_baseRes <- CLAMPbase(
    Y = alz_fbm_filt,
    svdres = alz_svd,
    clamp_k = alz_clamp_k
)

## ----fbm-priors-download, eval=FALSE------------------------------------------
# # How to fetch the libraries; not run during vignette build.
# enrichr_url <- "https://maayanlab.cloud/Enrichr/geneSetLibrary"
# alz_gmtList <- list(
#     GTEx_Tissues = getGMT(
#         paste0(enrichr_url, "?mode=text&libraryName=GTEx_Tissues_V8_2023")
#     ),
#     BP = getGMT(
#         paste0(enrichr_url, "?mode=text&libraryName=GO_Biological_Process_2025")
#     ),
#     MSigDB = getGMT(
#         paste0(enrichr_url, "?mode=text&libraryName=MSigDB_Hallmark_2020")
#     )
# )

## ----fbm-priors, message=FALSE------------------------------------------------
alz_gmtList <- list(
    GTEx_Tissues = readRDS(
        system.file("extdata", "GTEx_Tissues_V8_2023.rds", package = "CLAMP")
    ),
    BP = readRDS(
        system.file(
            "extdata", "GO_Biological_Process_2025.rds",
            package = "CLAMP"
        )
    ),
    MSigDB = readRDS(
        system.file("extdata", "MSigDB_Hallmark_2020.rds", package = "CLAMP")
    )
)

alz_pathMat <- gmtListToSparseMat(alz_gmtList)
alz_matched <- getMatchedPathwayMat(alz_pathMat, alz_genes)

## ----fbm-CLAMPfull, message=FALSE---------------------------------------------
alz_fullRes <- CLAMPfull(
    alz_fbm_filt,
    priorMat = alz_matched,
    clamp.base.result = alz_baseRes,
    svdres = alz_svd,
    clamp_k = alz_clamp_k,
    use_cpp = TRUE
)

## ----fbm-summary--------------------------------------------------------------
alz_summary_df <- as.data.frame(alz_fullRes$summary) %>%
    dplyr::filter(FDR < 0.05 & AUC > 0.7) %>%
    dplyr::arrange(FDR) %>%
    dplyr::select(LV, pathway, FDR, AUC)

datatable(
    alz_summary_df,
    filter = "top",
    options = list(
        pageLength = 10,
        autoWidth  = TRUE
    ),
    rownames = FALSE,
    class = "stripe hover compact"
) %>%
    formatSignif(c("AUC", "FDR"), 3)

## ----islet-load, message=FALSE------------------------------------------------
islet_df <- read_islet_counts()

islet_df$symbol <- mapIds(org.Hs.eg.db,
    keys = islet_df$ensembl,
    column = "SYMBOL",
    keytype = "ENSEMBL",
    multiVals = "first"
)

islet_df <- islet_df[!is.na(islet_df$symbol), ]

## ----islet-aggregate, message=FALSE-------------------------------------------
# Sum counts per symbol
setDT(islet_df)
num_cols <- names(islet_df)[sapply(islet_df, is.numeric)]
expr <- islet_df[, lapply(.SD, sum), by = symbol, .SDcols = num_cols]
expr <- as.data.frame(expr)
rownames(expr) <- expr$symbol
expr$symbol <- NULL
expr <- as.matrix(expr)

## ----islet-preprocess, message=FALSE------------------------------------------
prep_is <- preprocessCLAMP(
    Y = expr,
    mean_cutoff = 0.5,
    var_cutoff = 0.1
)

iso_Yf <- prep_is$Y_filtered
iso_rowS <- prep_is$rowStats

iso_Yz <- zscoreCLAMP(
    Y_filtered = iso_Yf,
    rowStats = iso_rowS
)

## ----islet-svd, message=FALSE-------------------------------------------------
# Select SVD rank and compute SVD
islet_svd_k   <- select_svd_k(iso_Yz)
islet_svd     <- compute_svd(iso_Yz, k = islet_svd_k)

# Select clamp_k (elbow method by default)
islet_clamp_k <- select_clamp_k(islet_svd, n_samples = ncol(iso_Yz),
                                svd_k = islet_svd_k)
islet_clamp_k

## ----islet-CLAMPbase, message=FALSE-------------------------------------------
islet_baseRes <- CLAMPbase(
    Y = iso_Yz,
    svdres = islet_svd,
    clamp_k = islet_clamp_k
)

## ----islet-priors-download, eval=FALSE----------------------------------------
# # How to fetch the libraries; not run during vignette build.
# enrichr_url <- "https://maayanlab.cloud/Enrichr/geneSetLibrary"
# islet_gmtList <- list(
#     GTEx_Tissues = getGMT(
#         paste0(enrichr_url, "?mode=text&libraryName=GTEx_Tissues_V8_2023")
#     ),
#     Diabetes_Perturbations = getGMT(
#         paste0(
#             enrichr_url,
#             "?mode=text&libraryName=Diabetes_Perturbations_GEO_2022"
#         )
#     ),
#     MSigDB_Hallmark = getGMT(
#         paste0(enrichr_url, "?mode=text&libraryName=MSigDB_Hallmark_2020")
#     )
# )

## ----islet-priors, message=FALSE----------------------------------------------
islet_gmtList <- list(
    GTEx_Tissues = readRDS(
        system.file("extdata", "GTEx_Tissues_V8_2023.rds", package = "CLAMP")
    ),
    Diabetes_Perturbations = readRDS(
        system.file(
            "extdata", "Diabetes_Perturbations_GEO_2022.rds",
            package = "CLAMP"
        )
    ),
    MSigDB_Hallmark = readRDS(
        system.file("extdata", "MSigDB_Hallmark_2020.rds", package = "CLAMP")
    )
)

islet_pathMat <- gmtListToSparseMat(islet_gmtList)
islet_matched <- getMatchedPathwayMat(islet_pathMat, rownames(iso_Yz))
islet_chatObj <- getChat(islet_matched)

## ----islet-CLAMPfull, message=FALSE-------------------------------------------
islet_fullRes <- CLAMPfull(
    iso_Yz,
    priorMat = islet_matched,
    clamp.base.result = islet_baseRes,
    svdres = islet_svd,
    clamp_k = islet_clamp_k,
    use_cpp = TRUE
)

## ----islet-summary------------------------------------------------------------
islet_summary_df <- as.data.frame(islet_fullRes$summary) %>%
    dplyr::filter(FDR < 0.05 & AUC > 0.7) %>%
    dplyr::arrange(FDR) %>%
    dplyr::select(LV, pathway, FDR, AUC)

datatable(
    islet_summary_df,
    filter = "top",
    options = list(
        pageLength = 10,
        autoWidth  = TRUE
    ),
    rownames = FALSE,
    class = "stripe hover compact"
) %>%
    formatSignif(c("AUC", "FDR"), 3)

## ----islet-diff, message=FALSE------------------------------------------------
islet_metadata <- read_islet_metadata()

lv_stats_all_vs_nd <- differentialLVActivity(
    islet_fullRes,
    metadata = islet_metadata,
    sample_col = "id",
    group_col = "type",
    reference = "ND"
)

sig_lv_all_vs_nd <- lv_stats_all_vs_nd %>%
    dplyr::filter(FDR < 0.1)

sig_pathway <- islet_summary_df %>%
    dplyr::filter(FDR < 0.05 & AUC > 0.7) %>%
    dplyr::filter(LV %in% sig_lv_all_vs_nd$LV) %>%
    dplyr::arrange(FDR) %>%
    dplyr::select(LV, pathway, FDR, AUC)

datatable(
    sig_pathway,
    filter = "top",
    options = list(
        pageLength = 10,
        autoWidth  = TRUE
    ),
    rownames = FALSE,
    class = "stripe hover compact"
) %>%
    formatSignif(c("AUC", "FDR"), 3)

## ----project-islet-model-to-whole-blood, message=TRUE-------------------------
islet_model_genes <- rownames(islet_fullRes$Z)
wb_project_genes <- rownames(wb_Y_z)

common_genes <- intersect(islet_model_genes, wb_project_genes)
cat(
    "Overlapping genes:", length(common_genes), "/", length(islet_model_genes),
    "islet model genes",
    sprintf(
        "(%.1f%%)\n",
        100 * length(common_genes) / length(islet_model_genes)
    )
)

# projectCLAMP aligns common row names in the model's gene order
wb_projected_B <- projectCLAMP(islet_fullRes, wb_Y_z)

dim(wb_projected_B)
wb_projected_B[
    seq_len(min(5, nrow(wb_projected_B))),
    seq_len(min(5, ncol(wb_projected_B))),
    drop = FALSE
]

## ----k-elbow------------------------------------------------------------------
select_clamp_k(
    wb_svd,
    n_samples = ncol(wb_Y_z),
    svd_k     = wb_svd_k,
    method    = "elbow"
)

## ----k-permutation, eval=FALSE------------------------------------------------
# select_clamp_k(
#     wb_svd,
#     n_samples = ncol(wb_Y_z),
#     svd_k     = wb_svd_k,
#     method    = "permutation",
#     data      = wb_Y_z,
#     B         = 2
# )

## ----k-gavish, eval=FALSE-----------------------------------------------------
# select_clamp_k(
#     wb_svd,
#     n_samples = ncol(wb_Y_z),
#     svd_k     = wb_svd_k,
#     method    = "gavish_donoho",
#     data      = wb_Y_z
# )

## ----plot-U, message=FALSE, fig.width=14, fig.height=10-----------------------
CLAMPplotU(
    wb_fullRes,
    auc.cutoff = 0.6,
    fdr.cutoff = 0.05,
    top        = 3
)

## ----plot-topZ, message=FALSE, fig.width=9, fig.height=7----------------------
# Use the first few LVs that have pathway support
lv_with_paths <- wb_fullRes$withPrior[
    seq_len(min(4, length(wb_fullRes$withPrior)))
]

CLAMPplotTopZ(
    wb_fullRes,
    top       = 50,
    label.top = 10,
    index     = lv_with_paths
)

## ----plot-topZ1, message=FALSE, fig.width=5, fig.height=3---------------------
# Use the first few LVs that have pathway support
lv_with_paths <- wb_fullRes$withPrior[1]

CLAMPplotTopZ(
    wb_fullRes,
    top       = 50,
    label.top = 10,
    index     = lv_with_paths
)

## ----plot-dot, message=FALSE, fig.width=7, fig.height=4, out.width='100%'-----
CLAMPdotplot(
    wb_fullRes,
    lv         = "LV2",
    top        = 15,
    auc.cutoff = 0.6,
    fdr.cutoff = 0.1,
    x.axis     = "AUC",
    order.by   = "AUC"
)

## ----plot-dot-fdr, message=FALSE, fig.width=7, fig.height=4, out.width='100%'----
CLAMPdotplot(
    wb_fullRes,
    lv         = "LV2",
    top        = 15,
    auc.cutoff = 0.6,
    fdr.cutoff = 0.1,
    x.axis     = "-log10(FDR)",
    order.by   = "-log10(FDR)"
)

## ----plot-dotAll, message=FALSE, fig.width=10, fig.height=7.5, out.width='100%'----
CLAMPdotplotAll(
    wb_fullRes,
    auc.cutoff = 0.65,
    fdr.cutoff = 0.05,
    top.per.lv = 5
)

## ----session-info-------------------------------------------------------------
sessionInfo()

