## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    fig.width = 7,
    fig.height = 5
)

## ----eval=FALSE---------------------------------------------------------------
# # Install from Bioconductor (once submitted)
# if (!requireNamespace("BiocManager", quietly = TRUE)) {
#     install.packages("BiocManager")
# }
# BiocManager::install("DMRsegaldata")

## ----load-package-------------------------------------------------------------
library(DMRsegaldata)

## ----beta-data----------------------------------------------------------------
# Load the beta values matrix
library(ExperimentHub)
eh <- ExperimentHub()
beta <- eh[["EH10275"]]

# Examine the structure
dim(beta)
class(beta)

# Preview the first few rows and columns
beta[1:5, 1:5]

# Summary statistics
summary(beta[1:100, 1])

## ----pheno-data---------------------------------------------------------------
# Load phenotype data
pheno <- eh[["EH10276"]]
# View the structure
str(pheno)
head(pheno)

# Summary of sample groups
table(pheno$Sample_Group)

# Summary of gender distribution
table(pheno$Gender)
table(pheno$Sample_Group, pheno$Gender)

# Age distribution
summary(pheno$Age)

## ----dmps-data----------------------------------------------------------------
# Load DMP results
dmps <- eh[["EH10277"]]

# Examine the structure
head(dmps)
dim(dmps)

# Summary of significance levels
summary(dmps$pval)
summary(dmps$pval_adj)

# Top 10 most significant DMPs
head(dmps, 10)

# Count of significant DMPs at different thresholds
sum(dmps$pval_adj < 0.05)
sum(dmps$pval_adj < 0.01)
sum(dmps$pval_adj < 0.001)

## ----array-type---------------------------------------------------------------
# Load array type information
array_type <- eh[["EH10278"]]
array_type

## ----exploration--------------------------------------------------------------
# Check sample consistency between beta and pheno
all(colnames(beta) == rownames(pheno))

# Calculate mean methylation per sample
mean_methylation <- colMeans(beta, na.rm = TRUE)
names(mean_methylation) <- colnames(beta)

# Compare mean methylation between groups
cancer_samples <- rownames(pheno)[pheno$Sample_Group == "cancer"]
normal_samples <- rownames(pheno)[pheno$Sample_Group == "normal"]

mean_cancer <- mean(mean_methylation[cancer_samples])
mean_normal <- mean(mean_methylation[normal_samples])

cat("Mean methylation in cancer samples:", round(mean_cancer, 4), "\n")
cat("Mean methylation in normal samples:", round(mean_normal, 4), "\n")

## ----visualization, fig.cap="Distribution of mean methylation by sample group"----
# Create a data frame for plotting
plot_data <- data.frame(
    Sample = colnames(beta),
    MeanMethylation = mean_methylation,
    Group = pheno[colnames(beta), "Sample_Group"],
    Age = pheno[colnames(beta), "Age"],
    Gender = pheno[colnames(beta), "Gender"]
)

# Boxplot comparing groups
boxplot(MeanMethylation ~ Group,
    data = plot_data,
    main = "Mean Methylation by Sample Group",
    xlab = "Sample Group", ylab = "Mean Beta Value",
    col = c("cancer" = "lightcoral", "normal" = "lightblue")
)

# Add individual points
stripchart(MeanMethylation ~ Group,
    data = plot_data,
    vertical = TRUE, method = "jitter",
    add = TRUE, pch = 19, col = "black"
)

## ----top-dmps, fig.cap="Methylation levels at top 3 DMPs"---------------------
# Get top 3 DMPs
top_dmps <- head(rownames(dmps), 3)

# Set up plotting area
par(mfrow = c(1, 3))

# Plot each DMP
for (cpg in top_dmps) {
    if (cpg %in% rownames(beta)) {
        cpg_values <- beta[cpg, ]
        boxplot(cpg_values ~ pheno$Sample_Group,
            main = cpg,
            xlab = "Group", ylab = "Beta Value",
            col = c("lightcoral", "lightblue")
        )
        stripchart(cpg_values ~ pheno$Sample_Group,
            vertical = TRUE, method = "jitter",
            add = TRUE, pch = 19, col = "black"
        )
    }
}

par(mfrow = c(1, 1))

## ----volcano-plot, fig.cap="Volcano plot of differentially methylated positions"----
# Calculate effect sizes for DMPs
# (mean difference between cancer and normal)

# show only a subset
dmps_subset <- dmps[sample(rownames(dmps), min(1000, nrow(dmps))), , drop = FALSE]
cancer_beta <- beta[rownames(dmps_subset), cancer_samples]
normal_beta <- beta[rownames(dmps_subset), normal_samples]
cancer_beta_means <- rowMeans(cancer_beta, na.rm = TRUE)
normal_beta_means <- rowMeans(normal_beta, na.rm = TRUE)
dmps_subset$delta_beta <- cancer_beta_means - normal_beta_means

# Create volcano plot
plot(dmps_subset$delta_beta, -log10(dmps_subset$pval),
    xlab = "Delta Beta (Cancer - Normal)",
    ylab = "-log10(p-value)",
    main = "Volcano Plot of DMPs",
    pch = 19, col = ifelse(dmps_subset$pval_adj < 0.01 & abs(dmps_subset$delta_beta) > 0.2, "red", "grey50"),
    cex = 0.5
)
abline(h = -log10(0.05), lty = 2, col = "blue")
abline(v = c(-0.2, 0.2), lty = 2, col = "blue")
legend("topright",
    legend = c("FDR < 0.01 && abs(Delta Beta) > 0.2", "Not significant"),
    col = c("red", "grey50"), pch = 19
)

## ----session-info-------------------------------------------------------------
sessionInfo()

