###Differential Methylation Analysis with Whole Genome Bisulfite Sequencing (WGBS) Data in TCGA
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
library(BiocManager)
BiocManager::install("tcgaWGBSData.hg19", version = "devel")Data can be extracted in the following way:
tcga_data <- eh[["EH1661"]]
TCGA_bs <- eh[["EH1662"]]
tcga_eh_directory <- dirname(tcga_data)
assay_file <- tcga_data
rds_file <- paste0(tcga_eh_directory, '/1662')
file.rename(from=assay_file, to=paste0(tcga_eh_directory, '/assays.h5'))
file.rename(from=rds_file, to=paste0(tcga_eh_directory, '/se.rds'))
TCGA_bs <- HDF5Array::loadHDF5SummarizedExperiment(tcga_eh_directory)Phenotypic data can be extracted by
Methylation Comparison between normal and tumor sample
library(ggplot2)
cov_matrix <- bsseq::getCoverage(TCGA_bs)
meth_matrix <- bsseq::getCoverage(TCGA_bs, type='M')
meth_matrix <- meth_matrix/cov_matrix
# Get total CpG coverage
totCov <- colSums(cov_matrix>0)
# Restrict to CpGs with minimum read covergae of 10
meth_matrix[cov_matrix<10] <- NA 
meanMethylation <- DelayedArray::colMeans(meth_matrix, na.rm=TRUE)
sampleType <- phenoData$sample.type
Df <- data.frame('mean-methylation' = meanMethylation, 'type' = sampleType)
g <- ggplot2::ggplot() 
g <- g + ggplot2::geom_boxplot(data=Df,ggplot2::aes(x=type,y=mean.methylation))
g <- g + ggplot2::xlab('sample type') + ggplot2::ylab('Methylation') 
g <- g + ggplot2::theme(axis.text.x = element_text(angle = 0, hjust = 1))
gAs expected overall methylation is lower in tumor samples compared to the normal samples.
In this analysis we are using functions from bsseq package to conduct the differntial methylation analysis.
# In this analysis we restrict the data to just chromosome 22
chrIndex <- seqnames(TCGA_bs) == 'chr22'
# Also we are only conducting the analysis for 3 normal and tumor pairs
group1 <- c(11, 6, 23) # normal samples
group2 <- c(20, 26, 25) # Tumor samples
subSample <- c(group1, group2)
TCGA_bs_sub <- updateObject(TCGA_bs[chrIndex, subSample])
TCGA_bs_sub.fit <- bsseq::BSmooth(TCGA_bs_sub, mc.cores = 2, verbose = TRUE)
TCGA_bs_sub.tstat <- bsseq::BSmooth.tstat(TCGA_bs_sub.fit, 
                                group1 = c(1, 2, 3),
                                group2 = c(4, 5, 6), 
                                estimate.var = "group2",
                                local.correct = TRUE,
                                verbose = TRUE)
plot(TCGA_bs_sub.tstat)dmrs0 <- bsseq::dmrFinder(TCGA_bs_sub.tstat, cutoff = c(-4.6, 4.6))
dmrs <- subset(dmrs0, n >= 3 & abs(meanDiff) >= 0.1)
nrow(dmrs)
head(dmrs, n = 3)
pData <- pData(TCGA_bs_sub.fit)
pData$col <- rep(c("red", "blue"), each = 3)
pData(TCGA_bs_sub.fit) <- pData
bsseq::plotRegion(TCGA_bs_sub.fit, dmrs[1,], extend = 5000, addRegions = dmrs)## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.13-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.13-bioc/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.27     R6_2.5.0          jsonlite_1.7.2    magrittr_2.0.1   
##  [5] evaluate_0.14     rlang_0.4.11      stringi_1.6.2     jquerylib_0.1.4  
##  [9] bslib_0.2.5.1     rmarkdown_2.8     tools_4.1.0       stringr_1.4.0    
## [13] xfun_0.23         yaml_2.2.1        compiler_4.1.0    htmltools_0.5.1.1
## [17] knitr_1.33        sass_0.4.0