We illustrate the GenomicOZone package with an example. The Escherichia coli transcriptome data set (Gault and Rodrigue 2016) contains transcriptome profiles of E. coli K-12 under nickel stress, with three samples under exposure to nickel and three normal samples.
To run this example, two other packages GEOquery
(Davis and Meltzer 2007) for data download and
readxl
for Excel file reading are required and should have
been installed, in addition to GenomicOZone
.
require(GenomicOZone)
# require(GEOquery)
require(readxl)
The following code chunk reads a data transcriptome set, prepares
genome annotation, and creates parameters before outstanding zone
analysis. Specifically, an Escherichia coli transcriptome data
set of series number GSE76167 was downloaded from GEO database (Edgar, Domrachev, and Lash 2002) and saved in
the package. We accessed the data and extracted the RPKM values from six
samples including three stressed samples and three wild-type samples.
The genome annotation is prepared as a GRanges
(Lawrence et al. 2013) object.
# Obtain data matrix from GSE76167 supplementary file
# invisible(getGEOSuppFiles("GSE76167"))
# data <- read_excel(".//GSE76167_GeneFPKM_AllSamples.xlsx")
<- system.file("extdata", "GSE76167_GeneFPKM_AllSamples.xlsx", package = "GenomicOZone", mustWork = TRUE)
file <- read_excel(file)
data
# Adjust the input data
<- data[,1:5]
data.info <- data[,-c(1:5)]
data <- data[,substr(colnames(data), 1, 4) == "FPKM"]
data <- data.matrix(data[,c(1,5,6,3,4,2)])
data colnames(data) <- c(paste(rep("WT",3), "_", c(1,2,3), sep=""),
paste(rep("Ni",3), "_", c(1,2,3), sep=""))
rownames(data) <- data.info$tracking_id
# Obtain genes
<- data.info$gene_short_name
data.genes == "-"] <- data.info$tracking_id[data.genes == "-"]
data.genes[data.genes
# Create colData
<- data.frame(Sample_name = colnames(data),
colData Condition = factor(rep(c("WT", "Ni"), each = 3),
levels = c("WT", "Ni")))
# Create design
<- ~ Condition
design
# Create rowData.GRanges
<- "(.[^\\:]*)\\:([0-9]+)\\-([0-9]+)"
pattern <- regexec(pattern, as.character(data.info$locus))
matched <- regmatches(as.character(data.info$locus), matched)
values <- data.frame(chr = as.character(sapply(values, function(x){x[[2]]})),
data.gene.coor start = as.numeric(sapply(values, function(x){x[[3]]})),
end = as.numeric(sapply(values, function(x){x[[4]]})))
rownames(data.gene.coor) <- as.character(data.info$tracking_id)
<- GRanges(seqnames = data.gene.coor$chr,
rowData.GRanges IRanges(start = data.gene.coor$start,
end = data.gene.coor$end),
Gene.name = data.genes)
names(rowData.GRanges) <- data.info$tracking_id
<- 4646332
chr.size names(chr.size) <- "NC_007779"
seqlevels(rowData.GRanges) <- names(chr.size)
seqlengths(rowData.GRanges) <- chr.size
With the formatted data, parameters, and annotation, we run the outstanding zone analysis as below:
# Create an input object also checking for data format, consistency, and completeness
<- GOZDataSet(data = data,
GOZ.ds colData = colData,
design = design,
rowData.GRanges = rowData.GRanges)
# Run the outstanding zone analysis
<- GenomicOZone(GOZ.ds) GOZ.ds
The following four auxiliary functions extract gene annotation, zone annotation, outstanding zone annotation, and zone expression matrix, respectively:
# Extract gene/zone GRanges object
<- extract_genes(GOZ.ds)
Gene.GRanges head(Gene.GRanges)
#> GRanges object with 6 ranges and 2 metadata columns:
#> seqnames ranges strand | Gene.name zone
#> <Rle> <IRanges> <Rle> | <character> <factor>
#> gene0 NC_007779 189-255 * | thrL NC_007779_1
#> gene1 NC_007779 336-2799 * | thrA NC_007779_1
#> gene2 NC_007779 2800-3733 * | thrB NC_007779_1
#> gene3 NC_007779 3733-5020 * | thrC NC_007779_1
#> gene4 NC_007779 5233-5530 * | yaaX NC_007779_1
#> gene5 NC_007779 5682-6459 * | yaaA NC_007779_1
#> -------
#> seqinfo: 1 sequence from an unspecified genome
<- extract_zones(GOZ.ds)
Zone.GRanges head(Zone.GRanges)
#> GRanges object with 6 ranges and 3 metadata columns:
#> seqnames ranges strand | zone p.value.adj
#> <Rle> <IRanges> <Rle> | <factor> <numeric>
#> NC_007779_1 NC_007779 1-10641 * | NC_007779_1 5.12662e-02
#> NC_007779_2 NC_007779 10642-37896 * | NC_007779_2 7.18169e-01
#> NC_007779_3 NC_007779 37897-72227 * | NC_007779_3 1.43496e-02
#> NC_007779_4 NC_007779 72228-96000 * | NC_007779_4 1.72429e-06
#> NC_007779_5 NC_007779 96001-117750 * | NC_007779_5 3.22434e-05
#> NC_007779_6 NC_007779 117751-147942 * | NC_007779_6 9.22932e-01
#> effect.size
#> <numeric>
#> NC_007779_1 7.46667e-02
#> NC_007779_2 9.40623e-04
#> NC_007779_3 4.08649e-02
#> NC_007779_4 1.86783e-01
#> NC_007779_5 1.52381e-01
#> NC_007779_6 6.61376e-05
#> -------
#> seqinfo: 1 sequence from an unspecified genome
# min.effect.size = 0.36 is chosen from the
# minumum of top 5% effect size values
<- extract_outstanding_zones(
OZone.GRanges
GOZ.ds,alpha = 0.05,
min.effect.size = 0.36)
head(OZone.GRanges)
#> GRanges object with 6 ranges and 3 metadata columns:
#> seqnames ranges strand | zone p.value.adj
#> <Rle> <IRanges> <Rle> | <factor> <numeric>
#> NC_007779_9 NC_007779 194902-211875 * | NC_007779_9 2.24272e-08
#> NC_007779_20 NC_007779 505826-538369 * | NC_007779_20 7.00190e-18
#> NC_007779_40 NC_007779 1027532-1044650 * | NC_007779_40 3.67993e-11
#> NC_007779_127 NC_007779 3256940-3288667 * | NC_007779_127 4.06311e-20
#> NC_007779_140 NC_007779 3542408-3569464 * | NC_007779_140 1.33424e-28
#> NC_007779_164 NC_007779 4190532-4193530 * | NC_007779_164 3.55346e-13
#> effect.size
#> <numeric>
#> NC_007779_9 0.369738
#> NC_007779_20 0.367495
#> NC_007779_40 0.368724
#> NC_007779_127 0.384379
#> NC_007779_140 0.591375
#> NC_007779_164 0.771429
#> -------
#> seqinfo: 1 sequence from an unspecified genome
<- extract_zone_expression(GOZ.ds)
Zone.exp.mat head(Zone.exp.mat)
#> WT_1 WT_2 WT_3 Ni_1 Ni_2 Ni_3
#> NC_007779_1 62254.279 84002.857 73827.669 48028.196 55300.012 49693.918
#> NC_007779_2 14081.429 30395.490 34327.503 31066.653 31408.564 35835.076
#> NC_007779_3 3969.418 4348.122 4297.210 5082.839 6690.894 6168.065
#> NC_007779_4 10769.938 10361.703 11069.645 13117.399 14347.793 16482.038
#> NC_007779_5 5118.158 4878.421 4847.403 4985.081 4368.440 4238.027
#> NC_007779_6 8668.344 8609.166 9161.151 9130.383 8987.597 8840.630
Three types of plot can be generated from the returned object by
GenomicOZone()
, including genome-wide overview,
within-chromosome heatmap, and within-zone expression. The plots are
generated using R package ggplot2
(Wickham 2016) and ggbio
(Yin, Cook, and Lawrence 2012). The value of
min.effect.size = 0.36
is chosen from the minumum of top 5%
effect size values. The effect size for ANOVA is calculated by R package
sjstats
(Lüdecke 2019).
# Genome-wide overview
plot_genome(GOZ.ds, plot.file = "E_coli_genome.pdf",
plot.width = 15, plot.height = 4,
alpha = 0.05, min.effect.size = 0.36)
#> Warning: Ignoring unknown parameters: fill
#> Scale for 'x' is already present. Adding another scale for 'x', which will
#> replace the existing scale.
#> Scale for 'x' is already present. Adding another scale for 'x', which will
#> replace the existing scale.
#> NULL
::include_graphics("E_coli_genome.pdf") knitr
# Within-chromosome heatmap
plot_chromosomes(GOZ.ds, plot.file = "E_coli_chromosome.pdf",
plot.width = 20, plot.height = 4,
alpha = 0.05, min.effect.size = 0.36)
#> png
#> 2
::include_graphics("E_coli_chromosome.pdf") knitr
# Within-zone expression
plot_zones(GOZ.ds, plot.file = "E_coli_zone.pdf",
plot.all.zones = FALSE,
alpha = 0.05, min.effect.size = 0.36)
#> Warning: `fun.y` is deprecated. Use `fun` instead.
#> `fun.y` is deprecated. Use `fun` instead.
#> `fun.y` is deprecated. Use `fun` instead.
#> `fun.y` is deprecated. Use `fun` instead.
#> `fun.y` is deprecated. Use `fun` instead.
#> `fun.y` is deprecated. Use `fun` instead.
#> `fun.y` is deprecated. Use `fun` instead.
#> `fun.y` is deprecated. Use `fun` instead.
#> `fun.y` is deprecated. Use `fun` instead.
#> png
#> 2
::include_graphics("E_coli_zone.pdf") knitr