## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 4.5
)
library(SpliceImpactR)
library(BiocParallel)

## ----install-biocmanager, eval=FALSE------------------------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE)) {
#   install.packages("BiocManager")
# }
# BiocManager::install("SpliceImpactR")

## ----install-devtools, eval=FALSE---------------------------------------------
# if (!requireNamespace("devtools", quietly = TRUE)) {
#   install.packages("devtools")
# }
# devtools::install_github("fiszbein-lab/SpliceImpactR")

## ----load-resources-----------------------------------------------------------
library(SpliceImpactR)
annotation_df <- get_annotation(load = "test")

interpro_features <- get_protein_features(
  biomaRt_databases = c("interpro"),
  gtf_df = annotation_df$annotations,
  test = TRUE
)

signalp_features <- get_protein_features(
  biomaRt_databases = c("signalp"),
  gtf_df = annotation_df$annotations,
  test = TRUE
)

elm_features <- get_protein_features(
  biomaRt_databases = c("elm"),
  gtf_df = annotation_df$annotations,
  test = TRUE
)

protein_feature_total <- get_comprehensive_annotations(
  list(interpro_features, signalp_features, elm_features)
)

exon_features <- get_exon_features(
  annotation_df$annotations,
  protein_feature_total
)

## ----feature-summary----------------------------------------------------------
table(protein_feature_total$database)

## ----load-data----------------------------------------------------------------
sample_frame <- data.frame(
  path = c(
    file.path(system.file("extdata", package = "SpliceImpactR"),
              "rawData/control_S5/"),
    file.path(system.file("extdata", package = "SpliceImpactR"),
              "rawData/control_S6/"),
    file.path(system.file("extdata", package = "SpliceImpactR"),
              "rawData/control_S7/"),
    file.path(system.file("extdata", package = "SpliceImpactR"),
              "rawData/control_S8/"),
    file.path(system.file("extdata", package = "SpliceImpactR"),
              "rawData/case_S1/"),
    file.path(system.file("extdata", package = "SpliceImpactR"),
              "rawData/case_S2/"),
    file.path(system.file("extdata", package = "SpliceImpactR"),
              "rawData/case_S3/"),
    file.path(system.file("extdata", package = "SpliceImpactR"),
              "rawData/case_S4/")
  ),
  sample_name = c("S5", "S6", "S7", "S8", "S1", "S2", "S3", "S4"),
  condition = c(
    "control", "control", "control", "control",
    "case", "case", "case", "case"
  ),
  stringsAsFactors = FALSE
)

## ----wrapper-quickstart, eval=FALSE-------------------------------------------
# # data.table-first return
# out_dt <- get_splicing_impact(
#   sample_frame = sample_frame,
#   source_data = "both",
#   annotation_df = annotation_df,
#   protein_feature_total = protein_feature_total,
#   exon_features = exon_features,
#   return_class = "data.table"
# )
# 
# # S4-first return (single object carries all slots)
# out_s4 <- get_splicing_impact(
#   sample_frame = sample_frame,
#   source_data = "both",
#   annotation_df = annotation_df,
#   protein_feature_total = protein_feature_total,
#   exon_features = exon_features,
#   return_class = "S4"
# )

## ----read-splicing-events-----------------------------------------------------
data <- get_rmats_hit(
  sample_frame,
  event_types = c("ALE", "AFE", "MXE", "SE", "A3SS", "A5SS", "RI")
)

DT <- data.table::as.data.table(data)
DT[, .(
  n_rows = .N,
  n_events = data.table::uniqueN(event_id),
  n_genes = data.table::uniqueN(gene_id)
)]

## ----load-data-separate-------------------------------------------------------
sample_frame_rmats <- sample_frame
sample_frame_hit <- sample_frame

rmats_only <- get_rmats(
  load_rmats(
    sample_frame_rmats,
    use = "JCEC",
    event_types = c("MXE", "SE", "A3SS", "A5SS", "RI")
  )
)

hit_only <- get_hitindex(
  sample_frame_hit,
  keep_annotated_first_last = TRUE
)

shared_cols <- intersect(names(rmats_only), names(hit_only))
data_separate <- data.table::rbindlist(
  list(
    rmats_only[, ..shared_cols],
    hit_only[, ..shared_cols]
  ),
  use.names = TRUE,
  fill = TRUE
)

data_separate[, .(
  n_rows = .N,
  n_events = data.table::uniqueN(event_id),
  n_genes = data.table::uniqueN(gene_id)
)]

## ----hitindex-compare, fig.cap="HITindex condition comparison."---------------
hit_compare <- compare_hit_index(
  sample_frame,
  condition_map = c(control = "control", test = "case")
)

hit_compare$plot

## ----hitindex-table-----------------------------------------------------------
head(hit_compare$results[order(fdr)], 6)

## ----overview-psi, fig.cap="Depth-normalized event counts and PSI overview."----
overview_plot <- overview_spicing_comparison(
  events = data,
  sample_df = sample_frame,
  depth_norm = "exon_files",
  event_type = "AFE"
)

overview_plot

## ----differential-inclusion---------------------------------------------------
res <- get_differential_inclusion(
  data,
  min_total_reads = 10,
  parallel_glm = TRUE,
  BPPARAM = BiocParallel::SerialParam()
)

res_di <- keep_sig_pairs(res)

res[, .(
  n_rows = .N,
  n_sig = sum(padj <= 0.05 & abs(delta_psi) >= 0.1, na.rm = TRUE)
)]

## ----di-volcano, fig.cap="Differential inclusion volcano plot."---------------
plot_di_volcano_dt(res)

## ----matching-and-pairs-------------------------------------------------------
matched <- get_matched_events_chunked(
  res_di,
  annotation_df$annotations,
  chunk_size = 2000
)

hits_sequences <- attach_sequences(matched, annotation_df$sequences)
pairs <- get_pairs(hits_sequences, source = "multi")

pairs[, .(
  n_pairs = .N,
  n_events = data.table::uniqueN(event_id)
)]

## ----proximal-distal, fig.cap="Proximal versus distal terminal exon summary."----
proximal_output <- get_proximal_shift_from_hits(pairs)
proximal_output$plot

## ----sequence-compare---------------------------------------------------------
seq_compare <- compare_sequence_frame(pairs, annotation_df$annotations)

## ----alignment-summary, fig.cap="Protein alignment summary by event type."----
alignment_plot <- plot_alignment_summary(seq_compare)
alignment_plot

## ----length-summary, fig.cap="Case versus control isoform length comparison."----
length_plot <- plot_length_comparison(seq_compare)
length_plot

## ----domain-background-and-hits-----------------------------------------------
bg <- get_background(
  source = "annotated",
  annotations = annotation_df$annotations,
  protein_features = protein_feature_total,
  BPPARAM = BiocParallel::SerialParam()
)

hits_domain <- get_domains(seq_compare, exon_features)

hits_domain[, .(
  n_hits = .N,
  n_domain_changed = sum(diff_n > 0, na.rm = TRUE)
)]

## ----domain-enrichment, fig.cap="Top enriched InterPro domain differences."----
enriched_domains <- enrich_domains_hypergeo(
  hits = hits_domain,
  background = bg,
  db_filter = "interpro"
)

domain_plot <- plot_enriched_domains_counts(enriched_domains, top_n = 20)
domain_plot

## ----domain-enrichment-by-event-db--------------------------------------------
enriched_afe_interpro <- enrich_by_event(
  hits = hits_domain,
  background = bg,
  events = "AFE",
  db_filter = "interpro"
)

enriched_interpro_only <- enrich_by_db(
  hits = hits_domain,
  background = bg,
  dbs = "interpro"
)

list(
  by_event_rows = nrow(enriched_afe_interpro),
  by_db_rows = nrow(enriched_interpro_only)
)

## ----ppi-effects, fig.cap="Summary of predicted case/control PPI changes."----
ppi <- get_ppi_interactions()

hits_final <- get_ppi_switches(
  hits_domain,
  ppi,
  protein_feature_total
)

ppi_plot <- plot_ppi_summary(hits_final)
ppi_plot

## ----enrichment-foregrounds---------------------------------------------------
fg_di <- get_gene_enrichment(
  mode = "di",
  res = res,
  padj_threshold = 0.1,
  delta_psi_threshold = 0.1
)

fg_domain <- get_gene_enrichment(mode = "domain", hits = hits_domain)
fg_ppi <- get_gene_enrichment(mode = "ppi", hits = hits_final)

lengths(list(di = fg_di, domain = fg_domain, ppi = fg_ppi))
intersect(fg_di, fg_ppi)

## ----run-get-enrichment-------------------------------------------------------
needed <- c("clusterProfiler", "msigdbr", "org.Hs.eg.db")
has_needed <- all(vapply(
  needed,
  requireNamespace,
  FUN.VALUE = logical(1),
  quietly = TRUE
))

if (has_needed) {
  enrichment_di <- get_enrichment(
    foreground = fg_domain,
    background = bg$gene_id,
    species = "human",
    gene_id_type = "ensembl",
    sources = "GO:BP",
    p_adjust_cutoff = 1, ## CHANGE FOR NON-TOY DATA USAGE, eg: 0.05
    top_n_plot = 1 ## CHANGE FOR NON-TOY DATA USAGE, eg: NULL or 10
  )

  enrichment_di$plot
} else {
  message(
    "Skipping get_enrichment(): install ",
    paste(needed, collapse = ", ")
  )
}

## ----transcript-protein-views, fig.cap="Transcript and protein views for one paired event."----
viz_pair <- hits_final[
  !is.na(transcript_id_case) &
    !is.na(transcript_id_control) &
    transcript_id_case != "" &
    transcript_id_control != ""
][1]

if (nrow(viz_pair) == 0) {
  stop("No transcript pairs available for visualization.")
}

tx_pair <- c(viz_pair$transcript_id_case, viz_pair$transcript_id_control)

transcript_centric <- plot_two_transcripts_with_domains_unified(
  transcripts = tx_pair,
  gtf_df = annotation_df$annotations,
  protein_features = protein_feature_total,
  feature_db = c("interpro"),
  combine_domains = TRUE,
  view = "transcript"
)

protein_centric <- plot_two_transcripts_with_domains_unified(
  transcripts = tx_pair,
  gtf_df = annotation_df$annotations,
  protein_features = protein_feature_total,
  feature_db = c("interpro"),
  combine_domains = TRUE,
  view = "protein"
)

transcript_centric
protein_centric

## ----probe-event, fig.cap="PSI distribution for one selected event."----------
event_to_probe <- res$event_id[1]
probe <- probe_individual_event(data, event = event_to_probe)
probe$plot

## ----probe-table--------------------------------------------------------------
head(probe$data)

## ----integrated-summary, fig.cap="Integrated multi-layer summary.", fig.width=14, fig.height=10----
int_summary <- integrated_event_summary(hits_final, res)
int_summary$plot

## ----integrated-summary-table-------------------------------------------------
int_summary$summaries$relative_use

## ----s4-init------------------------------------------------------------------
obj <- as_splice_impact_result(
  data = data,
  res = res,
  res_di = res_di,
  matched = hits_sequences,
  sample_frame = sample_frame,
  hits_final = hits_final
)

## ----s4-accessors-------------------------------------------------------------
raw_dt <- as_dt_from_s4(obj, "raw_events")
di_dt <- as_dt_from_s4(obj, "di_events")
hits_dt <- as_dt_from_s4(obj, "paired_hits")

hits_core <- get_hits_core(obj)
hits_domain_view <- get_hits_domain(obj)
hits_ppi_view <- get_hits_ppi(obj)
hits_sequence_view <- get_hits_sequence(obj)

c(
  raw_rows = nrow(raw_dt),
  di_rows = nrow(di_dt),
  hits_rows = nrow(hits_dt),
  hits_core_rows = nrow(hits_core),
  hits_domain_rows = nrow(hits_domain_view),
  hits_ppi_rows = nrow(hits_ppi_view),
  hits_sequence_rows = nrow(hits_sequence_view)
)

## ----s4-filtering-------------------------------------------------------------
obj_focus <- filter_spliceimpact_hits(
  obj,
  diff_n > 0,
  n_ppi > 0
)

focus_hits <- as_dt_from_s4(obj_focus, "paired_hits")
focus_hits[, .N, by = event_type][order(-N)]

## ----s4-enrichment------------------------------------------------------------
fg_di_s4 <- get_gene_enrichment(
  mode = "di",
  x = obj,
  padj_threshold = 0.05,
  delta_psi_threshold = 0.1
)

fg_ppi_s4 <- get_gene_enrichment(mode = "ppi", x = obj)

fg_ppi_focus <- get_gene_enrichment(mode = "ppi", x = obj_focus)
list(
  di_equal_table = identical(sort(fg_di), sort(fg_di_s4)),
  ppi_equal_table = identical(sort(fg_ppi), sort(fg_ppi_s4)),
  focused_ppi_genes = head(fg_ppi_focus)
)

## ----s4-schema----------------------------------------------------------------
spliceimpact_s4_schema()

## ----s4-main-workflow-code-only, eval=FALSE-----------------------------------
# obj_flow <- as_splice_impact_result(
#   data = data,
#   sample_frame = sample_frame
# )
# 
# obj_flow <- get_differential_inclusion(
#   obj_flow,
#   min_total_reads = 10,
#   parallel_glm = TRUE,
#   BPPARAM = BiocParallel::SerialParam(),
#   return_class = "S4"
# )
# obj_flow <- keep_sig_pairs(obj_flow, return_class = "S4")
# 
# obj_flow <- get_matched_events_chunked(
#   obj_flow,
#   annotation_df$annotations,
#   chunk_size = 2000,
#   return_class = "S4"
# )
# obj_flow <- attach_sequences(
#   obj_flow,
#   annotation_df$sequences,
#   return_class = "S4"
# )
# obj_flow <- get_pairs(obj_flow, source = "multi", return_class = "S4")
# obj_flow <- compare_sequence_frame(
#   obj_flow,
#   annotation_df$annotations,
#   return_class = "S4"
# )
# obj_flow <- get_domains(obj_flow, exon_features, return_class = "S4")
# obj_flow <- get_ppi_switches(
#   obj_flow,
#   ppi,
#   protein_feature_total,
#   return_class = "S4"
# )
# 
# hits_core_flow <- get_hits_core(obj_flow)
# hits_domain_flow <- get_hits_domain(obj_flow)
# hits_ppi_flow <- get_hits_ppi(obj_flow)
# hits_sequence_flow <- get_hits_sequence(obj_flow)

## ----custom-manual-features---------------------------------------------------
ann_dt <- data.table::as.data.table(annotation_df$annotations)
coding_tx <- unique(ann_dt[type == "exon" & cds_has == TRUE, transcript_id])
n_manual <- min(3L, length(coding_tx))

if (n_manual < 1L) {
  stop("No coding transcripts found for manual feature example.")
}

manual_df <- data.frame(
  ensembl_transcript_id = coding_tx[seq_len(n_manual)],
  ensembl_peptide_id = rep(NA_character_, n_manual),
  name = paste0("demo_feature_", seq_len(n_manual)),
  start = c(20L, 45L, 80L)[seq_len(n_manual)],
  stop = c(35L, 58L, 92L)[seq_len(n_manual)],
  database = rep("manual", n_manual),
  alt_name = rep(NA_character_, n_manual),
  feature_id = rep(NA_character_, n_manual),
  stringsAsFactors = FALSE
)

manual_features <- get_manual_features(
  manual_features = manual_df,
  gtf_df = annotation_df$annotations
)

head(manual_features)

## ----custom-pre-di------------------------------------------------------------
example_df <- data.frame(
  event_id = rep("A3SS:1", 8),
  event_type = "A3SS",
  form = rep(c("INC", "EXC"), each = 4),
  gene_id = "ENSG00000158286",
  chr = "chrX",
  strand = "-",
  inc = c(rep("149608626-149608834", 4), rep("149608626-149608829", 4)),
  exc = c(rep("", 4), rep("149608830-149608834", 4)),
  inclusion_reads = c(30, 32, 29, 31, 2, 3, 4, 3),
  exclusion_reads = c(1, 1, 2, 1, 28, 27, 26, 30),
  sample = c("S1", "S2", "S3", "S4", "S1", "S2", "S3", "S4"),
  condition = rep(c("case", "case", "control", "control"), 2),
  stringsAsFactors = FALSE
)
example_df$psi <- example_df$inclusion_reads /
  (example_df$inclusion_reads + example_df$exclusion_reads)

user_data <- get_user_data(example_df)
head(user_data)

## ----custom-post-di-----------------------------------------------------------
example_user_data <- data.frame(
  event_id = rep("A3SS:1", 8),
  event_type = "A3SS",
  gene_id = "ENSG00000158286",
  chr = "chrX",
  strand = "-",
  form = rep(c("INC", "EXC"), each = 4),
  inc = c(rep("149608626-149608834", 4), rep("149608626-149608829", 4)),
  exc = c(rep("", 4), rep("149608830-149608834", 4)),
  stringsAsFactors = FALSE
)

user_res <- get_user_data_post_di(example_user_data)
head(user_res)

## ----custom-rmats-post-di-----------------------------------------------------
rmats_df <- data.frame(
  ID = 1L,
  GeneID = "ENSG00000182871",
  geneSymbol = "COL18A1",
  chr = "chr21",
  strand = "+",
  longExonStart_0base = 45505834L,
  longExonEnd = 45505966L,
  shortES = 45505837L,
  shortEE = 45505966L,
  flankingES = 45505357L,
  flankingEE = 45505431L,
  ID.2 = 2L,
  IJC_SAMPLE_1 = "1,1,1",
  SJC_SAMPLE_1 = "1,1,1",
  IJC_SAMPLE_2 = "1,1,1",
  SJC_SAMPLE_2 = "1,1,1",
  IncFormLen = 52L,
  SkipFormLen = 49L,
  PValue = 0.6967562,
  FDR = 1,
  IncLevel1 = "0.0,0.0,0.0",
  IncLevel2 = "1.0,1.0,1.0",
  IncLevelDifference = 1.0,
  stringsAsFactors = FALSE
)

user_res_rmats <- get_rmats_post_di(rmats_df, event_type = "A3SS")
head(user_res_rmats)

## ----custom-transcript-pairs--------------------------------------------------
tx_ids <- unique(annotation_df$annotations$transcript_id)
tx_ids <- tx_ids[!is.na(tx_ids) & tx_ids != ""]

if (length(tx_ids) < 4L) {
  stop("Need at least four transcripts for transcript-pair example.")
}

transcript_pairs <- data.frame(
  transcript1 = tx_ids[1:2],
  transcript2 = tx_ids[3:4],
  stringsAsFactors = FALSE
)

user_matched <- compare_transcript_pairs(
  transcript_pairs = transcript_pairs,
  annotations = annotation_df$annotations
)

head(user_matched)

## -----------------------------------------------------------------------------
sessionInfo()