## ----load_libraries-----------------------------------------------------------
suppressPackageStartupMessages({
library("QFeatures")  
library("dplyr") 
library("tidyr")
library("ggplot2")
library("msqrob2")    
library("stringr")
library("ExploreModelMatrix")
library("kableExtra")
library("ComplexHeatmap")
library("scater")
})

## ----file_location------------------------------------------------------------
precursorFile <- "https://github.com/statOmics/msqrob2data/raw/refs/heads/main/dia/spikeinStaes/spikein248-staesetal2024.parquet" 

## ----import_data--------------------------------------------------------------
precursors <- arrow::read_parquet(precursorFile) # function from the arrow package
## For older versions of Spectronaut, where the results are stored as tsv files. 
## Note that the precursorFile then would point to "report.tsv" 
## precursors <- data.table::fread(precursorFile, check.names = TRUE, integer64 = "double") 
knitr::kable(head(precursors))

## ----select_variables---------------------------------------------------------
precursors <- precursors |> 
  select(
         Run, 
         Precursor.Id, 
         Modified.Sequence, 
         Stripped.Sequence, 
         Precursor.Charge, 
         Protein.Group, 
         Protein.Names,
         Protein.Ids,
         Genes,
         Precursor.Quantity,
         Precursor.Normalised,
         Normalisation.Factor,
         Ms1.Area,
         Ms1.Normalised,
         PG.MaxLFQ,
         Q.Value, 
         Lib.Q.Value,
         PG.Q.Value,
         Lib.PG.Q.Value,
         Proteotypic,
         Decoy, # Not available in older versions of DIA-NN
         RT)

## ----add_species_info---------------------------------------------------------
precursors <- precursors |> 
  mutate(species = grepl(pattern = "UPS",Protein.Group) |> 
           as.factor() |>
           recode("TRUE"="ups","FALSE" = "yeast"))
precursors |> 
  pull(species) |>
  table()

## ----check_intensity_distribution---------------------------------------------
precursors |> 
  ggplot(aes(x = log2(Precursor.Quantity))) +
  geom_density() +
  theme_minimal()

## ----create_metadata----------------------------------------------------------
(annot <- data.frame(runCol = precursors |> 
                       pull(Run) |> 
                       unique() # 1. 
                   ) |>
    mutate(sampleId = gsub(x = runCol, pattern = "_DIA", replacement = "") |> #2.a
               str_split("UPS2_") |> #2.b
               sapply(`[`, 2) #2.c
           ) |>
    mutate(
           condition =  gsub("ratio", "", sampleId) |> #3.a
               str_split("_") |> #3.b
               sapply(`[`, 1) |> #3.c
               as.numeric() |> #3.d
               as.factor(), #3.e
           rep = sampleId |> 
               str_split("_") |> #4.a
               sapply(`[`, 2) |> #4.b
               replace_na(replace = "1") |> #4.c
               as.factor(), #4.d
           ratio = condition |> 
             as.character() |> 
             as.double()
           )
)

## ----convert_to_QFeatures-----------------------------------------------------
(qf <- readQFeatures(assayData = precursors,  
                              colData = annot,
                              quantCols = "Precursor.Quantity",
                              runCol = "Run",
                              fnames = "Precursor.Id"))

## ----convert0_to_NA-----------------------------------------------------------
qf <- zeroIsNA(qf, names(qf))

## ----qvalue_filtering---------------------------------------------------------
qf <-  qf |>
  filterFeatures(~ Q.Value <= 0.01 & #1.
                     PG.Q.Value <= 0.01 &  #1.
                     Lib.Q.Value <= 0.01 & #1.
                     Lib.PG.Q.Value <= 0.01 & #1.
                     Precursor.Id != "" & #2.
                     Decoy == 0 & #3. (Note, that this filter is not available with previous versions of DIA-NN, as the report.tsv file did not include a Decoy column. So other strategies are needed if Decoys are in the output file.)
                     Proteotypic == 1) #4.

## ----assay_joining------------------------------------------------------------
(qf <- joinAssays(
  x = qf,
  i = names(qf),
  fcol = "Precursor.Id",
  name = "precursors"
  ))

## ----filter_features_with_many_NA---------------------------------------------
nObs <- 4
n <- ncol(qf[["precursors"]])

(qf <- filterNA(qf, i = "precursors", pNA = (n - nObs) / n))

## ----filter_one_hit_wonders---------------------------------------------------
# Filter for peptides per protein
pepsPerProtDf <- qf[["precursors"]] |> 
    rowData() |> 
    data.frame() |>
    dplyr::select("Stripped.Sequence", "Protein.Group") |>
    group_by(Protein.Group) |>
    mutate(pepsPerProt = Stripped.Sequence |>
               unique() |> length()
           ) #1.

rowData(qf[["precursors"]])$pepsPerProt <- pepsPerProtDf$pepsPerProt #2. 

qf <- filterFeatures(qf, 
                              ~ pepsPerProt > 1, 
                              keep = TRUE) #3.

## ----log_transformation-------------------------------------------------------
qf <- logTransform(qf, 
                            base = 2, 
                            i = "precursors", 
                            name = "precursors_log")

## ----normalisation_needed-----------------------------------------------------
qf[, , "precursors_log"] |> #1.
  longForm(colvars = c( "rep", "condition")) |>  #2. 
  data.frame() |> 
  filter(!is.na(value)) |>
  ggplot() + #3.
  aes(x = value,
      colour = condition,
      group = colname) +
  geom_density() +
  theme_minimal()

## ----normalize_peptides-------------------------------------------------------
qf <- sweep( #4. Subtract log2 norm factor column-by-column (MARGIN = 2)
  qf,
  MARGIN = 2,
  STATS = nfLogMedian(qf,"precursors_log"),
  i = "precursors_log",
  name = "precursors_norm"
)

## ----assess_normalisation-----------------------------------------------------
qf[, , "precursors_norm"] |>
  longForm(colvars = colnames(colData(qf))) |> 
  data.frame() |> 
  filter(!is.na(value)) |>
  ggplot() + 
  aes(x = value,
      colour = condition,
      group = colname) +
  geom_density() +
  labs(subtitle = "Normalised log2 precursor intensities") +
  theme_minimal()

## ----summarize_precursors_to_proteins, warning=FALSE--------------------------
(qf <- aggregateFeatures(
  qf, i = "precursors_norm", 
  name = "proteins",
  fcol = "Protein.Group", 
  fun = function(X) iq::maxLFQ(X)$estimate
  #fun = MsCoreUtils::medianPolish, na.rm = TRUE
))

## ----qc_normalisation_peptides------------------------------------------------
qf[, , "precursors_norm"] |>
  longForm(colvars = colnames(colData(qf))) |> 
  data.frame() |> 
  filter(!is.na(value)) |>
  ggplot() + 
  aes(x = value,
      colour = condition,
      group = colname) +
  geom_density() +
  theme_minimal() +
   labs(subtitle = "Normalised log2 precursor intensities")

## ----qc_normalisation_peptides_boxplot----------------------------------------
qf[, , "precursors_norm"] |>
  longForm(colvars = colnames(colData(qf))) |> 
  data.frame() |> 
  filter(!is.na(value)) |>
  ggplot() + 
  aes(x = sampleId,
      y = value,
      colour = condition,
      group = colname) +
  xlab("sample") +
  geom_boxplot() +
  theme_minimal() +
   labs(subtitle = "Normalised log2 precursor intensities")

## ----qc_normalisation_proteins------------------------------------------------
qf[, , "proteins"] |>
  longForm(colvars = colnames(colData(qf))) |> 
  data.frame() |> 
  filter(!is.na(value)) |>
  ggplot() + 
  aes(x = value,
      colour = condition,
      group = colname) +
  geom_density() +
  theme_minimal() +
   labs(subtitle = "Normalised log2 protein intensities")

## ----qc_normalisation_proteins_boxplot----------------------------------------
qf[, , "proteins"] |>
  longForm(colvars = colnames(colData(qf))) |> 
  data.frame() |> 
  filter(!is.na(value)) |>
  ggplot() + 
  aes(x = sampleId,
      y = value,
      colour = condition,
      group = colname) +
  xlab("sample") +
  geom_boxplot() +
  theme_minimal() +
   labs(subtitle = "Normalised log2 protein intensities")

## ----qc_charge_state----------------------------------------------------------
qf[, , "precursors_norm"] |>
  longForm(colvars = colnames(colData(qf)), rowvars = "Precursor.Charge") |>
  as.data.frame() |>
  filter(!is.na(value)) |>
  filter(Precursor.Charge<=4) |> 
ggplot(aes(x = sampleId)) +
  geom_bar(aes(fill = factor(Precursor.Charge, levels = 4:1)),
           colour = "black") +
  labs(title = "Peptide types per sample",
       x = "Sample",
       fill = "Charge state") +
  theme_bw()

## ----qc_identifications-------------------------------------------------------
qf[,,"precursors_norm"] |> 
  longForm(colvars = colnames(colData(qf)), 
           rowvars= c("Precursor.Id", 
                      "Protein.Group", 
                      "Stripped.Sequence",
                      "Precursor.Charge")) |>
  data.frame() |>
  filter(!is.na(value)) |>
  mutate(cond_rep = paste0("ratio", condition, "_", rep)) |>
  group_by(condition, cond_rep) |>
  summarise(Precursors = length(unique(Precursor.Id)),
            `Protein Groups` = length(unique(Protein.Group))) |> 
  pivot_longer(-(1:2),
               names_to = "Feature",
               values_to = "IDs") |> 
  ggplot(aes(x = cond_rep, y = IDs, fill = condition)) +
  geom_col() +
  #scale_fill_observable() +
  facet_wrap(~Feature,
             scales = "free_y") +
  labs(title = "Precursor and protein group identificiations per sample",
       x = "Sample",
       y = "Identifications") +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 90))

## ----qc_mds_proteins----------------------------------------------------------
getWithColData(qf, "proteins") |> 
  as("SingleCellExperiment") |> 
  runMDS(exprs_values = 1) |> 
  plotMDS(colour_by = "condition", shape_by = "rep", point_size = 3) 

## ----qc_correlation_proteins--------------------------------------------------
corMat <- qf[["proteins"]] |> 
  assay() |>
  cor(method = "spearman", use = "pairwise.complete.obs") 
 
colnames(corMat) <- qf$sampleId
rownames(corMat) <- qf$sampleId
corMat |> 
  ggcorrplot::ggcorrplot() +
  scale_fill_viridis_c() +
  labs(title = "Correlation matrix",
       fill = "Correlation") +
  theme(axis.text.x = element_text(angle = 90))

## ----define_model-------------------------------------------------------------
model <- ~ condition

## ----fit_models, warning=FALSE------------------------------------------------
  qf <- msqrob(
    qf,  
    i = "proteins",
    formula = model,
    robust = TRUE)

## ----print_models-------------------------------------------------------------
models <- rowData(qf[["proteins"]])[["msqrobModels"]]
models[1:3]

## ----explore_model_matrix-----------------------------------------------------
vd <- ExploreModelMatrix::VisualizeDesign(
    sampleData =  colData(qf),
    designFormula = model,
    textSizeFitted = 4
)
vd$plotlist

## ----define_H0----------------------------------------------------------------
(allHypotheses <- createPairwiseContrasts(
  model, colData(qf), "condition"
  )
)

## ----make_contrasts-----------------------------------------------------------
(L <- makeContrast(
  allHypotheses,
  parameterNames = colnames(vd$designmatrix)
))

## ----hypthesis_tests----------------------------------------------------------
qf <- hypothesisTest(qf, i = "proteins", contrast = L)

## ----show_tables_qf-----------------------------------------------------------
qf[["proteins"]] |> 
  rowData() |> 
  names()

## ----collect_results----------------------------------------------------------
inferences <- 
  msqrobCollect(qf[["proteins"]], L) 
head(inferences)

## ----add_species_info_results-------------------------------------------------
inferences <- inferences |> 
    mutate(species =  rep(rowData(qf[["proteins"]])$species, ncol(L)))

## ----significant_tables, results='asis'---------------------------------------
alpha <- 0.05 #1.
for (contrasti in colnames(L)) #2. 
{
  sigList <- inferences |> 
    filter(contrast == contrasti & adjPval < alpha) #3.
  cat("**Contrast:**", contrasti, "= 0 (", nrow(sigList), " significant proteins)\n\n") #4.
  print(kable(sigList) |>
      kable_styling(full_width = FALSE) |>
      scroll_box(height = "250px")
      ) #4.
  cat("\n\n\n---\n\n") #4.
}

## ----volcanoplots-------------------------------------------------------------
volcanoplots <- inferences |> 
  plotVolcano() +
  facet_wrap(~contrast)
volcanoplots

## ----heatmaps-----------------------------------------------------------------
heatmaps <- lapply(
  colnames(L), #1.
  function(contrast, se, alpha)
    {
    sig <- rowData(se)[[contrast]] |> #2.
      filter(adjPval < alpha) |> 
      rownames()
    
    quants <- t(scale(t(assay(se[sig,])))) #3
    
    colnames(quants) <- paste0("con", se$condition,"_rep",se$rep) #4.
    annotations <- columnAnnotation(condition = se$condition) #4.
    set.seed(1234) ## annotation colours are randomly generated by default
    return( #.5
                       Heatmap(
                       quants, 
                       name = "log2 intensity",
                       top_annotation = annotations, 
                       column_title = paste0(contrast, " = 0")
                       )
                     )
                   },
                   se = getWithColData(qf, "proteins"), #6.
                   alpha = alpha) #7.
heatmaps

## ----define_alpha-------------------------------------------------------------
alpha <- 0.05

## ----real_logFC---------------------------------------------------------------
(realLogFC <- data.frame(
    logFC = colData(qf)$condition |> 
        levels() |> # 1. 
        as.double() |> # 2. 
        log2() |> # 3. 
        combn(m=2, FUN = diff), #4.
    contrast = colData(qf)$condition |> 
        levels() |> 
        combn(m=2, FUN= function(x) paste0(x[2]," - ",x[1])) |>
        recode("4 - 2" = "condition4", 
               "8 - 2" = "condition8",
               "8 - 4" = "condition8 - condition4")
    )
)

## ----assess_logFC-------------------------------------------------------------
logFC <- inferences |> 
  filter(!is.na(logFC)) |>
  ggplot(aes(x = species, y = logFC, color= species)) + #1.
  geom_boxplot() + #2.
  theme_bw() + 
  scale_color_manual( 
    values = c("grey20", "firebrick"), #3. 
    name = "",
    labels = c("yeast", "ups")
    ) + 
  geom_hline(yintercept = 0, color="grey20") + # 4. 
  facet_wrap(~contrast) +
  geom_hline(aes(yintercept = logFC), color="firebrick", data=realLogFC) #5. 
logFC

## ----assess_TP_FP_FDP---------------------------------------------------------
tpFpTable <- group_by(inferences, contrast) |>
    filter(adjPval < alpha) |>
    summarise("TP" = sum(species == "ups"),
              "FP" = sum(species != "ups"),
              "FDP" = mean(species != "ups")) 
tpFpTable

## ----FDR_TPR_functions--------------------------------------------------------
computeFDP <- function(pval, tp) {
    ord <- order(pval)
    fdp <- cumsum(!tp[ord]) / 1:length(tp)
    fdp[order(ord)]
}
computeTPR <- function(pval, tp, nTP = NULL) {
    if (is.null(nTP)) nTP <- sum(tp)
    ord <- order(pval)
    tpr <- cumsum(tp[ord]) / nTP
    tpr[order(ord)]
}

## ----calculate_performance----------------------------------------------------
performance <- inferences |> group_by(contrast) |> 
    na.exclude() |>
    mutate(tpr = computeTPR(pval, species == "ups"),
           fdp = computeFDP(pval, species == "ups")) |>
    arrange(fdp)

## ----calculate_workpoints-----------------------------------------------------
workPoints <- performance |> 
  group_by(contrast) |>
    filter(adjPval < 0.05) |>
    slice_max(pval)

## ----plot_tpr_fdp-------------------------------------------------------------
ggplot(performance) +
    aes(
        y = fdp,
        x = tpr,
    ) +
    geom_line() +
    geom_point(data = workPoints, size = 3) +
    geom_hline(yintercept = 0.05, linetype = 2) +
    facet_wrap( ~ contrast) +
    coord_flip(ylim = c(0, 0.2)) +
    theme_minimal()

## ----volcanoplots_with_groundtruth--------------------------------------------
  inferences |> 
  plotVolcano() + 
  facet_wrap(~contrast) +
  aes(color = species, shape = adjPval < alpha) + 
    scale_color_manual(
      values = c("grey20", "firebrick"), 
      name = "species",
      labels = c("yeast", "ups")
      ) +
  labs(shape="FDR < 0.05") +
  geom_vline(aes(xintercept = logFC), data = realLogFC , col ="firebrick") +
  geom_hline(aes(yintercept = -log10(adjAlpha)), data = inferences |> 
               group_by(contrast) |> 
               summarize(adjAlpha = alpha *mean(adjPval < alpha, na.rm=TRUE), .groups="drop"))

## ----sessionInfo--------------------------------------------------------------
sessionInfo()