---
title: "Preprocessing Short-Read Sequencing Files for Isoformic"
date: 2025-10-02
date-modified: 2025-11-11
date-format: long
author:
  - name: Lucio Rezende Queiroz
  - name: Izabela Mamede Conceição
vignette: |
  %\VignetteIndexEntry{Preprocessing Sequencing Files}
  %\VignetteEngine{quarto::html}
  %\VignetteEncoding{UTF-8}
knitr:
  opts_chunk:
    collapse: true
    comment: "#>"
---

```{r}
#| label: setup
#| include: false
knitr::opts_chunk$set(echo = TRUE)
```

##  Salmon: command command line interface anatomy

### From the Terminal - Salmon Command Line Interface

Below is a single-sample invocation (to be run in a Bash shell) showing the
typical arguments we used when making the mini dataset used in this vignette.

Replace the paths with yours if you want to reproduce.

More thorough information can be found on: <https://combine-lab.github.io/salmon/>.

``` bash
# Not needed as of Salmon 1.10.3:
  # --validateMappings \               # selective-alignment (more specific, recommended)

salmon quant \
  --libType A \                      # A -> autodetect library type from the reads
  --index /path/to/index \           # Transcriptome index (here: index built from the FASTA annotation file)
  --mates1 /path/SRR..._1.fastq \    # R1 FASTQ
  --mates2 /path/SRR..._2.fastq \    # R2 FASTQ
  --output /path/SRR..._quant \      # Output folder (contains quant.sf, aux_info/)
  --threads 8 \                      # Number of parallel threads used
  --disableChainingHeuristic \       # improve mapping sensitivity, but slower
  --allowDovetail \
  --softclipOverhangs \              # softclip reads that overhang transcripts start and end
  --dumpEq \
  --dumpEqWeights \                  # dump equivalent class information, needed by Terminus
  --posBias --seqBias --gcBias \     # bias corrections (positional, sequence, GC)
  --numGibbsSamples 100              # draw inferential replicates; required for fishpond/Swish
```

### From R - `condathis` wrapper for Salmon

Caveats: currently [Salmon](https://anaconda.org/bioconda/salmon) is only available for macOS and Linux on the [Bioconda](https://bioconda.github.io/) channel.

Therefore, this example will not work natively on Windows.
But you can use WSL2 (Windows Subsystem for Linux) on Windows 10/11 to run it.

```{r}
#| eval: false
library(isoformic)
tx_fasta_path <- download_reference(version = "49", file_type = "fasta")
genome_fasta_path <- download_reference(version = "49", file_type = "genome_fasta")
gff_path <- download_reference(version = "49", file_type = "gff")

base_dir <- fs::path("data-raw", "vignette_data")

index_path <- fs::path(base_dir, "salmon_index_k15")
salmon_index(
  fasta_path = tx_fasta_path,
  index_path = index_path,
  kmer_len = 15,
  num_threads = 8,
  is_gencode = TRUE,
  decoy_fasta = genome_fasta_path
)

fastq_files <- fs::dir_ls("data-raw/mini_data/sub_fastq/")

sample_names <- fs::path_file(fastq_files) |>
  stringr::str_remove("_sub_[1-2].fastq.gz$") |>
  unique() |>
  sort()

for (i in seq_along(sample_names)) {
  sample_name <- sample_names[i]

  salmon_output <- fs::path(base_dir, "salmon_quants", paste0(sample_name, "_quant"))
  fastq_r1 <- fs::dir_ls("data-raw/mini_data/sub_fastq/", regexp = paste0(sample_name, "_sub_1.fastq.gz$"))
  fastq_r2 <- fs::dir_ls("data-raw/mini_data/sub_fastq/", regexp = paste0(sample_name, "_sub_2.fastq.gz$"))
  # fs::file_exists(fastq_r1)
  # fs::file_exists(fastq_r2)

  salmon_quant(
    index_path = index_path,
    input_r1 = fastq_r1,
    input_r2 = fastq_r2,
    output_dir = salmon_output,
    num_threads = 8,
    num_gibbs = 20,
    min_score_fraction = "0.65"
  )
}
```

## Load Data and Create IsoformicExperiment Object

```{r}
suppressPackageStartupMessages({
  library(tximeta)
  library(SummarizedExperiment)
  library(fishpond)
  library(DESeq2)
})

library(isoformic)

# Where extdata lives
extdata_dir <- system.file("extdata", package = "isoformic")

quants_dir <- file.path(extdata_dir, "mini_quants")

gff_path <- download_reference(version = "49", file_type = "gff")
```

## Load Salmon Quantification with tximport

Point to Salmon's `quant.sf` output files.

```{r}
# discover sample folders like SRR11498039_quant, etc.
sample_dirs <- list.dirs(quants_dir, full.names = FALSE, recursive = FALSE)

# keep only *_quant and strip the suffix for sample names
sample_names <- sub("_quant$", "", sample_dirs)
stopifnot(length(sample_names) == 6)

# define groups: "control" = first 3, "treatment" = last 3 (sorted by sample ID)
sample_names <- sort(sample_names)
condition <- c(rep("control", 3), rep("treatment", 3))

quant_files <- file.path(quants_dir, paste0(sample_names, "_quant"), "quant.sf")
names(quant_files) <- sample_names
stopifnot(all(file.exists(quant_files)))

col_data <- tibble::tibble(
  files = quant_files,
  names = sample_names,
  condition = factor(condition, levels = c("control", "treatment"))
)
col_data
```

Import transcriptomics data with tximport.

```{r}
se_tx <- tximeta::tximeta(
  coldata = col_data,
  type = "salmon",
  txOut = TRUE,
  skipMeta = TRUE,
  useHub = FALSE,
  ignoreAfterBar = TRUE,
  skipSeqinfo = TRUE
)

mini_inf_reps <- readRDS(system.file("extdata/mini_inf_reps.rds", package = "isoformic"))
assays(se_tx) <- c(assays(se_tx), mini_inf_reps)
```

Convert to IsoformicExperiment object.

```{r}
iso <- as_isoformic(se_tx, gff_path)
iso
```

```{r}
tx_to_gene_df <- tx_to_gene(iso)
head(tx_to_gene_df)

# rowData(se_tx) <- row_data(iso)
iso
```

Perform transcript-level Differential Expression Analysis with fishpond.

```{r}
y <- se_tx
y <- fishpond::scaleInfReps(y)

y <- fishpond::labelKeep(y, minCount = 10, minN = 3)
# y <- y[mcols(y)$keep, ]

colData(y)$condition <- factor(col_data$condition, levels = c("control", "treatment"))
# y$condition <- factor(y$condition, levels = c("control", "treatment"))
colData(y)

# run Swish
y <- fishpond::swish(y, x = "condition")

dea_tx_res <- mcols(y) |>
  base::as.data.frame() |>
  tibble::rownames_to_column("transcript_id") |>
  tibble::as_tibble()

iso@dea$det <- dea_tx_res

de_tx(iso)
```

Summarize Transcript SummarizedExperiment to Gene level.

```{r}
se_gene <- tximeta::summarizeToGene(
  se_tx,
  tx2gene = dplyr::select(row_data(iso), transcript_id, gene_id),
  skipRanges = TRUE
)
```

Perform Gene level Differential Expression with DESeq2.

```{r}
colData(se_gene)$condition <- factor(colData(se_gene)$condition, levels = c("control", "treatment"))

assays(se_gene)$counts <- round(assays(se_gene)$counts)

dds <- DESeq2::DESeqDataSet(se_gene, design = ~condition)
dds <- dds[rowSums(DESeq2::counts(dds) >= 10) >= 3, ]
dds <- DESeq2::DESeq(dds)

res <- DESeq2::results(dds, contrast = c("condition", "treatment", "control"))

# shrink for stability on small example
if (rlang::is_installed("apeglm")) {
  message("apeglm is installed")
  shrink_type <- "apeglm"
} else {
  message("apeglm is NOT installed")
  shrink_type <- "normal"
}

res_shr <- DESeq2::lfcShrink(
  dds,
  coef = "condition_treatment_vs_control",
  res = res,
  type = shrink_type
)

# res
# res_shr
dea_gene_res <- base::as.data.frame(res_shr) |>
  tibble::rownames_to_column("gene_id") |>
  tibble::as_tibble() |>
  dplyr::rename(
    log2FC = log2FoldChange,
    pvalue = pvalue,
    qvalue = padj
  )


iso@dea$deg <- dea_gene_res

de_gene(iso)
```

Plot examples.

```{r}
top_de_gene <- de_gene(iso) |>
  dplyr::arrange(qvalue) |>
  dplyr::slice_head(n = 1) |>
  dplyr::pull(gene_id)

plot_log2fc(
  iso,
  feature = top_de_gene,
  feature_column = "gene_id"
)
```

By gene name

```{r}
gene_to_plot <- row_data(iso) |>
  dplyr::filter(gene_id %in% top_de_gene) |>
  dplyr::slice_head(n = 1) |>
  dplyr::pull(gene_name)

plot_log2fc(
  iso,
  feature = gene_to_plot,
  feature_column = "gene_name"
)
```

Top DE transcripts

```{r}
row_data(iso) |>
  dplyr::filter(gene_name %in% "MTG2")

plot_log2fc(
  iso,
  feature = "MTG2",
  feature_column = "gene_name"
)
```