Intermediate ~35 minutes

RNA-seq Differential Expression

Analyze RNA-seq count data to identify differentially expressed genes between conditions. This tutorial covers normalization, statistical testing, visualization with volcano plots, and pathway enrichment analysis.

What you will learn

Loading and inspecting count matrices
CPM and TPM normalization
Running differential expression analysis
Multiple testing correction (FDR)
Creating volcano and MA plots
Gene set and pathway enrichment

Run this tutorial: Download rnaseq-de.bl and run it with bl run examples/tutorials/rnaseq-de.bl

Step 1 — Loading the Count Matrix

A count matrix has genes as rows and samples as columns. Each cell contains the number of reads mapped to that gene in that sample.

# requires: data/counts.csv in working directory
# rnaseq.bio — differential expression analysis

# Read the count matrix
let counts = read_csv("data/counts.csv")

print(f"Genes: {nrow(counts)}")
print(f"Samples: {ncol(counts) - 1}")  # minus the gene column
print(head(counts, 5))

# Define sample groups
let design = from_records([
  { sample: "ctrl_1", group: "control" },
  { sample: "ctrl_2", group: "control" },
  { sample: "ctrl_3", group: "control" },
  { sample: "treat_1", group: "treated" },
  { sample: "treat_2", group: "treated" },
  { sample: "treat_3", group: "treated" },
])

print("Design:")
print(design)

Step 2 — Exploratory Data Analysis

# Library sizes (total reads per sample)
let sample_cols = ["ctrl_1", "ctrl_2", "ctrl_3", "treat_1", "treat_2", "treat_3"]

let lib_sizes = sample_cols
  |> map(|s| { sample: s, total_reads: sum(col(counts, s)) })
  |> from_records()

print("\n=== Library Sizes ===")
print(lib_sizes)

# How many genes have zero counts in all samples?
let zero_genes = counts
  |> filter(|row| {
    sample_cols |> all(|s| row[s] == 0)
  })
print(f"Genes with zero counts in all samples: {nrow(zero_genes)}")

# Filter out very low-count genes
let min_count = 10
let min_samples = 3

let expressed = counts |> filter(|row| {
  let n_above = sample_cols |> count_if(|s| row[s] >= min_count)
  n_above >= min_samples
})
print(f"Genes after filtering: {nrow(expressed)}")

Step 3 — Normalization

# requires: data/gene_lengths.csv in working directory
# CPM normalization (Counts Per Million)
# Apply CPM to each sample column
let cpm_table = expressed
let total_reads = sample_cols |> map(|s| sum(col(expressed, s)))
for i in range(0, len(sample_cols)) {
  let s = sample_cols[i]
  let total = total_reads[i]
  cpm_table = mutate(cpm_table, s, |row| row[s] / total * 1_000_000.0)
}

print("CPM matrix:")
print(head(cpm_table, 3))

# TPM normalization (Transcripts Per Kilobase Million)
# Requires gene lengths
let gene_lengths = read_csv("data/expression.csv")  # gene, length

let with_lengths = inner_join(expressed, gene_lengths, "gene")
let lengths = col(with_lengths, "length")

let tpm_table = with_lengths
for s in sample_cols {
  tpm_table = mutate(tpm_table, s, |row| {
    let rpk = row[s] / (row.length / 1000.0)
    rpk
  })
}
# Normalize RPK values to per-million
for s in sample_cols {
  let total_rpk = sum(col(tpm_table, s))
  tpm_table = mutate(tpm_table, s, |row| row[s] / total_rpk * 1_000_000.0)
}

print("\nTPM matrix:")
print(head(tpm_table, 3))

# Log2 transform for visualization
let log_cpm = cpm_table
for s in sample_cols {
  log_cpm = mutate(log_cpm, s, |row| log2(row[s] + 1.0))
}

# Check the distribution
for s in sample_cols {
  let vals = col(log_cpm, s)
  print(f"{s}: mean={round(mean(vals), 2)}, median={round(median(vals), 2)}")
}

Step 4 — Differential Expression Testing

# Run differential expression analysis
# diff_expr takes the count table and the number of control columns
# Columns should be ordered: gene, control_1..N, treated_1..M
let de_results = diff_expr(expressed, 3)

print("DE results:")
print(head(de_results, 5))
# ┌────────┬──────────┬──────────┬──────────┬──────────┐
# │ gene   │ log2fc   │ pvalue   │ fdr      │ mean_expr│
# ├────────┼──────────┼──────────┼──────────┼──────────┤
# │ MYC    │    3.42  │ 1.2e-08  │ 5.6e-06  │   412.5  │
# │ TP53   │   -2.18  │ 3.4e-06  │ 8.1e-04  │   156.3  │
# │ ...    │    ...   │    ...   │    ...   │     ...  │

# Count significant genes
de_results |> filter(|row| row["fdr"] < 0.05 and abs(row["log2fc"]) > 1.0) |> into sig
sig |> filter(|row| row["log2fc"] > 0.0) |> into up
sig |> filter(|row| row["log2fc"] < 0.0) |> into down

print("\n=== Summary ===")
print(f"Significant (FDR<0.05, |log2FC|>1): {nrow(sig)}")
print(f"  Upregulated:   {nrow(up)}")
print(f"  Downregulated: {nrow(down)}")

Step 5 — Multiple Testing Correction

# BioLang supports multiple correction methods
let bh = de_results  # Already has FDR via Benjamini-Hochberg (default)

# Apply Bonferroni for comparison
let n_genes = nrow(de_results)
let pvals = col(de_results, "pvalue")
let bonf = mutate(de_results, "bonf_pvalue", |row| min(row.pvalue * n_genes, 1.0))

# Compare methods
let n_bh = nrow(filter(bh, |row| row["fdr"] < 0.05))
let n_bonf = nrow(filter(bonf, |row| row["bonf_pvalue"] < 0.05))

print("Significant genes (FDR<0.05):")
print(f"  Benjamini-Hochberg: {n_bh}")
print(f"  Bonferroni:         {n_bonf}")

# Q-value approach (same as BH FDR)
let de_with_q = mutate(de_results, "qvalue", |row| row.fdr)

Step 6 — Volcano Plot

# Create a volcano plot
let vp = volcano(de_results, {
  title: "Treated vs Control",
})

save_svg(vp, "results/volcano.svg")
print("Volcano plot saved")

# Volcano plot with custom options
let vp2 = volcano(de_results, {
  title:  "Treated vs Control (custom)",
  fc_threshold: 1.0,
  fdr_threshold: 0.05,
})

Step 7 — MA Plot

# MA plot: log2FC vs mean expression
let ma = ma_plot(de_results, {
  title: "MA Plot — Treated vs Control",
})

save_svg(ma, "results/ma_plot.svg")
print("MA plot saved")

Step 8 — Heatmap of Top Genes

# Select top 50 DE genes for heatmap
let top_sorted = de_results |> arrange("fdr") |> head(50)
let top_genes = col(top_sorted, "gene")

# Extract normalized expression for these genes
let heatmap_data = log_cpm
  |> filter(|row| row["gene"] in top_genes)
  |> select("gene", "ctrl_1", "ctrl_2", "ctrl_3", "treat_1", "treat_2", "treat_3")

# Z-score normalize each sample column for visualization
let z_scored = heatmap_data
for s in sample_cols {
  let m = mean(col(heatmap_data, s))
  let sd = stdev(col(heatmap_data, s))
  z_scored = mutate(z_scored, s, |row| (row[s] - m) / sd)
}

# Generate heatmap
let hm = heatmap(z_scored, {
  title: "Top 50 DE Genes (z-score)",
})

save_svg(hm, "results/heatmap.svg")
print("Heatmap saved to results/heatmap.svg")

Step 9 — Pathway Enrichment

# requires: internet connection
# Pathway enrichment using KEGG
let sig_genes = col(sig, "gene")

# Look up KEGG pathways for significant genes
let kegg_hits = map(sig_genes, |g| kegg_find("genes", g))

# GO annotations for significant genes
let go_hits = map(sig_genes, |g| go_annotations(g))

# Reactome pathways
let pathway_hits = map(sig_genes, |g| reactome_pathways(g))

print("\n=== Pathway Results ===")
print(f"Genes queried: {len(sig_genes)}")

# Collect results into a summary table
let kegg_summary = from_records(map(sig_genes, |g| {
  gene: g,
  kegg: kegg_find("genes", g),
}))

print(head(kegg_summary, 10))

# Visualize with a plot
let p = plot(kegg_summary, {
  title: "KEGG Pathway Enrichment",
})
save_svg(p, "results/enrichment.svg")

Step 10 — Complete Pipeline

# rnaseq_pipeline.bio — full DE analysis pipeline

fn run_de_analysis(counts_file, design_file, output_dir) {
  # Load data
  let counts = read_csv(counts_file)
  let design = read_csv(design_file)
  let samples = col(design, "sample")
  let n_ctrl = nrow(filter(design, |r| r["group"] == "control"))

  # Filter low-count genes
  let expressed = counts |> filter(|row|
    samples |> count_if(|s| row[s] >= 10) >= 3
  )

  # Differential expression
  let de = diff_expr(expressed, n_ctrl)

  # Write full results
  write_csv(de, f"{output_dir}/de_results.csv")

  # Significant genes
  let sig = de |> filter(|r| r["fdr"] < 0.05 and abs(r["log2fc"]) > 1.0)
  write_csv(sig, f"{output_dir}/significant_genes.csv")

  # Plots
  let vp = volcano(de)
  save_svg(vp, f"{output_dir}/volcano.svg")

  let ma = ma_plot(de)
  save_svg(ma, f"{output_dir}/ma_plot.svg")

  # Summary
  print("=== DE Analysis Complete ===")
  print(f"Total genes tested: {nrow(de)}")
  print(f"Significant (FDR<0.05, |FC|>2): {nrow(sig)}")
  let up_count = nrow(filter(sig, |r| r["log2fc"] > 0))
  let down_count = nrow(filter(sig, |r| r["log2fc"] < 0))
  print(f"  Up: {up_count}")
  print(f"  Down: {down_count}")
  print(f"Results saved to {output_dir}/")
}

run_de_analysis("data/counts.csv", "data/design.csv", "results")

Next Steps

Explore protein-level analysis in the Protein Analysis tutorial.