Intermediate ~35 minutes

RNA-seq Differential Expression

Analyze RNA-seq count data to identify differentially expressed genes between conditions. This tutorial covers normalization, statistical testing, visualization with volcano plots, and pathway enrichment analysis.

What you will learn

  • Loading and inspecting count matrices
  • CPM and TPM normalization
  • Running differential expression analysis
  • Multiple testing correction (FDR)
  • Creating volcano and MA plots
  • Gene set and pathway enrichment
Run this tutorial: Download rnaseq-de.bl and run it with bl run examples/tutorials/rnaseq-de.bl

Step 1 — Loading the Count Matrix

A count matrix has genes as rows and samples as columns. Each cell contains the number of reads mapped to that gene in that sample.

# requires: data/counts.csv in working directory
# rnaseq.bio — differential expression analysis

# Read the count matrix
let counts = csv("data/counts.csv")

print(f"Genes: {nrow(counts)}")
print(f"Samples: {ncol(counts) - 1}")  # minus the gene column
print(head(counts, 5))

# Define sample groups
let design = from_records([
  { sample: "ctrl_1", group: "control" },
  { sample: "ctrl_2", group: "control" },
  { sample: "ctrl_3", group: "control" },
  { sample: "treat_1", group: "treated" },
  { sample: "treat_2", group: "treated" },
  { sample: "treat_3", group: "treated" },
])

print("Design:")
print(design)

Step 2 — Exploratory Data Analysis

# Library sizes (total reads per sample)
let sample_cols = ["ctrl_1", "ctrl_2", "ctrl_3", "treat_1", "treat_2", "treat_3"]

let lib_sizes = sample_cols
  |> map(|s| { sample: s, total_reads: sum(col(counts, s)) })
  |> from_records()

print("\n=== Library Sizes ===")
print(lib_sizes)

# How many genes have zero counts in all samples?
let zero_genes = counts
  |> filter(|row| {
    sample_cols |> all(|s| row[s] == 0)
  })
print(f"Genes with zero counts in all samples: {nrow(zero_genes)}")

# Filter out very low-count genes
let min_count = 10
let min_samples = 3

let expressed = counts |> filter(|row| {
  let n_above = sample_cols |> count_if(|s| row[s] >= min_count)
  n_above >= min_samples
})
print(f"Genes after filtering: {nrow(expressed)}")

Step 3 — Normalization

# requires: data/gene_lengths.csv in working directory
# CPM normalization (Counts Per Million)
# Apply CPM to each sample column
let cpm_table = expressed
let total_reads = sample_cols |> map(|s| sum(col(expressed, s)))
for i in range(0, len(sample_cols)) {
  let s = sample_cols[i]
  let total = total_reads[i]
  cpm_table = mutate(cpm_table, s, |row| row[s] / total * 1_000_000.0)
}

print("CPM matrix:")
print(head(cpm_table, 3))

# TPM normalization (Transcripts Per Kilobase Million)
# Requires gene lengths
let gene_lengths = csv("data/gene_lengths.csv")  # gene, length

let with_lengths = inner_join(expressed, gene_lengths, "gene")
let lengths = col(with_lengths, "length")

let tpm_table = with_lengths
for s in sample_cols {
  tpm_table = mutate(tpm_table, s, |row| {
    let rpk = row[s] / (row.length / 1000.0)
    rpk
  })
}
# Normalize RPK values to per-million
for s in sample_cols {
  let total_rpk = sum(col(tpm_table, s))
  tpm_table = mutate(tpm_table, s, |row| row[s] / total_rpk * 1_000_000.0)
}

print("\nTPM matrix:")
print(head(tpm_table, 3))

# Log2 transform for visualization
let log_cpm = cpm_table
for s in sample_cols {
  log_cpm = mutate(log_cpm, s, |row| log2(row[s] + 1.0))
}

# Check the distribution
for s in sample_cols {
  let vals = col(log_cpm, s)
  print(f"{s}: mean={round(mean(vals), 2)}, median={round(median(vals), 2)}")
}

Step 4 — Differential Expression Testing

# Run differential expression analysis
# diff_expr takes the count table and the number of control columns
# Columns should be ordered: gene, control_1..N, treated_1..M
let de_results = diff_expr(expressed, 3)

print("DE results:")
print(head(de_results, 5))
# ┌────────┬──────────┬──────────┬──────────┬──────────┐
# │ gene   │ log2fc   │ pvalue   │ fdr      │ mean_expr│
# ├────────┼──────────┼──────────┼──────────┼──────────┤
# │ MYC    │    3.42  │ 1.2e-08  │ 5.6e-06  │   412.5  │
# │ TP53   │   -2.18  │ 3.4e-06  │ 8.1e-04  │   156.3  │
# │ ...    │    ...   │    ...   │    ...   │     ...  │

# Count significant genes
de_results |> filter(|row| row["fdr"] < 0.05 and abs(row["log2fc"]) > 1.0) |> into sig
sig |> filter(|row| row["log2fc"] > 0.0) |> into up
sig |> filter(|row| row["log2fc"] < 0.0) |> into down

print("\n=== Summary ===")
print(f"Significant (FDR<0.05, |log2FC|>1): {nrow(sig)}")
print(f"  Upregulated:   {nrow(up)}")
print(f"  Downregulated: {nrow(down)}")

Step 5 — Multiple Testing Correction

# BioLang supports multiple correction methods
let bh = de_results  # Already has FDR via Benjamini-Hochberg (default)

# Apply Bonferroni for comparison
let n_genes = nrow(de_results)
let pvals = col(de_results, "pvalue")
let bonf = mutate(de_results, "bonf_pvalue", |row| min(row.pvalue * n_genes, 1.0))

# Compare methods
let n_bh = nrow(filter(bh, |row| row["fdr"] < 0.05))
let n_bonf = nrow(filter(bonf, |row| row["bonf_pvalue"] < 0.05))

print("Significant genes (FDR<0.05):")
print(f"  Benjamini-Hochberg: {n_bh}")
print(f"  Bonferroni:         {n_bonf}")

# Q-value approach (same as BH FDR)
let de_with_q = mutate(de_results, "qvalue", |row| row.fdr)

Step 6 — Volcano Plot

# Create a volcano plot
let vp = volcano(de_results, {
  title: "Treated vs Control",
})

save_svg(vp, "results/volcano.svg")
print("Volcano plot saved")

# Volcano plot with custom options
let vp2 = volcano(de_results, {
  title:  "Treated vs Control (custom)",
  fc_threshold: 1.0,
  fdr_threshold: 0.05,
})

Step 7 — MA Plot

# MA plot: log2FC vs mean expression
let ma = ma_plot(de_results, {
  title: "MA Plot — Treated vs Control",
})

save_svg(ma, "results/ma_plot.svg")
print("MA plot saved")

Step 8 — Heatmap of Top Genes

# Select top 50 DE genes for heatmap
let top_sorted = de_results |> arrange("fdr") |> head(50)
let top_genes = col(top_sorted, "gene")

# Extract normalized expression for these genes
let heatmap_data = log_cpm
  |> filter(|row| row["gene"] in top_genes)
  |> select("gene", "ctrl_1", "ctrl_2", "ctrl_3", "treat_1", "treat_2", "treat_3")

# Z-score normalize each sample column for visualization
let z_scored = heatmap_data
for s in sample_cols {
  let m = mean(col(heatmap_data, s))
  let sd = stdev(col(heatmap_data, s))
  z_scored = mutate(z_scored, s, |row| (row[s] - m) / sd)
}

# Generate heatmap
let hm = heatmap(z_scored, {
  title: "Top 50 DE Genes (z-score)",
})

save_svg(hm, "results/heatmap.svg")
print("Heatmap saved to results/heatmap.svg")

Step 9 — Pathway Enrichment

# requires: internet connection
# Pathway enrichment using KEGG
let sig_genes = col(sig, "gene")

# Look up KEGG pathways for significant genes
let kegg_hits = map(sig_genes, |g| kegg_find("genes", g))

# GO annotations for significant genes
let go_hits = map(sig_genes, |g| go_annotations(g))

# Reactome pathways
let pathway_hits = map(sig_genes, |g| reactome_pathways(g))

print("\n=== Pathway Results ===")
print(f"Genes queried: {len(sig_genes)}")

# Collect results into a summary table
let kegg_summary = from_records(map(sig_genes, |g| {
  gene: g,
  kegg: kegg_find("genes", g),
}))

print(head(kegg_summary, 10))

# Visualize with a plot
let p = plot(kegg_summary, {
  title: "KEGG Pathway Enrichment",
})
save_svg(p, "results/enrichment.svg")

Step 10 — Complete Pipeline

# rnaseq_pipeline.bio — full DE analysis pipeline

fn run_de_analysis(counts_file, design_file, output_dir) {
  # Load data
  let counts = csv(counts_file)
  let design = csv(design_file)
  let samples = col(design, "sample")
  let n_ctrl = nrow(filter(design, |r| r["group"] == "control"))

  # Filter low-count genes
  let expressed = counts |> filter(|row|
    samples |> count_if(|s| row[s] >= 10) >= 3
  )

  # Differential expression
  let de = diff_expr(expressed, n_ctrl)

  # Write full results
  write_csv(de, f"{output_dir}/de_results.csv")

  # Significant genes
  let sig = de |> filter(|r| r["fdr"] < 0.05 and abs(r["log2fc"]) > 1.0)
  write_csv(sig, f"{output_dir}/significant_genes.csv")

  # Plots
  let vp = volcano(de)
  save_svg(vp, f"{output_dir}/volcano.svg")

  let ma = ma_plot(de)
  save_svg(ma, f"{output_dir}/ma_plot.svg")

  # Summary
  print("=== DE Analysis Complete ===")
  print(f"Total genes tested: {nrow(de)}")
  print(f"Significant (FDR<0.05, |FC|>2): {nrow(sig)}")
  let up_count = nrow(filter(sig, |r| r["log2fc"] > 0))
  let down_count = nrow(filter(sig, |r| r["log2fc"] < 0))
  print(f"  Up: {up_count}")
  print(f"  Down: {down_count}")
  print(f"Results saved to {output_dir}/")
}

run_de_analysis("data/counts.csv", "data/design.csv", "results")

Next Steps

Explore protein-level analysis in the Protein Analysis tutorial.