Intermediate
~35 minutes
RNA-seq Differential Expression
Analyze RNA-seq count data to identify differentially expressed genes between conditions. This tutorial covers normalization, statistical testing, visualization with volcano plots, and pathway enrichment analysis.
What you will learn
- Loading and inspecting count matrices
- CPM and TPM normalization
- Running differential expression analysis
- Multiple testing correction (FDR)
- Creating volcano and MA plots
- Gene set and pathway enrichment
Run this tutorial: Download
rnaseq-de.bl
and run it with
bl run examples/tutorials/rnaseq-de.bl
Step 1 — Loading the Count Matrix
A count matrix has genes as rows and samples as columns. Each cell contains the number of reads mapped to that gene in that sample.
# requires: data/counts.csv in working directory
# rnaseq.bio — differential expression analysis
# Read the count matrix
let counts = csv("data/counts.csv")
print(f"Genes: {nrow(counts)}")
print(f"Samples: {ncol(counts) - 1}") # minus the gene column
print(head(counts, 5))
# Define sample groups
let design = from_records([
{ sample: "ctrl_1", group: "control" },
{ sample: "ctrl_2", group: "control" },
{ sample: "ctrl_3", group: "control" },
{ sample: "treat_1", group: "treated" },
{ sample: "treat_2", group: "treated" },
{ sample: "treat_3", group: "treated" },
])
print("Design:")
print(design)
Step 2 — Exploratory Data Analysis
# Library sizes (total reads per sample)
let sample_cols = ["ctrl_1", "ctrl_2", "ctrl_3", "treat_1", "treat_2", "treat_3"]
let lib_sizes = sample_cols
|> map(|s| { sample: s, total_reads: sum(col(counts, s)) })
|> from_records()
print("\n=== Library Sizes ===")
print(lib_sizes)
# How many genes have zero counts in all samples?
let zero_genes = counts
|> filter(|row| {
sample_cols |> all(|s| row[s] == 0)
})
print(f"Genes with zero counts in all samples: {nrow(zero_genes)}")
# Filter out very low-count genes
let min_count = 10
let min_samples = 3
let expressed = counts |> filter(|row| {
let n_above = sample_cols |> count_if(|s| row[s] >= min_count)
n_above >= min_samples
})
print(f"Genes after filtering: {nrow(expressed)}")
Step 3 — Normalization
# requires: data/gene_lengths.csv in working directory
# CPM normalization (Counts Per Million)
# Apply CPM to each sample column
let cpm_table = expressed
let total_reads = sample_cols |> map(|s| sum(col(expressed, s)))
for i in range(0, len(sample_cols)) {
let s = sample_cols[i]
let total = total_reads[i]
cpm_table = mutate(cpm_table, s, |row| row[s] / total * 1_000_000.0)
}
print("CPM matrix:")
print(head(cpm_table, 3))
# TPM normalization (Transcripts Per Kilobase Million)
# Requires gene lengths
let gene_lengths = csv("data/gene_lengths.csv") # gene, length
let with_lengths = inner_join(expressed, gene_lengths, "gene")
let lengths = col(with_lengths, "length")
let tpm_table = with_lengths
for s in sample_cols {
tpm_table = mutate(tpm_table, s, |row| {
let rpk = row[s] / (row.length / 1000.0)
rpk
})
}
# Normalize RPK values to per-million
for s in sample_cols {
let total_rpk = sum(col(tpm_table, s))
tpm_table = mutate(tpm_table, s, |row| row[s] / total_rpk * 1_000_000.0)
}
print("\nTPM matrix:")
print(head(tpm_table, 3))
# Log2 transform for visualization
let log_cpm = cpm_table
for s in sample_cols {
log_cpm = mutate(log_cpm, s, |row| log2(row[s] + 1.0))
}
# Check the distribution
for s in sample_cols {
let vals = col(log_cpm, s)
print(f"{s}: mean={round(mean(vals), 2)}, median={round(median(vals), 2)}")
}
Step 4 — Differential Expression Testing
# Run differential expression analysis
# diff_expr takes the count table and the number of control columns
# Columns should be ordered: gene, control_1..N, treated_1..M
let de_results = diff_expr(expressed, 3)
print("DE results:")
print(head(de_results, 5))
# ┌────────┬──────────┬──────────┬──────────┬──────────┐
# │ gene │ log2fc │ pvalue │ fdr │ mean_expr│
# ├────────┼──────────┼──────────┼──────────┼──────────┤
# │ MYC │ 3.42 │ 1.2e-08 │ 5.6e-06 │ 412.5 │
# │ TP53 │ -2.18 │ 3.4e-06 │ 8.1e-04 │ 156.3 │
# │ ... │ ... │ ... │ ... │ ... │
# Count significant genes
de_results |> filter(|row| row["fdr"] < 0.05 and abs(row["log2fc"]) > 1.0) |> into sig
sig |> filter(|row| row["log2fc"] > 0.0) |> into up
sig |> filter(|row| row["log2fc"] < 0.0) |> into down
print("\n=== Summary ===")
print(f"Significant (FDR<0.05, |log2FC|>1): {nrow(sig)}")
print(f" Upregulated: {nrow(up)}")
print(f" Downregulated: {nrow(down)}")
Step 5 — Multiple Testing Correction
# BioLang supports multiple correction methods
let bh = de_results # Already has FDR via Benjamini-Hochberg (default)
# Apply Bonferroni for comparison
let n_genes = nrow(de_results)
let pvals = col(de_results, "pvalue")
let bonf = mutate(de_results, "bonf_pvalue", |row| min(row.pvalue * n_genes, 1.0))
# Compare methods
let n_bh = nrow(filter(bh, |row| row["fdr"] < 0.05))
let n_bonf = nrow(filter(bonf, |row| row["bonf_pvalue"] < 0.05))
print("Significant genes (FDR<0.05):")
print(f" Benjamini-Hochberg: {n_bh}")
print(f" Bonferroni: {n_bonf}")
# Q-value approach (same as BH FDR)
let de_with_q = mutate(de_results, "qvalue", |row| row.fdr)
Step 6 — Volcano Plot
# Create a volcano plot
let vp = volcano(de_results, {
title: "Treated vs Control",
})
save_svg(vp, "results/volcano.svg")
print("Volcano plot saved")
# Volcano plot with custom options
let vp2 = volcano(de_results, {
title: "Treated vs Control (custom)",
fc_threshold: 1.0,
fdr_threshold: 0.05,
})
Step 7 — MA Plot
# MA plot: log2FC vs mean expression
let ma = ma_plot(de_results, {
title: "MA Plot — Treated vs Control",
})
save_svg(ma, "results/ma_plot.svg")
print("MA plot saved")
Step 8 — Heatmap of Top Genes
# Select top 50 DE genes for heatmap
let top_sorted = de_results |> arrange("fdr") |> head(50)
let top_genes = col(top_sorted, "gene")
# Extract normalized expression for these genes
let heatmap_data = log_cpm
|> filter(|row| row["gene"] in top_genes)
|> select("gene", "ctrl_1", "ctrl_2", "ctrl_3", "treat_1", "treat_2", "treat_3")
# Z-score normalize each sample column for visualization
let z_scored = heatmap_data
for s in sample_cols {
let m = mean(col(heatmap_data, s))
let sd = stdev(col(heatmap_data, s))
z_scored = mutate(z_scored, s, |row| (row[s] - m) / sd)
}
# Generate heatmap
let hm = heatmap(z_scored, {
title: "Top 50 DE Genes (z-score)",
})
save_svg(hm, "results/heatmap.svg")
print("Heatmap saved to results/heatmap.svg")
Step 9 — Pathway Enrichment
# requires: internet connection
# Pathway enrichment using KEGG
let sig_genes = col(sig, "gene")
# Look up KEGG pathways for significant genes
let kegg_hits = map(sig_genes, |g| kegg_find("genes", g))
# GO annotations for significant genes
let go_hits = map(sig_genes, |g| go_annotations(g))
# Reactome pathways
let pathway_hits = map(sig_genes, |g| reactome_pathways(g))
print("\n=== Pathway Results ===")
print(f"Genes queried: {len(sig_genes)}")
# Collect results into a summary table
let kegg_summary = from_records(map(sig_genes, |g| {
gene: g,
kegg: kegg_find("genes", g),
}))
print(head(kegg_summary, 10))
# Visualize with a plot
let p = plot(kegg_summary, {
title: "KEGG Pathway Enrichment",
})
save_svg(p, "results/enrichment.svg")
Step 10 — Complete Pipeline
# rnaseq_pipeline.bio — full DE analysis pipeline
fn run_de_analysis(counts_file, design_file, output_dir) {
# Load data
let counts = csv(counts_file)
let design = csv(design_file)
let samples = col(design, "sample")
let n_ctrl = nrow(filter(design, |r| r["group"] == "control"))
# Filter low-count genes
let expressed = counts |> filter(|row|
samples |> count_if(|s| row[s] >= 10) >= 3
)
# Differential expression
let de = diff_expr(expressed, n_ctrl)
# Write full results
write_csv(de, f"{output_dir}/de_results.csv")
# Significant genes
let sig = de |> filter(|r| r["fdr"] < 0.05 and abs(r["log2fc"]) > 1.0)
write_csv(sig, f"{output_dir}/significant_genes.csv")
# Plots
let vp = volcano(de)
save_svg(vp, f"{output_dir}/volcano.svg")
let ma = ma_plot(de)
save_svg(ma, f"{output_dir}/ma_plot.svg")
# Summary
print("=== DE Analysis Complete ===")
print(f"Total genes tested: {nrow(de)}")
print(f"Significant (FDR<0.05, |FC|>2): {nrow(sig)}")
let up_count = nrow(filter(sig, |r| r["log2fc"] > 0))
let down_count = nrow(filter(sig, |r| r["log2fc"] < 0))
print(f" Up: {up_count}")
print(f" Down: {down_count}")
print(f"Results saved to {output_dir}/")
}
run_de_analysis("data/counts.csv", "data/design.csv", "results")
Next Steps
Explore protein-level analysis in the Protein Analysis tutorial.