Intermediate ~30 min

Tutorial: Enrichment Analysis

In this tutorial you'll run over-representation analysis (ORA) and gene set enrichment analysis (GSEA) on differentially expressed genes. You'll learn how to load gene sets, run statistical tests, apply multiple-testing correction, and interpret the results.

Prerequisites: Familiarity with Tables and Statistics tutorials.

What you'll learn

  • Loading gene sets from GMT files
  • Over-representation analysis (ORA) with the hypergeometric test
  • Gene set enrichment analysis (GSEA) with permutation testing
  • Multiple-testing correction (Benjamini-Hochberg)
  • Combining enrichment with knowledge graphs
Run this tutorial: Download enrichment.bl and run it with bl run examples/tutorials/enrichment.bl

Step 1: Prepare your gene list

Start with a set of differentially expressed genes. In a real workflow these come from your DE analysis; here we'll use a curated list:

# Upregulated genes from a differential expression analysis
let de_genes = [
  "TP53", "CDKN1A", "BAX", "GADD45A", "MDM2",
  "CCND1", "CDK4", "RB1", "E2F1", "BRCA1",
  "CHEK2", "ATM", "ATR", "RAD51", "XRCC5"
]
print(f"DE genes: {len(de_genes)}")

Step 2: Load gene sets

Gene sets are typically distributed in GMT (Gene Matrix Transposed) format. Each line is: SET_NAME <tab> description <tab> gene1 <tab> gene2 ...

# requires: hallmark.gmt in working directory
# Load gene sets from a GMT file
let gene_sets = read_gmt("hallmark.gmt")
print(f"Loaded {len(gene_sets)} gene sets")

# Inspect one gene set
let first = gene_sets |> first()
print(f"Name: {first.name}")
print(f"Genes: {len(first.genes)}")

If you don't have a GMT file, you can also define gene sets inline:

# Manual gene sets for testing
let gene_sets = [
  {name: "HALLMARK_P53_PATHWAY", genes: ["TP53", "CDKN1A", "BAX", "GADD45A", "MDM2", "SESN1", "SESN2", "DDB2", "POLK"]},
  {name: "HALLMARK_G2M_CHECKPOINT", genes: ["CDK1", "CCNB1", "CCND1", "CDK4", "E2F1", "RB1", "CDC25A", "CHEK1"]},
  {name: "HALLMARK_DNA_REPAIR", genes: ["BRCA1", "BRCA2", "RAD51", "XRCC5", "ATM", "ATR", "CHEK2", "MLH1", "MSH2"]},
  {name: "HALLMARK_APOPTOSIS", genes: ["BAX", "BCL2", "CASP3", "CASP9", "BID", "BAK1", "CYCS", "APAF1"]},
  {name: "HALLMARK_MYC_TARGETS", genes: ["MYC", "MAX", "NCL", "NPM1", "RPS3", "RPL5", "NOP56", "FBL"]},
]

Step 3: Over-representation analysis (ORA)

ORA uses the hypergeometric test to determine whether your gene list is significantly enriched for specific gene sets compared to what you'd expect by chance.

# Run ORA
# Arguments: gene list, gene sets, background size (total genes in genome)
let background_size = 20000
let ora_results = enrich(de_genes, gene_sets, background_size)

print("ORA results:")
ora_results |> each(|r| {
  print(f"  {r.term}: p={r.pvalue:.4e}, overlap={r.overlap}/{r.term_size}")
})

The enrich() function returns a list of records, each containing:

Field Description
termGene set name
pvalueHypergeometric p-value
fdrBH-adjusted p-value
overlapNumber of DE genes in this set
term_sizeTotal genes in this set
genesList of overlapping genes

Step 4: Filter significant results

# Filter by FDR < 0.05
let significant = ora_results
  |> filter(|r| r.fdr < 0.05)
  |> sort_by(|r| r.fdr)

print(f"\nSignificant pathways (FDR < 0.05): {len(significant)}")
significant |> each(|r| {
  print(f"  {r.term}")
  print(f"    FDR: {r.fdr:.4e}, Overlap: {r.overlap}/{r.term_size}")
  print(f"    Genes: {r.genes}")
})

Step 5: Gene set enrichment analysis (GSEA)

Unlike ORA, GSEA doesn't require a cutoff. It takes a ranked gene list (e.g. by fold change or test statistic) and tests whether genes in each set tend to cluster at the top or bottom of the ranking.

# Create a ranked gene list (gene name + ranking metric)
# In practice this comes from your DE results
let ranked_genes = [
  {gene: "BAX", score: 4.2},
  {gene: "CDKN1A", score: 3.8},
  {gene: "TP53", score: 3.5},
  {gene: "GADD45A", score: 3.1},
  {gene: "BRCA1", score: 2.7},
  {gene: "ATM", score: 2.3},
  {gene: "RAD51", score: 1.9},
  {gene: "CCND1", score: 0.5},
  {gene: "CDK4", score: 0.3},
  {gene: "E2F1", score: -0.2},
  {gene: "RB1", score: -0.8},
  {gene: "MYC", score: -1.5},
  {gene: "BCL2", score: -2.1},
]

# Run GSEA with permutation testing
let gsea_results = gsea(ranked_genes, gene_sets)

print("GSEA results:")
gsea_results |> each(|r| {
  print(f"  {r.term}: NES={r.nes:.3f}, p={r.pvalue:.4e}, FDR={r.fdr:.4e}")
})

GSEA results include:

Field Description
termGene set name
esEnrichment score
nesNormalized enrichment score
pvaluePermutation p-value
fdrBH-adjusted p-value
sizeGenes in set found in ranked list

Step 6: Multiple-testing correction

Both enrich() and gsea() apply BH correction automatically. You can also apply it manually to any list of p-values:

# Manual BH correction
let raw_pvalues = [0.001, 0.013, 0.039, 0.052, 0.11, 0.23, 0.45]
let adjusted = p_adjust(raw_pvalues, "bh")
print("Raw → Adjusted:")
range(0, len(raw_pvalues)) |> each(|i| {
  print(f"  {raw_pvalues[i]:.3f} → {adjusted[i]:.3f}")
})

Step 7: Combine with knowledge graphs

Use enrichment results to build a pathway-gene network:

# Build a bipartite graph: pathways ↔ genes
let g = graph()

significant |> each(|pathway| {
  pathway.genes |> each(|gene| {
    let g = add_edge(g, pathway.term, gene, {fdr: pathway.fdr})
  })
})

print(f"Pathway-gene network: {node_count(g)} nodes, {edge_count(g)} edges")

# Which genes appear in multiple significant pathways?
nodes(g) |> filter(|n| !starts_with(n, "HALLMARK_")) |> into gene_nodes
let multi_pathway = gene_nodes
  |> map(|n| {gene: n, pathways: degree(g, n)})
  |> filter(|r| r.pathways > 1)
  |> sort_by(|r| r.pathways, desc: true)

print("\nGenes in multiple enriched pathways:")
multi_pathway |> each(|r| print(f"  {r.gene}: {r.pathways} pathways"))

Complete example

# enrichment_analysis.bl — Full ORA + GSEA workflow

# Define gene sets (or load from GMT: read_gmt("hallmark.gmt"))
let gene_sets = [
  {name: "P53_PATHWAY", genes: ["TP53", "CDKN1A", "BAX", "GADD45A", "MDM2", "SESN1", "DDB2"]},
  {name: "DNA_REPAIR", genes: ["BRCA1", "RAD51", "XRCC5", "ATM", "ATR", "CHEK2", "MLH1"]},
  {name: "G2M_CHECKPOINT", genes: ["CDK1", "CCND1", "CDK4", "E2F1", "RB1", "CHEK1"]},
  {name: "APOPTOSIS", genes: ["BAX", "BCL2", "CASP3", "CASP9", "BID", "BAK1"]},
]

# DE genes
let de_genes = ["TP53", "CDKN1A", "BAX", "GADD45A", "MDM2", "BRCA1", "CHEK2",
                "ATM", "ATR", "RAD51", "XRCC5", "CCND1", "CDK4", "RB1", "E2F1"]

# ORA
print("=== Over-Representation Analysis ===")
let ora = enrich(de_genes, gene_sets, 20000)
  |> filter(|r| r.fdr < 0.05)
  |> sort_by(|r| r.fdr)

ora |> each(|r| print(f"  {r.term}: FDR={r.fdr:.4e}, {r.overlap}/{r.term_size} genes"))

# GSEA with ranked list
print("\n=== Gene Set Enrichment Analysis ===")
let ranked = [
  {gene: "BAX", score: 4.2}, {gene: "CDKN1A", score: 3.8},
  {gene: "TP53", score: 3.5}, {gene: "GADD45A", score: 3.1},
  {gene: "BRCA1", score: 2.7}, {gene: "ATM", score: 2.3},
  {gene: "RAD51", score: 1.9}, {gene: "CCND1", score: 0.5},
  {gene: "CDK4", score: 0.3}, {gene: "E2F1", score: -0.2},
  {gene: "RB1", score: -0.8}, {gene: "BCL2", score: -2.1},
]

let gsea_res = gsea(ranked, gene_sets)
  |> sort_by(|r| r.fdr)

gsea_res |> each(|r| print(f"  {r.term}: NES={r.nes:.3f}, FDR={r.fdr:.4e}"))

# Pathway network
print("\n=== Pathway-Gene Network ===")
let g = graph()
ora |> each(|pw| pw.genes |> each(|gene| {
  let g = add_edge(g, pw.term, gene, {fdr: pw.fdr})
}))
print(f"Network: {node_count(g)} nodes, {edge_count(g)} edges")

What's next