Metagenomics

Metagenomic analysis involves taxonomic profiling, diversity estimation, and community comparison from environmental sequencing data. BioLang handles the common data formats and statistical methods used in microbiome research.

Taxonomic Classification

Parsing Kraken2 output

# Parse Kraken2 report file
let report = tsv("kraken2_report.tsv")
  |> map(|row| { {
    pct: row[0] |> trim |> float,
    count_clade: row[1] |> trim |> int,
    count_taxon: row[2] |> trim |> int,
    rank: row[3] |> trim,
    taxid: row[4] |> trim |> int,
    name: row[5] |> trim
  }})

# Top species
let species = report
  |> filter(|r| { r.rank == "S" && r.pct > 0.1 })
  |> sort_by(|a, b| b.pct - a.pct)

print("Top species by abundance:")
for s in species |> take(15) {
  let bar = "#" |> repeat((s.pct * 2.0) |> int)
  print(f"  {s.pct |> round(2)}% {s.name} {bar}")
}

Building abundance tables

# Build a genus-level abundance table from multiple Kraken2 reports
let samples = glob("kraken2_reports/*.tsv")

let abundance = samples |> map(|path| {
  let sample_name = basename(path) |> replace(".tsv", "")
  let report = tsv(path)
    |> filter(|row| { row[3] |> trim == "G" })  # Genus level
    |> map(|row| { { genus: row[5] |> trim, count: row[2] |> trim |> int } })
  { sample: sample_name, genera: report }
})

# Pivot into a genus x sample matrix
let all_genera = abundance
  |> flat_map(|s| { s.genera |> map(|x| x.genus) })
  |> unique
  |> sort

let matrix = all_genera |> map(|genus| {
  let row = { genus: genus }
  for s in abundance {
    let count = s.genera |> find(|g| { g.genus == genus })?.count ?? 0
    row[s.sample] = count
  }
  row
})

matrix |> write_tsv("genus_abundance_table.tsv")
print(f"Abundance table: {all_genera |> len} genera x {samples |> len} samples")

Diversity Analysis

Alpha diversity

# Calculate alpha diversity metrics for each sample
let abundance = tsv("genus_abundance_table.tsv")
let sample_names = abundance |> colnames |> filter(|c| { c != "genus" })

let alpha = sample_names |> map(|sample| {
  let counts = abundance |> map(|row| { row[sample] |> int }) |> filter(|c| { c > 0 })
  let total = counts |> sum |> float
  let proportions = counts |> map(|c| { c |> float / total })

  # Shannon diversity: -sum(p * ln(p))
  let shannon = proportions |> map(|p| { -p * log(p) }) |> sum

  # Simpson diversity: 1 - sum(p^2)
  let simpson = 1.0 - (proportions |> map(|p| { pow(p, 2) }) |> sum)

  # Chao1: richness + (singletons^2 / (2 * doubletons))
  let richness = counts |> len
  let singletons = counts |> filter(|c| { c == 1 }) |> len |> float
  let doubletons = counts |> filter(|c| { c == 2 }) |> len |> float
  let chao1 = if doubletons > 0.0 {
    richness |> float + (singletons * singletons) / (2.0 * doubletons)
  } else {
    richness |> float
  }

  # Pielou evenness: shannon / ln(richness)
  let evenness = if richness > 1 { shannon / log(richness |> float) } else { 0.0 }

  {
    sample: sample,
    richness: richness,
    shannon: shannon,
    simpson: simpson,
    chao1: chao1,
    evenness: evenness
  }
})

print("Alpha Diversity:")
print("{'Sample':-20} {'Richness':>10} {'Shannon':>10} {'Simpson':>10} {'Chao1':>10}")
for a in alpha {
  print(f"{a.sample:-20} {a.richness:>10} {a.shannon |> round(3):>10} {a.simpson |> round(3):>10} {a.chao1 |> round(1):>10}")
}

alpha |> write_tsv("alpha_diversity.tsv")

Beta diversity and ordination

# Bray-Curtis dissimilarity between samples
let abundance = tsv("genus_abundance_table.tsv")
let sample_names = abundance |> colnames |> filter(|c| { c != "genus" })

# Build count matrix
let count_matrix = sample_names |> map(|sample| {
  abundance |> map(|row| { row[sample] |> int })
})

# Pairwise Bray-Curtis dissimilarity
let n = count_matrix |> len
let bc_matrix = range(0, n) |> map(|i| {
  range(0, n) |> map(|j| {
    let a = count_matrix[i]
    let b = count_matrix[j]
    let sum_min = range(0, a |> len) |> map(|k| { min(a[k], b[k]) }) |> sum |> float
    let sum_a = a |> sum |> float
    let sum_b = b |> sum |> float
    1.0 - (2.0 * sum_min / (sum_a + sum_b))
  })
})

# PCA on distance matrix as ordination approximation
let pca_result = matrix(bc_matrix) |> pca(3)

print("PCA on Bray-Curtis (variance explained):")
for i, var in pca_result.variance_ratio |> enumerate |> take(3) {
  print(f"  Axis {i + 1}: {(var * 100.0) |> round(1)}%")
}

# Export coordinates
let coords = sample_names |> enumerate |> map(|i, name| { {
  sample: name,
  axis1: pca_result.components[i][0],
  axis2: pca_result.components[i][1],
  axis3: pca_result.components[i][2]
}})
coords |> write_tsv("ordination_coordinates.tsv")

Differential Abundance

Comparing groups

# Simple differential abundance between two groups
let abundance = tsv("genus_abundance_table.tsv")
let metadata = tsv("sample_metadata.tsv")

let group_a = metadata |> filter(|r| { r["group"] == "healthy" }) |> map(|x| x.sample) |> collect
let group_b = metadata |> filter(|r| { r["group"] == "disease" }) |> map(|x| x.sample) |> collect

let genera = abundance |> map(|x| x.genus) |> collect

let diff_results = genera |> map(|genus| {
  let row = abundance |> find(|r| { r["genus"] == genus })
  let counts_a = group_a |> map(|s| { row[s] |> float })
  let counts_b = group_b |> map(|s| { row[s] |> float })

  let test = wilcoxon(counts_a, counts_b)
  {
    genus: genus,
    mean_healthy: counts_a |> mean,
    mean_disease: counts_b |> mean,
    log2fc: log2((counts_b |> mean + 1.0) / (counts_a |> mean + 1.0)),
    p_value: test.p_value
  }
})

# Multiple testing correction
let pvals = diff_results |> map(|r| r.p_value)
let qvals = p_adjust(pvals, "bh")
let corrected = diff_results |> map(|r, i| { genus: r.genus, mean_healthy: r.mean_healthy, mean_disease: r.mean_disease, log2fc: r.log2fc, p_value: r.p_value, q_value: qvals[i] })

let sig = corrected |> filter(|r| { r.q_value < 0.05 }) |> sort_by(|a, b| a.q_value - b.q_value)
print(f"Differentially abundant genera (FDR < 0.05): {sig |> len}")
sig |> each(|r| {
  let direction = if r.log2fc > 0 { "enriched in disease" } else { "enriched in healthy" }
  print(f"  {r.genus}: log2FC={r.log2fc |> round(2)}, q={r.q_value |> round(4)} ({direction})")
})

Functional Profiling

Pathway abundance from HUMAnN output

# Parse HUMAnN3 pathway abundance output
let pathways = tsv("humann3_pathabundance.tsv")
  |> filter(|row| { !(row["pathway"] |> contains("|")) })  # Remove stratified rows
  |> filter(|row| { row["pathway"] != "UNMAPPED" && row["pathway"] != "UNINTEGRATED" })

# Normalize to relative abundance (CPM)
let sample_cols = pathways |> colnames |> filter(|c| { c != "pathway" })
let total = sample_cols |> map(|col| {
  pathways |> map(|row| { row[col] |> float }) |> sum
})

let normalized = pathways |> map(|row| {
  let new_row = { pathway: row["pathway"] }
  for i, col in sample_cols |> enumerate {
    new_row[col] = row[col] |> float / total[i] * 1e6
  }
  new_row
})

normalized |> write_tsv("pathway_abundance_cpm.tsv")
print(f"{normalized |> len} pathways across {sample_cols |> len} samples")

Resistome profiling

# Analyze antibiotic resistance gene abundance
let amr = tsv("amr_results.tsv")

# Summarize by drug class
let by_class = amr
  |> group_by("drug_class")
  |> summarize(|drug_class, rows| {
    drug_class: drug_class,
    n_genes: rows |> map(|x| x.gene) |> unique |> len,
    total_rpkm: rows |> map(|x| x.rpkm) |> sum,
    samples_present: rows |> map(|x| x.sample) |> unique |> len
  })
  |> sort_by(|a, b| b.total_rpkm - a.total_rpkm)

print("AMR Gene Summary by Drug Class:")
for cls in by_class {
  print(f"  {cls.drug_class}: {cls.n_genes} genes, {cls.total_rpkm |> round(1)} total RPKM")
}