Metagenomics
Metagenomic analysis involves taxonomic profiling, diversity estimation, and community comparison from environmental sequencing data. BioLang handles the common data formats and statistical methods used in microbiome research.
Taxonomic Classification
Parsing Kraken2 output
# Parse Kraken2 report file
let report = tsv("kraken2_report.tsv")
|> map(|row| { {
pct: row[0] |> trim |> float,
count_clade: row[1] |> trim |> int,
count_taxon: row[2] |> trim |> int,
rank: row[3] |> trim,
taxid: row[4] |> trim |> int,
name: row[5] |> trim
}})
# Top species
let species = report
|> filter(|r| { r.rank == "S" && r.pct > 0.1 })
|> sort_by(|a, b| b.pct - a.pct)
print("Top species by abundance:")
for s in species |> take(15) {
let bar = "#" |> repeat((s.pct * 2.0) |> int)
print(f" {s.pct |> round(2)}% {s.name} {bar}")
}
Building abundance tables
# Build a genus-level abundance table from multiple Kraken2 reports
let samples = glob("kraken2_reports/*.tsv")
let abundance = samples |> map(|path| {
let sample_name = basename(path) |> replace(".tsv", "")
let report = tsv(path)
|> filter(|row| { row[3] |> trim == "G" }) # Genus level
|> map(|row| { { genus: row[5] |> trim, count: row[2] |> trim |> int } })
{ sample: sample_name, genera: report }
})
# Pivot into a genus x sample matrix
let all_genera = abundance
|> flat_map(|s| { s.genera |> map(|x| x.genus) })
|> unique
|> sort
let matrix = all_genera |> map(|genus| {
let row = { genus: genus }
for s in abundance {
let count = s.genera |> find(|g| { g.genus == genus })?.count ?? 0
row[s.sample] = count
}
row
})
matrix |> write_tsv("genus_abundance_table.tsv")
print(f"Abundance table: {all_genera |> len} genera x {samples |> len} samples")
Diversity Analysis
Alpha diversity
# Calculate alpha diversity metrics for each sample
let abundance = tsv("genus_abundance_table.tsv")
let sample_names = abundance |> colnames |> filter(|c| { c != "genus" })
let alpha = sample_names |> map(|sample| {
let counts = abundance |> map(|row| { row[sample] |> int }) |> filter(|c| { c > 0 })
let total = counts |> sum |> float
let proportions = counts |> map(|c| { c |> float / total })
# Shannon diversity: -sum(p * ln(p))
let shannon = proportions |> map(|p| { -p * log(p) }) |> sum
# Simpson diversity: 1 - sum(p^2)
let simpson = 1.0 - (proportions |> map(|p| { pow(p, 2) }) |> sum)
# Chao1: richness + (singletons^2 / (2 * doubletons))
let richness = counts |> len
let singletons = counts |> filter(|c| { c == 1 }) |> len |> float
let doubletons = counts |> filter(|c| { c == 2 }) |> len |> float
let chao1 = if doubletons > 0.0 {
richness |> float + (singletons * singletons) / (2.0 * doubletons)
} else {
richness |> float
}
# Pielou evenness: shannon / ln(richness)
let evenness = if richness > 1 { shannon / log(richness |> float) } else { 0.0 }
{
sample: sample,
richness: richness,
shannon: shannon,
simpson: simpson,
chao1: chao1,
evenness: evenness
}
})
print("Alpha Diversity:")
print("{'Sample':-20} {'Richness':>10} {'Shannon':>10} {'Simpson':>10} {'Chao1':>10}")
for a in alpha {
print(f"{a.sample:-20} {a.richness:>10} {a.shannon |> round(3):>10} {a.simpson |> round(3):>10} {a.chao1 |> round(1):>10}")
}
alpha |> write_tsv("alpha_diversity.tsv")
Beta diversity and ordination
# Bray-Curtis dissimilarity between samples
let abundance = tsv("genus_abundance_table.tsv")
let sample_names = abundance |> colnames |> filter(|c| { c != "genus" })
# Build count matrix
let count_matrix = sample_names |> map(|sample| {
abundance |> map(|row| { row[sample] |> int })
})
# Pairwise Bray-Curtis dissimilarity
let n = count_matrix |> len
let bc_matrix = range(0, n) |> map(|i| {
range(0, n) |> map(|j| {
let a = count_matrix[i]
let b = count_matrix[j]
let sum_min = range(0, a |> len) |> map(|k| { min(a[k], b[k]) }) |> sum |> float
let sum_a = a |> sum |> float
let sum_b = b |> sum |> float
1.0 - (2.0 * sum_min / (sum_a + sum_b))
})
})
# PCA on distance matrix as ordination approximation
let pca_result = matrix(bc_matrix) |> pca(3)
print("PCA on Bray-Curtis (variance explained):")
for i, var in pca_result.variance_ratio |> enumerate |> take(3) {
print(f" Axis {i + 1}: {(var * 100.0) |> round(1)}%")
}
# Export coordinates
let coords = sample_names |> enumerate |> map(|i, name| { {
sample: name,
axis1: pca_result.components[i][0],
axis2: pca_result.components[i][1],
axis3: pca_result.components[i][2]
}})
coords |> write_tsv("ordination_coordinates.tsv")
Differential Abundance
Comparing groups
# Simple differential abundance between two groups
let abundance = tsv("genus_abundance_table.tsv")
let metadata = tsv("sample_metadata.tsv")
let group_a = metadata |> filter(|r| { r["group"] == "healthy" }) |> map(|x| x.sample) |> collect
let group_b = metadata |> filter(|r| { r["group"] == "disease" }) |> map(|x| x.sample) |> collect
let genera = abundance |> map(|x| x.genus) |> collect
let diff_results = genera |> map(|genus| {
let row = abundance |> find(|r| { r["genus"] == genus })
let counts_a = group_a |> map(|s| { row[s] |> float })
let counts_b = group_b |> map(|s| { row[s] |> float })
let test = wilcoxon(counts_a, counts_b)
{
genus: genus,
mean_healthy: counts_a |> mean,
mean_disease: counts_b |> mean,
log2fc: log2((counts_b |> mean + 1.0) / (counts_a |> mean + 1.0)),
p_value: test.p_value
}
})
# Multiple testing correction
let pvals = diff_results |> map(|r| r.p_value)
let qvals = p_adjust(pvals, "bh")
let corrected = diff_results |> map(|r, i| { genus: r.genus, mean_healthy: r.mean_healthy, mean_disease: r.mean_disease, log2fc: r.log2fc, p_value: r.p_value, q_value: qvals[i] })
let sig = corrected |> filter(|r| { r.q_value < 0.05 }) |> sort_by(|a, b| a.q_value - b.q_value)
print(f"Differentially abundant genera (FDR < 0.05): {sig |> len}")
sig |> each(|r| {
let direction = if r.log2fc > 0 { "enriched in disease" } else { "enriched in healthy" }
print(f" {r.genus}: log2FC={r.log2fc |> round(2)}, q={r.q_value |> round(4)} ({direction})")
})
Functional Profiling
Pathway abundance from HUMAnN output
# Parse HUMAnN3 pathway abundance output
let pathways = tsv("humann3_pathabundance.tsv")
|> filter(|row| { !(row["pathway"] |> contains("|")) }) # Remove stratified rows
|> filter(|row| { row["pathway"] != "UNMAPPED" && row["pathway"] != "UNINTEGRATED" })
# Normalize to relative abundance (CPM)
let sample_cols = pathways |> colnames |> filter(|c| { c != "pathway" })
let total = sample_cols |> map(|col| {
pathways |> map(|row| { row[col] |> float }) |> sum
})
let normalized = pathways |> map(|row| {
let new_row = { pathway: row["pathway"] }
for i, col in sample_cols |> enumerate {
new_row[col] = row[col] |> float / total[i] * 1e6
}
new_row
})
normalized |> write_tsv("pathway_abundance_cpm.tsv")
print(f"{normalized |> len} pathways across {sample_cols |> len} samples")
Resistome profiling
# Analyze antibiotic resistance gene abundance
let amr = tsv("amr_results.tsv")
# Summarize by drug class
let by_class = amr
|> group_by("drug_class")
|> summarize(|drug_class, rows| {
drug_class: drug_class,
n_genes: rows |> map(|x| x.gene) |> unique |> len,
total_rpkm: rows |> map(|x| x.rpkm) |> sum,
samples_present: rows |> map(|x| x.sample) |> unique |> len
})
|> sort_by(|a, b| b.total_rpkm - a.total_rpkm)
print("AMR Gene Summary by Drug Class:")
for cls in by_class {
print(f" {cls.drug_class}: {cls.n_genes} genes, {cls.total_rpkm |> round(1)} total RPKM")
}