Genomics
Genome-scale analysis is where BioLang shines. These examples cover common genomics tasks from read quality assessment to variant analysis, using BioLang's native format support and streaming capabilities.
Alignment Statistics
BAM file summary
# Comprehensive alignment statistics
let bam = read_bam("aligned.bam")
let stats = bam |> flagstat
print(f"Total reads: {stats.total}")
print(f"Mapped reads: {stats.mapped} ({stats.mapped_pct}%)")
print(f"Duplicates: {stats.duplicates}")
print(f"Mean MAPQ: {stats.mean_mapq}")
print(f"Mean insert: {stats.mean_insert_size}")
Per-chromosome read counts
let bam = read_bam("aligned.bam")
bam
|> filter(|r| r.is_mapped)
|> map(|r| r.chrom)
|> frequencies
|> print
Mapping quality distribution
let bam = read_bam("aligned.bam")
let mapq_dist = bam
|> filter(|r| r.is_mapped)
|> map(|r| r.mapq)
|> histogram(60)
mapq_dist |> write_tsv("mapq_distribution.tsv")
print(f"Reads with MAPQ >= 30: {bam |> filter(|r| { r.mapq >= 30 }) |> len}")
Coverage Analysis
Windowed coverage
# Calculate coverage in fixed-size windows
let bam = read_bam("aligned.bam")
let window_size = 10000
let coverage = range(0, 249250621, window_size)
|> map(|start| {
let end = start + window_size
let depths = bam
|> filter(|r| { r.is_mapped && r.chrom == "chr1" && r.pos >= start && r.pos < end })
|> map(|r| r.mapq)
|> collect
let n = depths |> len
{
chrom: "chr1",
start: start,
end: end,
mean_depth: if n > 0 { depths |> mean } else { 0.0 },
breadth: n |> float / window_size |> float
}
})
coverage |> write_tsv("coverage_windows.tsv")
let avg = coverage |> map(|c| c.mean_depth) |> mean
print(f"Average genome-wide depth: {avg |> round(1)}x")
Coverage at specific regions
# Check coverage over target regions (e.g., exome capture)
let bam = read_bam("aligned.bam")
let targets = read_bed("capture_targets.bed")
let region_cov = targets
|> map(|t| {
let depths = bam
|> filter(|r| { r.is_mapped && r.chrom == t.chrom && r.pos >= t.start && r.pos < t.end })
|> map(|r| r.mapq)
|> collect
let total = depths |> len
let avg = if total > 0 { depths |> mean } else { 0.0 }
let pct_above = |thresh| if total > 0 { (depths |> filter(|d| d >= thresh) |> len) |> float / total |> float } else { 0.0 }
{
gene: t.name,
mean_depth: avg,
pct_10x: pct_above(10),
pct_20x: pct_above(20),
pct_30x: pct_above(30)
}
})
# Find under-covered regions
let low_cov = region_cov |> filter(|r| { r.pct_20x < 0.90 })
print(f"Regions below 90% at 20x: {low_cov |> len}")
low_cov |> write_tsv("low_coverage_regions.tsv")
Variant Analysis
Variant density across the genome
# Calculate variant density in sliding windows
let vcf = read_vcf("variants.vcf.gz")
let density = vcf
|> filter(|v| { v.filter == "PASS" })
|> mutate("window_start", |v| (v.pos / 100000) * 100000)
|> group_by("chrom")
|> summarize(|chrom, rows| {
chrom: chrom,
n_variants: len(rows),
density: len(rows) |> float / 100000.0
})
density |> write_tsv("variant_density.tsv")
Variant type summary
let vcf = read_vcf("variants.vcf.gz")
let summary = vcf
|> filter(|v| { v.filter == "PASS" })
|> map(|v| {
match {
is_snp(v) => "SNP",
is_indel(v) && len(v.alt) > len(v.ref) => "INS",
is_indel(v) && len(v.alt) < len(v.ref) => "DEL",
len(v.ref) > 1 && len(v.alt) == len(v.ref) => "MNP",
_ => "OTHER"
}
})
|> frequencies
for type, count in summary {
print(f"{type}: {count}")
}
Heterozygosity rate
let vcf = read_vcf("sample.vcf.gz")
let genotypes = vcf
|> filter(|v| { v.filter == "PASS" && is_snp(v) })
|> map(|v| v.genotypes |> first)
let het_count = genotypes |> filter(|g| { g == "0/1" || g == "0|1" }) |> len
let total = genotypes |> len
let het_rate = het_count |> float / total |> float
print(f"Heterozygosity rate: {het_rate |> round(4)}")
FASTA Analysis
Assembly statistics
let contigs = read_fasta("data/sequences.fasta")
|> map(|r| len(r.seq))
|> sort_by(|a, b| b - a)
|> collect
let total = contigs |> sum
let longest = contigs |> first
let n_contigs = contigs |> len
print("Assembly Statistics:")
print(f" Contigs: {n_contigs}")
print(f" Total bp: {total |> str}")
print(f" Longest: {longest |> str}")
print(f" GC content: {read_fasta("data/sequences.fasta") |> map(|r| gc_content(r.seq)) |> mean |> round(3)}")
Find motifs in a genome
# Search for a restriction enzyme site (EcoRI: GAATTC)
let genome = read_fasta("data/sequences.fasta")
for record in genome {
let sites = find_motif(record.seq, "GAATTC")
if sites |> len > 0 {
print(f"{record.id}: {sites |> len} EcoRI sites")
sites |> take(5) |> each(|pos| {
print(f" Position: {pos}")
})
}
}
GFF/GTF Annotation
Gene counts per chromosome
let gff = read_gff("annotations.gff3")
gff
|> filter(|f| f.type == "gene")
|> map(|f| f.seqid)
|> frequencies
|> each(|chrom, n| print(f"{chrom}: {n} genes"))
Exon size distribution
let gff = read_gff("annotations.gff3")
let exon_sizes = gff
|> filter(|f| f.type == "exon")
|> map(|f| f.end - f.start + 1)
print(f"Exon count: {exon_sizes |> len}")
print(f"Mean size: {exon_sizes |> mean |> round(1)} bp")
print(f"Median size: {exon_sizes |> median} bp")
print(f"Max size: {exon_sizes |> max} bp")