Genomics

Genome-scale analysis is where BioLang shines. These examples cover common genomics tasks from read quality assessment to variant analysis, using BioLang's native format support and streaming capabilities.

Alignment Statistics

BAM file summary

# Comprehensive alignment statistics
let bam = read_bam("aligned.bam")

let stats = bam |> flagstat
print(f"Total reads:    {stats.total}")
print(f"Mapped reads:   {stats.mapped} ({stats.mapped_pct}%)")
print(f"Duplicates:     {stats.duplicates}")
print(f"Mean MAPQ:      {stats.mean_mapq}")
print(f"Mean insert:    {stats.mean_insert_size}")

Per-chromosome read counts

let bam = read_bam("aligned.bam")

bam
  |> filter(|r| r.is_mapped)
  |> map(|r| r.chrom)
  |> frequencies
  |> print

Mapping quality distribution

let bam = read_bam("aligned.bam")

let mapq_dist = bam
  |> filter(|r| r.is_mapped)
  |> map(|r| r.mapq)
  |> histogram(60)

mapq_dist |> write_tsv("mapq_distribution.tsv")
print(f"Reads with MAPQ >= 30: {bam |> filter(|r| { r.mapq >= 30 }) |> len}")

Coverage Analysis

Windowed coverage

# Calculate coverage in fixed-size windows
let bam = read_bam("aligned.bam")

let window_size = 10000
let coverage = range(0, 249250621, window_size)
  |> map(|start| {
    let end = start + window_size
    let depths = bam
      |> filter(|r| { r.is_mapped && r.chrom == "chr1" && r.pos >= start && r.pos < end })
      |> map(|r| r.mapq)
      |> collect
    let n = depths |> len
    {
      chrom: "chr1",
      start: start,
      end: end,
      mean_depth: if n > 0 { depths |> mean } else { 0.0 },
      breadth: n |> float / window_size |> float
    }
  })

coverage |> write_tsv("coverage_windows.tsv")

let avg = coverage |> map(|c| c.mean_depth) |> mean
print(f"Average genome-wide depth: {avg |> round(1)}x")

Coverage at specific regions

# Check coverage over target regions (e.g., exome capture)
let bam = read_bam("aligned.bam")
let targets = read_bed("capture_targets.bed")

let region_cov = targets
  |> map(|t| {
    let depths = bam
      |> filter(|r| { r.is_mapped && r.chrom == t.chrom && r.pos >= t.start && r.pos < t.end })
      |> map(|r| r.mapq)
      |> collect
    let total = depths |> len
    let avg = if total > 0 { depths |> mean } else { 0.0 }
    let pct_above = |thresh| if total > 0 { (depths |> filter(|d| d >= thresh) |> len) |> float / total |> float } else { 0.0 }
    {
      gene: t.name,
      mean_depth: avg,
      pct_10x: pct_above(10),
      pct_20x: pct_above(20),
      pct_30x: pct_above(30)
    }
  })

# Find under-covered regions
let low_cov = region_cov |> filter(|r| { r.pct_20x < 0.90 })
print(f"Regions below 90% at 20x: {low_cov |> len}")
low_cov |> write_tsv("low_coverage_regions.tsv")

Variant Analysis

Variant density across the genome

# Calculate variant density in sliding windows
let vcf = read_vcf("variants.vcf.gz")

let density = vcf
  |> filter(|v| { v.filter == "PASS" })
  |> mutate("window_start", |v| (v.pos / 100000) * 100000)
  |> group_by("chrom")
  |> summarize(|chrom, rows| {
    chrom: chrom,
    n_variants: len(rows),
    density: len(rows) |> float / 100000.0
  })

density |> write_tsv("variant_density.tsv")

Variant type summary

let vcf = read_vcf("variants.vcf.gz")

let summary = vcf
  |> filter(|v| { v.filter == "PASS" })
  |> map(|v| {
    match {
      is_snp(v) => "SNP",
      is_indel(v) && len(v.alt) > len(v.ref) => "INS",
      is_indel(v) && len(v.alt) < len(v.ref) => "DEL",
      len(v.ref) > 1 && len(v.alt) == len(v.ref) => "MNP",
      _ => "OTHER"
    }
  })
  |> frequencies

for type, count in summary {
  print(f"{type}: {count}")
}

Heterozygosity rate

let vcf = read_vcf("sample.vcf.gz")

let genotypes = vcf
  |> filter(|v| { v.filter == "PASS" && is_snp(v) })
  |> map(|v| v.genotypes |> first)

let het_count = genotypes |> filter(|g| { g == "0/1" || g == "0|1" }) |> len
let total = genotypes |> len
let het_rate = het_count |> float / total |> float

print(f"Heterozygosity rate: {het_rate |> round(4)}")

FASTA Analysis

Assembly statistics

let contigs = read_fasta("data/sequences.fasta")
  |> map(|r| len(r.seq))
  |> sort_by(|a, b| b - a)
  |> collect

let total = contigs |> sum
let longest = contigs |> first
let n_contigs = contigs |> len

print("Assembly Statistics:")
print(f"  Contigs:    {n_contigs}")
print(f"  Total bp:   {total |> str}")
print(f"  Longest:    {longest |> str}")
print(f"  GC content: {read_fasta("data/sequences.fasta") |> map(|r| gc_content(r.seq)) |> mean |> round(3)}")

Find motifs in a genome

# Search for a restriction enzyme site (EcoRI: GAATTC)
let genome = read_fasta("data/sequences.fasta")

for record in genome {
  let sites = find_motif(record.seq, "GAATTC")
  if sites |> len > 0 {
    print(f"{record.id}: {sites |> len} EcoRI sites")
    sites |> take(5) |> each(|pos| {
      print(f"  Position: {pos}")
    })
  }
}

GFF/GTF Annotation

Gene counts per chromosome

let gff = read_gff("annotations.gff3")

gff
  |> filter(|f| f.type == "gene")
  |> map(|f| f.seqid)
  |> frequencies
  |> each(|chrom, n| print(f"{chrom}: {n} genes"))

Exon size distribution

let gff = read_gff("annotations.gff3")

let exon_sizes = gff
  |> filter(|f| f.type == "exon")
  |> map(|f| f.end - f.start + 1)

print(f"Exon count:   {exon_sizes |> len}")
print(f"Mean size:    {exon_sizes |> mean |> round(1)} bp")
print(f"Median size:  {exon_sizes |> median} bp")
print(f"Max size:     {exon_sizes |> max} bp")