Performance Tips

BioLang is designed for performance on bioinformatics workloads. These tips cover streaming, parallelism, memory management, and profiling to help you get the most out of your scripts.

Streaming vs Loading

Understand lazy evaluation

# BioLang pipe chains are lazy -- nothing happens until consumption
let pipeline = read_fastq("data/reads.fastq")
  |> filter(|r| mean_phred(r.quality) >= 30.0)
  |> map(|r| len(r.seq))
# No data has been read yet!

# Consumption happens with terminal operations:
let avg = pipeline |> mean        # Reads file, streams, computes
# OR
pipeline |> collect           # Materializes into a list

Avoid unnecessary collect

# BAD: collects millions of records into memory
let all_reads = read_fastq("data/reads.fastq") |> collect
let filtered = all_reads |> filter(|r| mean_phred(r.quality) >= 30.0)
write_fastq(filtered, "out.fq.gz")

# GOOD: streams everything, constant memory
read_fastq("data/reads.fastq")
  |> filter(|r| mean_phred(r.quality) >= 30.0)
  |> write_fastq("out.fq.gz")

# When you MUST collect, limit the data first
let sample_reads = read_fastq("data/reads.fastq") |> take(100000) |> collect

Streaming aggregations

# Simple approach: collect per-read lengths into memory, then compute stats
let lengths = read_fastq("data/reads.fastq")
  |> map(|r| len(r.seq))
  |> collect

print("Read count: #{len(lengths)}")
print("Mean length: #{round(mean(lengths), 1)}")
print("Min: #{min(lengths)}, Max: #{max(lengths)}")

# Multiple aggregations in a single pass using reduce
let result = read_fastq("data/reads.fastq")
  |> reduce({ total: 0, gc_sum: 0.0, n: 0 }, |acc, r| {
    {
      total: acc.total + len(r.seq),
      gc_sum: acc.gc_sum + (gc_content(r.seq) ?? 0.0),
      n: acc.n + 1
    }
  })

print("Total bases: #{result.total}")
print("Mean GC: #{round(result.gc_sum / result.n, 3)}")

Parallel Processing

par_map for CPU-bound work

# par_map distributes work across CPU cores
let proteins = read_fasta("data/sequences.fasta") |> collect

# Sequential (slow for large datasets)
let results_seq = map(proteins, |p| len(p.seq))

# Parallel (uses all available cores)
let results_par = par_map(proteins, |p| len(p.seq))

# Control parallelism by passing an explicit worker count
let results_4 = par_map(proteins, |p| len(p.seq), 4)

Parallel file processing

# Process multiple files in parallel
let fastq_files = glob("samples/*_R1.fq.gz")

let stats = par_map(fastq_files, |fname| {
  # glob returns full paths, so read fname directly; strip both the
  # directory prefix and the read suffix to get the sample name
  let sample = replace(replace(fname, "samples/", ""), "_R1.fq.gz", "")
  let reads = read_fastq(fname) |> collect   # collect once, reuse below
  let lengths = map(reads, |r| len(r.seq))
  {
    sample: sample,
    n_reads: len(reads),
    mean_length: round(mean(lengths), 1),
    total_bases: sum(lengths)
  }
})

write_tsv(from_records(stats), "sample_stats.tsv")
print("Processed #{len(stats)} samples")

Chunked parallel processing

# Process a large file in chunks for controlled parallelism
read_fastq("data/reads.fastq")
  |> chunk(10000)
  |> par_map(|ch| {
    # Each chunk processed independently
    ch
      |> filter(|r| mean_phred(r.quality) >= 30.0)
      |> collect
  })
  |> flatten
  |> write_fastq("processed.fq.gz")

Memory Management

Working with large genomes

# Reading a whole genome into memory is sometimes necessary
# but be aware of the cost (~3 GB for human genome)

# Option 1: Stream and process without full load
read_fasta("data/sequences.fasta")
  |> each(|record| {
    let gc = gc_content(record.seq)
    print("#{record.id}: GC=#{round(gc, 3)}")
  })

# Option 2: Load only what you need
let seqs = read_fasta("data/sequences.fasta")
  |> filter(|r| r.id == "chr1")
  |> collect

# Option 3: Process in chunks to limit memory
read_fasta("data/sequences.fasta")
  |> chunk(100)
  |> each(|batch| {
    print("Processing batch of #{len(batch)} sequences")
  })

K-mer counting at scale

kmer_count has three memory strategies depending on data size:

# Small datasets — entirely in-memory (fast)
let counts = kmer_count(dna"ATCGATCGATCG", 4)

# Large FASTQ — stream reads, auto-spill to disk if > 2M unique k-mers
read_fastq("data/reads.fastq")
  |> kmer_count(21)
  |> head(20)
  |> print()

# Bounded memory — only track top N k-mers (periodic pruning)
read_fastq("data/reads.fastq")
  |> kmer_count(21, 100)        # top 100 only
  |> bar_chart("Top 21-mers")

How auto-spill works: k-mers are counted in a HashMap. When unique entries exceed ~2M (~300 MB), BioLang transparently spills to a temporary SQLite database on disk. Counting continues at disk speed. The temp file is cleaned up automatically. The result table is always sorted by count descending — no sort_by needed.

I/O Optimization

Compressed I/O

# BioLang auto-detects compression from file extension
# .gz files are handled transparently
read_fastq("data/reads.fastq")      # Auto-decompresses
  |> write_fastq("out.fq.gz")  # Auto-compresses

# To skip compression, write to a path without a .gz extension
read_fastq("data/reads.fastq")
  |> write_fastq("out.fq")

# Then compress the output with an external tool if needed
# (zstd is typically faster with better ratios than gzip)

Buffered output

# Collect results then write in one operation for efficiency
let results = read_fasta("data/sequences.fasta")
  |> map(|record| {
    let gc = gc_content(record.seq) ?? 0.0
    { name: record.id, length: len(record.seq), gc: round(gc, 4) }
  })
  |> collect

write_tsv(from_records(results), "output.tsv")

Profiling

Timing operations

# Time a block of code using time_it (returns elapsed milliseconds)
let elapsed_ms = time_it(|| {
  read_fastq("data/reads.fastq")
    |> filter(|r| mean_phred(r.quality) >= 30.0)
    |> count
})
print("Filtering took #{elapsed_ms}ms")

# Profile individual stages
let t1 = now()
let data = read_vcf("data/variants.vcf") |> collect
let read_time = now() - t1

let t2 = now()
let filtered = data |> filter(|v| v.filter == "PASS") |> collect
let filter_time = now() - t2

print("Read: #{read_time}ms, Filter: #{filter_time}ms")

Memory usage tracking

# Track memory by monitoring data sizes
let data = read_fasta("data/sequences.fasta") |> collect
print("Loaded #{len(data)} sequences")

# For large datasets, bound memory by tracking only the top N k-mers
let top_kmers = read_fasta("data/sequences.fasta")
  |> kmer_count(21, 50)     # top 50 k-mers, bounded memory

print("Top 50 21-mers:")
print(top_kmers)

JIT Compilation

# Write performance-critical functions as plain BioLang
# The runtime JIT-compiles hot paths automatically

fn custom_quality_score(qual_string) {
  let s = 0.0
  let weight_sum = 0.0
  for i in range(0, len(qual_string)) {
    let q = float(qual_string[i])
    let weight = 1.0 / (1.0 + pow(2.71828, -0.1 * (q - 20.0)))  # logistic weight centered at Q20
    s = s + q * weight
    weight_sum = weight_sum + weight
  }
  s / weight_sum
}

# Apply to all reads
let scores = read_fastq("data/reads.fastq")
  |> map(|r| custom_quality_score(r.quality))
  |> collect