Edge Cases & Gotchas

Bioinformatics data is messy. This page covers common pitfalls, edge cases, and how BioLang handles them. Understanding these will save you hours of debugging when working with real-world data.

Empty Sequences

Zero-length sequences in FASTA

# Some FASTA files contain empty sequences. BioLang reads them as empty strings.
let records = read_fasta("data/sequences.fasta")

# This will NOT crash, but may give unexpected results
let lengths = records |> map(|r| { r.seq |> len })
print(f"Min length: {lengths |> min}")  # Could be 0

# Always filter empty sequences before analysis
let valid = records |> filter(|r| { r.seq |> len > 0 })
print(f"Sequences with data: {valid |> len}")

# GC content of an empty sequence returns nil, not 0
let empty = dna""
print(empty |> gc_content)  # nil
print(empty |> gc_content ?? 0.0)  # 0.0 (with default)

Empty FASTQ records

# FASTQ files with zero-length reads (can happen after trimming)
let reads = read_fastq("data/reads.fastq")

# mean_phred on an empty quality string returns nil
let safe_reads = reads |> filter(|r| { r.seq |> len > 0 })

# Wrong: will crash on empty reads
# reads |> map(|r| { mean_phred(r.quality) }) |> mean

# Right: filter out empty reads first, or drop nil results before averaging (shown here)
let avg_qual = reads
  |> map(|r| { mean_phred(r.quality) })
  |> filter(|q| { q != nil })
  |> mean
print(f"Average quality: {avg_qual |> round(1)}")

Nil Handling

The nil propagation trap

# nil propagates through pipes silently
let x = nil
let result = x |> to_string |> len  # result is nil, not an error

# This means bugs can hide in long pipe chains
let data = read_csv("data/expression.csv")
let values = data |> map(|row| {
  # If "score" column is missing or empty, float returns nil
  row["score"] |> float
})

# sum/mean skip nil values by default
print(values |> sum)    # Skips nils
print(values |> mean)   # Skips nils
print(values |> len)    # Counts ALL entries including nil

# Count non-nil entries
print(values |> filter(|x| x != nil) |> len)  # Only non-nil entries

Nil in conditionals

# nil is falsy, but not the same as false
let x = nil

if x { print("truthy") } else { print("falsy") }  # prints "falsy"
if x == false { print("equal") } else { print("not equal") }  # prints "not equal"
if x == nil { print("is nil") }  # prints "is nil"

# Use ?? (nil coalescing) for defaults
let name = nil
print(f"Hello, {name ?? "unknown"}")  # "Hello, unknown"

# Use ?. (nil-safe access) for chained lookups
let record = { info: nil }
print(record.info?.gene)  # nil, no crash
print(record.info.gene)   # ERROR: cannot access property on nil

Large Files

Streaming vs collecting

# WRONG: loads entire file into memory
let all_reads = read_fastq("data/reads.fastq") |> collect
# This can exhaust memory for large files!

# RIGHT: stream and process without collecting
read_fastq("data/reads.fastq")
  |> filter(|r| { mean_phred(r.quality) >= 30.0 })
  |> write_fastq("filtered.fq.gz")
# Constant memory usage regardless of file size

# If you need to collect, sample first
let sample = read_fastq("data/reads.fastq") |> take(10000) |> collect
let avg_len = sample |> map(|r| { r.seq |> len }) |> mean
print(f"Estimated average read length: {avg_len}")

Memory-safe aggregation

# For large files, use read_stats for streaming aggregations
let stats = read_stats(read_fastq("data/reads.fastq"))

# read_stats computes quality and length metrics in constant memory
print(f"Total reads: {stats.total_reads}")
print(f"Mean length: {stats.mean_length |> round(1)}")
print(f"Mean quality: {stats.mean_quality |> round(1)}")

Encoding Issues

Quality score encoding

# FASTQ files use two quality encodings:
# Phred+33 (Sanger/Illumina 1.8+): ASCII 33-126
# Phred+64 (Illumina 1.3-1.7):    ASCII 64-126

# BioLang auto-detects encoding from quality score ranges
let reads_33 = read_fastq("data/reads.fastq")       # Phred+33 auto-detected
let reads_64 = read_fastq("data/reads.fastq") # Phred+64 auto-detected

# Check what encoding was detected
let reads = read_fastq("data/reads.fastq")
print("Checking first read quality range for encoding detection")

# Common gotcha: mixing encodings in merged files
# BioLang will warn if quality values seem inconsistent
let mixed = read_fastq("data/reads.fastq")  # Warning: inconsistent quality encoding detected

Sequence characters

# DNA sequences may contain ambiguity codes (IUPAC)
let seq = dna"ATCGNNRYSWKM"

# Standard functions handle ambiguity codes
print(seq |> len)              # 12
print(seq |> gc_content)       # Counts only definite G/C bases
print(contains(to_string(seq), "N"))    # true

# Reverse complement preserves ambiguity codes
print(seq |> reverse_complement)  # KMWSRYNNCGAT

# But k-mer counting may give unexpected results
let kmers = seq |> kmers(3) |> frequencies
# K-mers containing N are valid k-mers in BioLang
print(kmers)  # includes "ATC", "TCG", "CGN", "GNN", etc.

# Filter out k-mers containing N if needed (other IUPAC codes such as R/Y/S/W/K/M need their own checks)
let clean_kmers = seq |> kmers(3) |> filter(|k| { !(k |> contains("N")) })

Numeric Precision

Floating point comparison

# Classic floating point trap
let a = 0.1 + 0.2
print(a == 0.3)          # false!
print(a)                 # 0.30000000000000004

# Use abs() for floating point comparison
print(abs(a - 0.3) < 1e-10)  # true

# This matters for p-value filtering
let pval = 0.05
let computed = 0.01 + 0.04
# WRONG: filter(|v| { v.pval == 0.05 })
# RIGHT: filter(|v| { v.pval <= 0.05 })

Integer overflow

# Genome coordinates can be large
let pos = 2147483647  # Max 32-bit int

# BioLang uses 64-bit integers by default, so this is fine:
let big_pos = pos + 1000
print(big_pos)  # 2147484647

# But be careful with multiplication
let genome_size = 3_000_000_000
let coverage = 30
let total_bases = genome_size * coverage  # 90 billion -- fits in i64
print(f"Total bases: {total_bases}")

File Format Gotchas

VCF multi-allelic sites

# Multi-allelic VCF records have comma-separated ALT alleles
let vcf = read_vcf("variants.vcf")

for v in vcf |> take(5) {
  if v.alt |> contains(",") {
    print(f"Multi-allelic: {v.chrom}:{v.pos} {v.ref} => {v.alt}")
    # Split ALT on comma to get individual alleles
    for allele in v.alt |> split(",") {
      print(f"  Allele: {allele}")
    }
  }
}

# Common mistake: treating ALT as a single string
# WRONG: filter(|v| { v.alt == "A" })
# RIGHT: filter(|v| { v.alt |> split(",") |> contains("A") })

BED coordinate system

# BED is 0-based, half-open: [start, end)
# VCF/GFF are 1-based, closed: [start, end]
# This off-by-one difference causes many bugs

let bed_start = 100   # First base is position 100
let bed_end = 200     # Last base is position 199
let bed_length = bed_end - bed_start  # 100 bases

# Converting BED to 1-based:
let one_based_start = bed_start + 1  # 101
let one_based_end = bed_end          # 200

# BioLang's interval functions are coordinate-system aware
let bed_region = interval("chr1", 100, 200)    # 0-based
let vcf_region = interval("chr1", 101, 200)    # 1-based
let tree = interval_tree([bed_region])
print(query_overlaps(tree, "chr1", 101, 200) |> len > 0)  # true

Windows line endings

# Files from Windows have \r\n line endings
# BioLang handles this automatically for standard formats (FASTQ, VCF, etc.)

# But for plain text, trailing \r can cause problems
let lines = read_text("windows_file.txt") |> split("\n")
let first = lines |> first
print(first |> ends_with("gene"))      # might be false!
print(first |> trim |> ends_with("gene"))  # true

# read_lines automatically strips \r, but read_text does not
let raw = read_text("windows_file.txt")
let clean = raw |> replace("\r\n", "\n")