Edge Cases & Gotchas
Bioinformatics data is messy. This page covers common pitfalls, edge cases, and how BioLang handles them. Understanding these will save you hours of debugging when working with real-world data.
Empty Sequences
Zero-length sequences in FASTA
# Some FASTA files contain empty sequences. BioLang reads them as empty strings.
let records = read_fasta("data/sequences.fasta")
# This will NOT crash, but may give unexpected results
let lengths = records |> map(|r| { r.seq |> len })
print(f"Min length: {lengths |> min}") # Could be 0
# Always filter empty sequences before analysis
let valid = records |> filter(|r| { r.seq |> len > 0 })
print(f"Sequences with data: {valid |> len}")
# GC content of an empty sequence returns nil, not 0
let empty = dna""
print(empty |> gc_content) # nil
print(empty |> gc_content ?? 0.0) # 0.0 (with default)
Empty FASTQ records
# FASTQ files with zero-length reads (can happen after trimming)
let reads = read_fastq("data/reads.fastq")
# mean_phred on an empty quality string returns nil
let safe_reads = reads |> filter(|r| { r.seq |> len > 0 })
# Wrong: will crash on empty reads
# reads |> map(|r| { mean_phred(r.quality) }) |> mean
# Right: filter first or use nil-safe access
let avg_qual = reads
|> map(|r| { mean_phred(r.quality) })
|> filter(|q| { q != nil })
|> mean
print(f"Average quality: {avg_qual |> round(1)}")
Nil Handling
The nil propagation trap
# nil propagates through pipes silently
let x = nil
let result = x |> to_string |> len # result is nil, not an error
# This means bugs can hide in long pipe chains
let data = read_csv("data/expression.csv")
let values = data |> map(|row| {
# If "score" column is missing or empty, float returns nil
row["score"] |> float
})
# sum/mean skip nil values by default
print(values |> sum) # Skips nils
print(values |> mean) # Skips nils
print(values |> len) # Counts ALL entries including nil
# Count non-nil entries
print(values |> filter(|x| x != nil) |> len) # Only non-nil entries
Nil in conditionals
# nil is falsy, but not the same as false
let x = nil
if x { print("truthy") } else { print("falsy") } # prints "falsy"
if x == false { print("equal") } else { print("not equal") } # prints "not equal"
if x == nil { print("is nil") } # prints "is nil"
# Use ?? (nil coalescing) for defaults
let name = nil
print(f"Hello, {name ?? "unknown"}") # "Hello, unknown"
# Use ?. (nil-safe access) for chained lookups
let record = { info: nil }
print(record.info?.gene) # nil, no crash
print(record.info.gene) # ERROR: cannot access property on nil
Large Files
Streaming vs collecting
# WRONG: loads entire file into memory
let all_reads = read_fastq("data/reads.fastq") |> collect
# This can exhaust memory for large files!
# RIGHT: stream and process without collecting
read_fastq("data/reads.fastq")
|> filter(|r| { mean_phred(r.quality) >= 30.0 })
|> write_fastq("filtered.fq.gz")
# Constant memory usage regardless of file size
# If you need to collect, sample first
let sample = read_fastq("data/reads.fastq") |> take(10000) |> collect
let avg_len = sample |> map(|r| { r.seq |> len }) |> mean
print(f"Estimated average read length: {avg_len}")
Memory-safe aggregation
# For large files, use read_stats for streaming aggregations
let stats = read_stats(read_fastq("data/reads.fastq"))
# read_stats computes quality and length metrics in constant memory
print(f"Total reads: {stats.total_reads}")
print(f"Mean length: {stats.mean_length |> round(1)}")
print(f"Mean quality: {stats.mean_quality |> round(1)}")
Encoding Issues
Quality score encoding
# FASTQ files use two quality encodings:
# Phred+33 (Sanger/Illumina 1.8+): ASCII 33-126
# Phred+64 (Illumina 1.3-1.7): ASCII 64-126
# BioLang auto-detects encoding from quality score ranges
let reads_33 = read_fastq("data/reads.fastq") # Phred+33 auto-detected
let reads_64 = read_fastq("data/reads.fastq") # Phred+64 auto-detected
# Check what encoding was detected
let reads = read_fastq("data/reads.fastq")
print("Checking first read quality range for encoding detection")
# Common gotcha: mixing encodings in merged files
# BioLang will warn if quality values seem inconsistent
let mixed = read_fastq("data/reads.fastq") # Warning: inconsistent quality encoding detected
Sequence characters
# DNA sequences may contain ambiguity codes (IUPAC)
let seq = dna"ATCGNNRYSWKM"
# Standard functions handle ambiguity codes
print(seq |> len) # 12
print(seq |> gc_content) # Counts only definite G/C bases
print(contains(to_string(seq), "N")) # true
# Reverse complement preserves ambiguity codes
print(seq |> reverse_complement) # KMWSRYNNCGAT
# But k-mer counting may give unexpected results
let kmers = seq |> kmers(3) |> frequencies
# K-mers containing N are valid k-mers in BioLang
print(kmers) # includes "ATC", "TCG", "CGN", "GNN", etc.
# Filter out k-mers containing N if needed (other IUPAC codes such as R or Y need their own checks)
let clean_kmers = seq |> kmers(3) |> filter(|k| { !(k |> contains("N")) })
Numeric Precision
Floating point comparison
# Classic floating point trap
let a = 0.1 + 0.2
print(a == 0.3) # false!
print(a) # 0.30000000000000004
# Compare floats with a tolerance using abs() instead of ==
print(abs(a - 0.3) < 1e-10) # true
# This matters for p-value filtering
let pval = 0.05
let computed = 0.01 + 0.04
# WRONG: filter(|v| { v.pval == 0.05 })
# RIGHT: filter(|v| { v.pval <= 0.05 })
Integer overflow
# Genome coordinates can be large
let pos = 2147483647 # Max 32-bit int
# BioLang uses 64-bit integers by default, so this is fine:
let big_pos = pos + 1000
print(big_pos) # 2147484647
# But be careful with multiplication
let genome_size = 3_000_000_000
let coverage = 30
let total_bases = genome_size * coverage # 90 billion -- fits in i64
print(f"Total bases: {total_bases}")
File Format Gotchas
VCF multi-allelic sites
# Multi-allelic VCF records have comma-separated ALT alleles
let vcf = read_vcf("variants.vcf")
for v in vcf |> take(5) {
if v.alt |> contains(",") {
print(f"Multi-allelic: {v.chrom}:{v.pos} {v.ref} => {v.alt}")
# Split ALT on comma to get individual alleles
for allele in v.alt |> split(",") {
print(f" Allele: {allele}")
}
}
}
# Common mistake: treating ALT as a single string
# WRONG: filter(|v| { v.alt == "A" })
# RIGHT: filter(|v| { v.alt |> split(",") |> contains("A") })
BED coordinate system
# BED is 0-based, half-open: [start, end)
# VCF/GFF are 1-based, closed: [start, end]
# This off-by-one difference causes many bugs
let bed_start = 100 # First base is position 100
let bed_end = 200 # Last base is position 199
let bed_length = bed_end - bed_start # 100 bases
# Converting BED to 1-based:
let one_based_start = bed_start + 1 # 101
let one_based_end = bed_end # 200
# BioLang's interval functions are coordinate-system aware
let bed_region = interval("chr1", 100, 200) # 0-based
let vcf_region = interval("chr1", 101, 200) # 1-based
let tree = interval_tree([bed_region])
print(query_overlaps(tree, "chr1", 101, 200) |> len > 0) # true
Windows line endings
# Files from Windows have \r\n line endings
# BioLang handles this automatically for standard formats (FASTQ, VCF, etc.)
# But for plain text, trailing \r can cause problems
let lines = read_text("windows_file.txt") |> split("\n")
let first = lines |> first
print(first |> ends_with("gene")) # might be false!
print(first |> trim |> ends_with("gene")) # true
# read_lines automatically strips \r, but read_text does not
let raw = read_text("windows_file.txt")
let clean = raw |> replace("\r\n", "\n")