Text Processing
Twelve functions for pattern matching, text manipulation, file streaming, and running system commands. Designed for processing the large text-based formats common in bioinformatics.
Pattern Matching
grep
Search text for lines matching a pattern (string or regex). Returns a list of matching lines. An optional flags argument supports "i" for case-insensitive matching.
grep(pattern, text, flags?) -> list<string>
# Find all FASTA headers
let fasta = read_text("sequences.fasta")
let headers = grep("^>", fasta)
println("Found", len(headers), "sequences")
# Case-insensitive search for a gene name
let hits = grep("brca1", fasta, "i")
# Regex: find lines with chromosome coordinates
let coords = grep("chr[0-9]+:\\d+-\\d+", annotation)
grep_count
Count the number of lines matching a pattern. More efficient than len(grep(...)) for large inputs since it avoids building the result list.
grep_count(pattern, text) -> int
# Count sequences in a FASTA file
let n_seqs = grep_count("^>", read_text("reference.fa"))
println("Reference contains", n_seqs, "contigs")
# Count variant lines (skip VCF headers)
let vcf = read_text("variants.vcf")
let n_variants = grep_count("^[^#]", vcf)
println("Variants:", n_variants)
Text Manipulation
lines
Split text into a list of lines. Handles \n, \r\n, and \r line endings.
lines(text) -> list<string>
let text = "chr1\t100\t200\nchr2\t300\t400\nchr3\t500\t600"
let rows = lines(text)
println("Rows:", len(rows)) # 3
cut
Extract fields from text by delimiter, similar to Unix cut. The fields argument is a string like "1" or "1,3" (1-based).
cut(text, delimiter, fields) -> string
# Extract chrom and end columns from a BED line
let bed_line = "chr1\t1000\t2000\tpeak_1\t500"
let result = cut(bed_line, "\t", "1,3")
println(result) # "chr1\t2000"
# Extract sample names from a header
let header = "chrom,start,end,sample_A,sample_B"
let samples = cut(header, ",", "4,5")
println(samples) # "sample_A,sample_B"
paste
Join two lists element-by-element, pairing corresponding entries into delimited lines, like Unix paste. Default delimiter is tab.
paste(list1, list2, delimiter?) -> string
let genes = ["BRCA1", "TP53", "EGFR"]
let pvalues = ["0.001", "0.023", "0.045"]
let combined = paste(genes, pvalues, "\t")
# "BRCA1\t0.001\nTP53\t0.023\nEGFR\t0.045"
write_text("gene_pvalues.tsv", combined)
uniq_count
Count occurrences of each unique value in a list. Returns a record mapping each value to its count.
uniq_count(list) -> record
# Count reads per chromosome
let chroms = ["chr1", "chr1", "chr2", "chr1", "chrX", "chr2"]
let counts = uniq_count(chroms)
# {chr1: 3, chr2: 2, chrX: 1}
# Count base composition
let bases = split("ATCGATCGAA", "")
let composition = uniq_count(bases)
println(composition) # {A: 4, T: 2, C: 2, G: 2}
wc
Count lines, words, and characters in text, similar to Unix wc.
wc(text) -> record {lines, words, chars}
let stats = wc(read_text("README.md"))
println("Lines:", stats.lines)
println("Words:", stats.words)
println("Chars:", stats.chars)
# Quick record count for a TSV (minus header)
let counts = wc(read_text("results.tsv"))
println("Data rows:", counts.lines - 1)
File Streaming
tee
Write text to a file and return it unchanged, like Unix tee. Useful for saving intermediate results in a pipeline without breaking the data flow.
tee(text, path) -> string
# Save intermediate result while continuing the pipeline
let vcf_text = read_text("variants.vcf")
let filtered = grep("PASS", vcf_text)
|> join("\n")
|> tee("filtered_variants.txt")
let processed = lines(filtered)
|> map(|l| cut(l, "\t", "1,2"))
println("Processed", len(processed), "passing variants")
count_lines
Count lines in a file without loading the entire file into memory. Efficient for large files like whole-genome FASTQs.
count_lines(path) -> int
# Estimate read count in a FASTQ (4 lines per record)
let n_lines = count_lines("sample_R1.fastq")
let n_reads = n_lines / 4
println("Approximately", n_reads, "reads")
stream_lines
Lazily stream lines from a file. Returns a stream that yields one line at a time, enabling processing of files that do not fit in memory.
stream_lines(path) -> stream
# Process a multi-gigabyte FASTQ without loading it all
# (note: quality lines can also begin with "@", so this header filter is approximate)
let headers = stream_lines("large_sample.fastq")
|> filter(|l| starts_with(l, "@"))
|> take(1000)
|> collect()
println("First 1000 read IDs sampled")
# Stream a BED file and count regions per chromosome
stream_lines("peaks.bed")
|> map(|l| cut(l, "\t", "1"))
|> collect()
|> uniq_count()
|> println()
stream_concat
Concatenate two streams into one. The second stream begins yielding after the first is exhausted.
stream_concat(stream1, stream2) -> stream
# Combine paired-end FASTQ headers for a joint summary
let r1 = stream_lines("sample_R1.fastq") |> filter(|l| starts_with(l, "@"))
let r2 = stream_lines("sample_R2.fastq") |> filter(|l| starts_with(l, "@"))
let all_headers = stream_concat(r1, r2) |> collect()
println("Total reads (R1+R2):", len(all_headers))
System Commands
shell
Execute a shell command and capture its output. Returns a record with stdout, stderr, and exit_code. An optional opts record can set cwd, env, or timeout.
shell(command, opts?) -> record {stdout, stderr, exit_code}
# Run samtools flagstat
let result = shell("samtools flagstat aligned.bam")
if result.exit_code == 0 {
println(result.stdout)
} else {
println("Error:", result.stderr)
}
# Run an external tool with a working directory
let r = shell("fastqc *.fastq.gz", {cwd: "raw_data/"})
assert(r.exit_code == 0, "FastQC failed")
# Capture tool version
let ver = shell("bwa-mem2 version")
println("BWA-MEM2:", trim(ver.stdout))
Security note: shell executes arbitrary commands. Avoid passing untrusted user input directly. Use parameterised pipelines or validate inputs before calling shell.