Text Processing

12 functions for pattern matching, text manipulation, file streaming, and running system commands. Designed for processing the large text-based formats common in bioinformatics.

Pattern Matching

grep

Search text for lines matching a pattern (string or regex). Returns a list of matching lines. An optional flags argument supports "i" for case-insensitive matching.

grep(pattern, text, flags?) -> list<string>
# Find all FASTA headers
let fasta = read_text("sequences.fasta")
let headers = grep("^>", fasta)
println("Found", len(headers), "sequences")

# Case-insensitive search for a gene name
let hits = grep("brca1", fasta, "i")

# Regex: find lines with chromosome coordinates
let coords = grep("chr[0-9]+:\\d+-\\d+", annotation)

grep_count

Count the number of lines matching a pattern. More efficient than len(grep(...)) for large inputs since it avoids building the result list.

grep_count(pattern, text) -> int
# Count sequences in a FASTA file
let n_seqs = grep_count("^>", read_text("reference.fa"))
println("Reference contains", n_seqs, "contigs")

# Count variant lines (skip VCF headers)
let vcf = read_text("variants.vcf")
let n_variants = grep_count("^[^#]", vcf)
println("Variants:", n_variants)

Text Manipulation

lines

Split text into a list of lines. Handles \n, \r\n, and \r line endings.

lines(text) -> list<string>
let text = "chr1\t100\t200\nchr2\t300\t400\nchr3\t500\t600"
let rows = lines(text)
println("Rows:", len(rows))   # 3

cut

Extract fields from text by delimiter, similar to Unix cut. The fields argument is a string like "1" or "1,3" (1-based).

cut(text, delimiter, fields) -> string
# Extract chrom and end columns from a BED line
let bed_line = "chr1\t1000\t2000\tpeak_1\t500"
let result = cut(bed_line, "\t", "1,3")
println(result)   # "chr1\t2000"

# Extract sample names from a header
let header = "chrom,start,end,sample_A,sample_B"
let samples = cut(header, ",", "4,5")
println(samples)  # "sample_A,sample_B"

paste

Join two lists element-wise, pairing corresponding items into delimited lines, like Unix paste. Default delimiter is tab.

paste(list1, list2, delimiter?) -> string
let genes = ["BRCA1", "TP53", "EGFR"]
let pvalues = ["0.001", "0.023", "0.045"]
let combined = paste(genes, pvalues, "\t")
# "BRCA1\t0.001\nTP53\t0.023\nEGFR\t0.045"
write_text("gene_pvalues.tsv", combined)

uniq_count

Count occurrences of each unique value in a list. Returns a record mapping each value to its count.

uniq_count(list) -> record
# Count reads per chromosome
let chroms = ["chr1", "chr1", "chr2", "chr1", "chrX", "chr2"]
let counts = uniq_count(chroms)
# {chr1: 3, chr2: 2, chrX: 1}

# Count base composition
let bases = split("ATCGATCGAA", "")
let composition = uniq_count(bases)
println(composition)  # {A: 4, T: 2, C: 2, G: 2}

wc

Count lines, words, and characters in text, similar to Unix wc.

wc(text) -> record {lines, words, chars}
let stats = wc(read_text("README.md"))
println("Lines:", stats.lines)
println("Words:", stats.words)
println("Chars:", stats.chars)

# Quick record count for a TSV (minus header)
let counts = wc(read_text("results.tsv"))
println("Data rows:", counts.lines - 1)

File Streaming

tee

Write text to a file and return it unchanged, like Unix tee. Useful for saving intermediate results in a pipeline without breaking the data flow.

tee(text, path) -> string
# Save intermediate result while continuing the pipeline
let vcf_text = read_text("variants.vcf")
let filtered = grep("\tPASS\t", vcf_text)   # anchor on the FILTER column; a bare "PASS" also matches header lines like ##FILTER=<ID=PASS,...>
  |> join("\n")
  |> tee("filtered_variants.txt")
let processed = lines(filtered)
  |> map(|l| cut(l, "\t", "1,2"))
println("Processed", len(processed), "passing variants")

count_lines

Count lines in a file without loading the entire file into memory. Efficient for large files like whole-genome FASTQs.

count_lines(path) -> int
# Estimate read count in a FASTQ (4 lines per record)
let n_lines = count_lines("sample_R1.fastq")
let n_reads = n_lines / 4
println("Approximately", n_reads, "reads")

stream_lines

Lazily stream lines from a file. Returns a stream that yields one line at a time, enabling processing of files that do not fit in memory.

stream_lines(path) -> stream
# Process a multi-gigabyte FASTQ without loading it all
# (caveat: quality lines can also begin with "@", so this over-selects;
# for exact headers, take every 4th line of the file instead)
let headers = stream_lines("large_sample.fastq")
  |> filter(|l| starts_with(l, "@"))
  |> take(1000)
  |> collect()
println("First 1000 read IDs sampled")

# Stream a BED file and count regions per chromosome
stream_lines("peaks.bed")
  |> map(|l| cut(l, "\t", "1"))
  |> collect()
  |> uniq_count()
  |> println()

stream_concat

Concatenate two streams into one. The second stream begins yielding after the first is exhausted.

stream_concat(stream1, stream2) -> stream
# Combine paired-end FASTQ headers for a joint summary
# (the "@" filter is approximate: FASTQ quality lines may also start with "@")
let r1 = stream_lines("sample_R1.fastq") |> filter(|l| starts_with(l, "@"))
let r2 = stream_lines("sample_R2.fastq") |> filter(|l| starts_with(l, "@"))
let all_headers = stream_concat(r1, r2) |> collect()
println("Total reads (R1+R2):", len(all_headers))

System Commands

shell

Execute a shell command and capture its output. Returns a record with stdout, stderr, and exit_code. An optional opts record can set cwd, env, or timeout.

shell(command, opts?) -> record {stdout, stderr, exit_code}
# Run samtools flagstat
let result = shell("samtools flagstat aligned.bam")
if result.exit_code == 0 {
  println(result.stdout)
} else {
  println("Error:", result.stderr)
}

# Run an external tool with a working directory
let r = shell("fastqc *.fastq.gz", {cwd: "raw_data/"})
assert(r.exit_code == 0, "FastQC failed")

# Capture tool version
let ver = shell("bwa-mem2 version")
println("BWA-MEM2:", trim(ver.stdout))

Security note: shell executes arbitrary commands. Avoid passing untrusted user input directly. Use parameterised pipelines or validate inputs before calling shell.