Regex

Six functions for regular expression matching, searching, and replacement. Patterns use Rust/PCRE-compatible syntax.

regex_match

Test if the entire string matches a pattern.

regex_match(s, pattern) -> bool
regex_match("ATCGATCG", "^[ATCG]+$")   # true
regex_match("ATCXATCG", "^[ATCG]+$")   # false (X is not ATCG)
regex_match("NM_001256", "^NM_\\d+$")  # true

# Validate FASTA header
let header = ">sp|P38398|BRCA1_HUMAN"
regex_match(header, "^>\\w+\\|\\w+\\|\\w+$")   # true

regex_find

Find the first match of a pattern in a string. Returns nil if there is no match.

regex_find(s, pattern) -> string | nil
regex_find("Gene: BRCA1, Variant: c.5266dupC", "\\bc\\.[\\w]+")
# "c.5266dupC"

regex_find("chr1:12345-67890", "\\d+-\\d+")   # "12345-67890"
regex_find("no numbers here", "\\d+")          # nil

regex_find_all

Find all non-overlapping matches.

regex_find_all(s, pattern) -> list
regex_find_all("ATG...TAA...ATG...TGA", "ATG")
# ["ATG", "ATG"]

# Extract all gene symbols from text
let text = "Mutations in BRCA1, TP53, and KRAS are common in cancer"
regex_find_all(text, "\\b[A-Z][A-Z0-9]{2,}\\b")
# ["BRCA1", "TP53", "KRAS"]

# Find all start codons
let seq = "NNNATGNNATGNNTAANNNATGNN"
regex_find_all(seq, "ATG")   # ["ATG", "ATG", "ATG"]

regex_replace

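Replace all matches of a pattern. The replacement is either a string, which may reference capture groups ($1, $2), or a function that receives the matched text and returns its replacement.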

regex_replace(s, pattern, replacement) -> string
regex_replace("chr1:1000-2000", "(\\d+)-(\\d+)", "$1..$2")
# "chr1:1000..2000"

# Clean FASTQ headers
let header = "@SRR1234567.1 length=150"
regex_replace(header, "\\s+length=\\d+", "")   # "@SRR1234567.1"

# Mask low-complexity regions
let seq = "ATCGAAAAAAAATCG"
regex_replace(seq, "A{4,}", |m| repeat("N", len(m)))
# "ATCGNNNNNNNNTCG"

regex_split

Split a string by a regex pattern.

regex_split(s, pattern) -> list
regex_split("gene1  gene2\tgene3", "\\s+")
# ["gene1", "gene2", "gene3"]

# Split CIGAR string into operations
regex_split("50M2I30M1D20M", "(?<=[MIDNSHP=X])(?=\\d)")
# ["50M", "2I", "30M", "1D", "20M"]

regex_captures

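Extract named or positional capture groups from the first match. Returns a list in which element 0 is the entire match and the following elements are the capture groups in order (named groups are also accessed by position), or nil if there is no match.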

regex_captures(s, pattern) -> list | nil
let loc = "chr7:55249071-55249171"
let caps = regex_captures(loc, "(chr\\w+):(\\d+)-(\\d+)")
# ["chr7:55249071-55249171", "chr7", "55249071", "55249171"]
let chrom = caps[1]   # "chr7"
let start = int(caps[2])   # 55249071

# Named captures
let header = ">sp|P38398|BRCA1_HUMAN DNA repair"
caps = regex_captures(header, ">(?P<db>\\w+)\\|(?P<id>\\w+)\\|(?P<name>\\w+)")
# Access by index: caps[1] = "sp", caps[2] = "P38398"

# Parse VCF INFO field
let info = "DP=45;AF=0.32;MQ=60"
regex_find_all(info, "(\\w+)=([^;]+)")
  |> map(|m| regex_captures(m, "(\\w+)=([^;]+)"))
  |> map(|c| [c[1], c[2]])
# [["DP", "45"], ["AF", "0.32"], ["MQ", "60"]]