Regex
Six functions for regular-expression matching, searching, replacement, splitting, and capture-group extraction. Patterns use Rust/PCRE-compatible syntax.
regex_match
Test if the entire string matches a pattern.
regex_match(s, pattern) -> bool
regex_match("ATCGATCG", "^[ATCG]+$") # true
regex_match("ATCXATCG", "^[ATCG]+$") # false (X is not ATCG)
regex_match("NM_001256", "^NM_\\d+$") # true
# Validate FASTA header
let header = ">sp|P38398|BRCA1_HUMAN"
regex_match(header, "^>\\w+\\|\\w+\\|\\w+$") # true
regex_find
Find the first match of a pattern in a string. Returns nil if there is no match.
regex_find(s, pattern) -> string | nil
regex_find("Gene: BRCA1, Variant: c.5266dupC", "\\bc\\.[\\w]+")
# "c.5266dupC"
regex_find("chr1:12345-67890", "\\d+-\\d+") # "12345-67890"
regex_find("no numbers here", "\\d+") # nil
regex_find_all
Find all non-overlapping matches.
regex_find_all(s, pattern) -> list
regex_find_all("ATG...TAA...ATG...TGA", "ATG")
# ["ATG", "ATG"]
# Extract all gene symbols from text
let text = "Mutations in BRCA1, TP53, and KRAS are common in cancer"
regex_find_all(text, "\\b[A-Z][A-Z0-9]{2,}\\b")
# ["BRCA1", "TP53", "KRAS"]
# Find all start codons
let seq = "NNNATGNNATGNNTAANNNATGNN"
regex_find_all(seq, "ATG") # ["ATG", "ATG", "ATG"]
regex_replace
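Replace all matches of a pattern. The replacement is either a string, which may reference capture groups ($1, $2), or a function that receives the matched text and returns its replacement (see the masking example below).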
regex_replace(s, pattern, replacement) -> string
regex_replace("chr1:1000-2000", "(\\d+)-(\\d+)", "$1..$2")
# "chr1:1000..2000"
# Clean FASTQ headers
let header = "@SRR1234567.1 length=150"
regex_replace(header, "\\s+length=\\d+", "") # "@SRR1234567.1"
# Mask low-complexity regions
let seq = "ATCGAAAAAAAATCG"
regex_replace(seq, "A{4,}", |m| repeat("N", len(m)))
# "ATCGNNNNNNNNTCG"
regex_split
Split a string by a regex pattern.
regex_split(s, pattern) -> list
regex_split("gene1 gene2\tgene3", "\\s+")
# ["gene1", "gene2", "gene3"]
# Split CIGAR string into operations (zero-width split; requires lookbehind/lookahead support)
regex_split("50M2I30M1D20M", "(?<=[MIDNSHP=X])(?=\\d)")
# ["50M", "2I", "30M", "1D", "20M"]
regex_captures
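Extract named or positional capture groups from the first match. Returns a list whose element 0 is the full match, followed by each group in pattern order (named groups are also accessed by position), or nil if there is no match.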
regex_captures(s, pattern) -> list | nil
let loc = "chr7:55249071-55249171"
let caps = regex_captures(loc, "(chr\\w+):(\\d+)-(\\d+)")
# ["chr7:55249071-55249171", "chr7", "55249071", "55249171"]
let chrom = caps[1] # "chr7"
let start = int(caps[2]) # 55249071
# Named captures
let header = ">sp|P38398|BRCA1_HUMAN DNA repair"
caps = regex_captures(header, ">(?P<db>\\w+)\\|(?P<id>\\w+)\\|(?P<name>\\w+)")
# Access by index: caps[1] = "sp", caps[2] = "P38398"
# Parse VCF INFO field
let info = "DP=45;AF=0.32;MQ=60"
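# Use [^;]+ for the value: \S+ is greedy and would run past the ';' separators
regex_find_all(info, "(\\w+)=([^;]+)")
|> map(|m| regex_captures(m, "(\\w+)=([^;]+)"))
|> map(|c| [c[1], c[2]])
# [["DP", "45"], ["AF", "0.32"], ["MQ", "60"]]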