String Operations

18 functions for string manipulation, searching, and formatting. Strings are immutable: no function modifies its argument, and every function that produces text returns a new string.

upper / lower

Convert string to uppercase or lowercase.

upper(s) -> string
lower(s) -> string
upper("atcg")   # "ATCG"
lower("BRCA1")  # "brca1"

# Normalize sequence input
let seq = "atcGATcg"
let normalized = upper(seq)   # "ATCGATCG"

trim / trim_left / trim_right

Remove whitespace from both ends of a string (trim), from the start only (trim_left), or from the end only (trim_right).

trim(s) -> string
trim_left(s) -> string
trim_right(s) -> string
trim("  ATCG  ")        # "ATCG"
trim_left("  ATCG  ")   # "ATCG  "
trim_right("  ATCG  ")  # "  ATCG"

# Clean FASTA header
let header = ">NM_001256799.3  Homo sapiens BRCA1  \n"
trim(header)   # ">NM_001256799.3  Homo sapiens BRCA1"

split

Split a string by a delimiter into a list of substrings.

split(s, delimiter) -> list
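| Parameter | Type   | Description                                      |
|-----------|--------|--------------------------------------------------|
| s         | string | String to split                                  |
| delimiter | string | Delimiter (empty string splits into characters)  |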
split("A,T,C,G", ",")     # ["A", "T", "C", "G"]
split("ATCG", "")          # ["A", "T", "C", "G"]
split("chr1:1000-2000", ":") # ["chr1", "1000-2000"]

# Parse SAM fields
let sam_line = "read1\t0\tchr1\t100\t60\t50M\t*\t0\t0\tATCG\tFFFF"
let fields = split(sam_line, "\t")
println("RNAME:", fields[2])   # RNAME: chr1

join

Join a list of strings with a separator.

join(list, separator) -> string
join(["A", "T", "C", "G"], "")      # "ATCG"
join(["chr1", "1000", "2000"], ":")   # "chr1:1000:2000"
join(["BRCA1", "TP53", "EGFR"], ", ") # "BRCA1, TP53, EGFR"

replace

Replace all occurrences of a substring.

replace(s, old, new) -> string
replace("ATCGATCG", "T", "U")   # "AUCGAUCG"  (DNA to RNA)
replace("chr1:1000-2000", "chr", "")  # "1:1000-2000"

starts_with / ends_with / contains

Test for substring presence at start, end, or anywhere.

starts_with(s, prefix) -> bool
ends_with(s, suffix) -> bool
contains(s, substring) -> bool
starts_with("NM_001256", "NM_")   # true
ends_with("sample.fastq.gz", ".gz") # true
contains("BRCA1 DNA repair", "repair") # true

# Filter FASTA headers
let headers = [">NM_001", ">XM_002", ">NM_003"]
filter(headers, |h| starts_with(h, ">NM_"))
# [">NM_001", ">NM_003"]

repeat

Repeat a string n times.

repeat(s, n) -> string
repeat("AT", 4)    # "ATATATAT"
repeat("-", 40)    # "----------------------------------------"

pad_left / pad_right

Pad a string on the left or right to a target width with a fill character. The fill argument is optional; when omitted, the string is padded with spaces.

pad_left(s, width, fill?) -> string
pad_right(s, width, fill?) -> string
pad_left("42", 5, "0")    # "00042"
pad_right("BRCA1", 10)    # "BRCA1     "

# Align output columns
let genes = ["TP53", "BRCA1", "EGFR"]
let pvals = [0.001, 0.05, 0.003]
zip(genes, pvals) |> map(|pair| {
  pad_right(pair[0], 8) + str(pair[1])
}) |> map(println)

char_at / substr / index_of

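Positional access into strings. Indices are 0-based. substr takes a start index and an optional length (omit it to read to the end of the string); index_of returns the index of the first match, or nil if the substring is not found.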

char_at(s, index) -> string
substr(s, start, length?) -> string
index_of(s, substr) -> int | nil
char_at("ATCG", 2)           # "C"
substr("ATCGATCG", 2, 4)    # "CGAT"
substr("ATCGATCG", 4)       # "ATCG"  (to end)
index_of("ATCGATCG", "GAT")  # 3
index_of("ATCGATCG", "XYZ")  # nil

# Extract codon
let seq = "ATGCGATCGTAA"
let codon_start = 3
let codon = substr(seq, codon_start, 3)   # "CGA"

format

Format a string with {} placeholders, replaced in order by the arguments, or with named {key} placeholders filled from a map.

format(template, args...) -> string
format("chr{}:{}-{}", 1, 1000, 2000)
# "chr1:1000-2000"

format("{} has {} reads ({}% mapped)", "sample1", 1500000, 95.3)
# "sample1 has 1500000 reads (95.3% mapped)"

# Named placeholders with maps
format("Gene: {name}, p-value: {pval}", {"name": "BRCA1", "pval": 0.001})
# "Gene: BRCA1, p-value: 0.001"