Proteomics
BioLang provides tools for protein sequence analysis, mass spectrometry data processing, and structural feature extraction. These examples cover common proteomics workflows from basic sequence properties to peptide identification.
Protein Properties
Molecular weight and amino acid composition
# Amino acid average masses (monoisotopic, Da)
let aa_mass = {
"A": 71.04, "R": 156.10, "N": 114.04, "D": 115.03, "C": 103.01,
"E": 129.04, "Q": 128.06, "G": 57.02, "H": 137.06, "I": 113.08,
"L": 113.08, "K": 128.09, "M": 131.04, "F": 147.07, "P": 97.05,
"S": 87.03, "T": 101.05, "W": 186.08, "Y": 163.06, "V": 99.07
}
let prot = protein"MKWVTFISLLFLFSSAYSRGVFRR"
let residues = to_string(prot) |> split("")
# Calculate molecular weight
let mw = residues |> map(|aa| { aa_mass[aa] }) |> sum
let mw = mw + 18.02 # add water
print(f"Molecular weight: {mw |> round(2)} Da")
print(f"Sequence length: {prot |> seq_len} aa")
Amino acid composition
let prot = protein"MKWVTFISLLFLFSSAYSRGVFRRDAHKSEVAHRFK"
let residues = to_string(prot) |> split("")
let total = residues |> len
# Count each amino acid using frequencies()
let composition = residues |> frequencies
|> sort_by(|a, b| b.value - a.value)
for entry in composition {
let pct = entry.value |> float / total |> float * 100.0
print(f"{entry.key}: {entry.value} ({pct |> round(1)}%)")
}
# Count hydrophobic residues (A, V, I, L, M, F, W, P)
let hydrophobic_aa = ["A", "V", "I", "L", "M", "F", "W", "P"]
let hydrophobic = residues |> filter(|aa| { hydrophobic_aa |> contains(aa) }) |> len
print(f"\nHydrophobic residues: {hydrophobic}/{total}")
Extinction coefficient at 280nm
# Extinction coefficient from Trp, Tyr, and Cys counts
# E280 = nW * 5500 + nY * 1490 + nC_pairs * 125
let seq = protein"MKWVTFISLLFLFSSAYSRGVFRRDAHKSEVAHRFKDLGE"
let residues = to_string(seq) |> split("")
let counts = residues |> frequencies
let n_trp = counts |> find(|x| { x.key == "W" }) |> map(|x| x.value) |> first
let n_tyr = counts |> find(|x| { x.key == "Y" }) |> map(|x| x.value) |> first
let n_cys = counts |> find(|x| { x.key == "C" }) |> map(|x| x.value) |> first
let ext_coeff = (n_trp * 5500) + (n_tyr * 1490) + ((n_cys / 2) * 125)
print(f"Extinction coefficient: {ext_coeff} M^-1 cm^-1")
print(f" Trp: {n_trp}, Tyr: {n_tyr}, Cys: {n_cys}")
Protein Domain Search
Motif scanning
# Search for common protein motifs
let records = read_fasta("protein.fa")
let prot = records |> first
# N-glycosylation sites: N-X-S/T (where X is not P)
let seq_str = to_string(prot.seq)
let nglyc_sites = seq_str |> regex_find_all("N[^P][ST]")
print(f"N-glycosylation sites: {nglyc_sites |> len}")
for site in nglyc_sites {
print(f" Match: {site}")
}
# Phosphorylation sites (simplified: S/T followed by P)
let phos_sites = seq_str |> regex_find_all("[ST]P")
print(f"Potential phosphorylation sites: {phos_sites |> len}")
# N-terminal hydrophobicity check
let hydrophobic_aa = ["A", "V", "I", "L", "M", "F", "W", "P"]
let signal = substr(prot.seq, 0, 30) |> to_string |> split("")
let n_hydro = signal |> filter(|aa| { hydrophobic_aa |> contains(aa) }) |> len
let hydro_frac = n_hydro |> float / (signal |> len) |> float
print(f"N-terminal hydrophobic fraction: {hydro_frac |> round(3)} (>0.5 suggests signal peptide)")
Domain annotation from Pfam hits
# Parse Pfam domain hits
let domains = tsv("pfam_results.tsv")
|> filter(|d| { d["e_value"] |> float < 1e-5 })
|> sort_by(|a, b| a.start - b.start)
print("Significant domains found:")
for d in domains {
print(f" {d["domain_name"]} ({d["start"]}-{d["end"]}) E-value: {d["e_value"]}")
}
# Check for domain architecture
let has_kinase = domains |> any(|d| { d["domain_name"] |> contains("Kinase") })
let has_sh2 = domains |> any(|d| { d["domain_name"] |> contains("SH2") })
if has_kinase && has_sh2 {
print("Receptor tyrosine kinase-like architecture detected")
}
Mass Spectrometry
In-silico tryptic digest
# Trypsin cleaves after K and R (except when followed by P)
let prot = protein"MKWVTFISLLFLFSSAYSRGVFRRDAHKSEVAHRFKDLGE"
let seq_str = to_string(prot)
# Split at K or R not followed by P using regex
let peptides = seq_str |> regex_split("[KR](?!P)")
|> filter(|p| { p |> len > 0 })
# Amino acid masses for computing peptide mass
let aa_mass = {
"A": 71.04, "R": 156.10, "N": 114.04, "D": 115.03, "C": 103.01,
"E": 129.04, "Q": 128.06, "G": 57.02, "H": 137.06, "I": 113.08,
"L": 113.08, "K": 128.09, "M": 131.04, "F": 147.07, "P": 97.05,
"S": 87.03, "T": 101.05, "W": 186.08, "Y": 163.06, "V": 99.07
}
fn peptide_mass(pep) {
let residues = pep |> split("")
let mass = residues |> map(|aa| { aa_mass[aa] }) |> sum
mass + 18.02 # add water
}
print("Tryptic peptides:")
for pep in peptides |> sort_by(|a, b| len(a) - len(b)) {
let mass = peptide_mass(pep)
let mz = mass + 1.00728 # [M+H]+
print(f" {pep} (m/z: {mz |> round(4)}, length: {pep |> len})")
}
Peptide mass fingerprint
# Generate theoretical peptide masses for database matching
let aa_mass = {
"A": 71.04, "R": 156.10, "N": 114.04, "D": 115.03, "C": 103.01,
"E": 129.04, "Q": 128.06, "G": 57.02, "H": 137.06, "I": 113.08,
"L": 113.08, "K": 128.09, "M": 131.04, "F": 147.07, "P": 97.05,
"S": 87.03, "T": 101.05, "W": 186.08, "Y": 163.06, "V": 99.07
}
fn peptide_mass(pep) {
pep |> split("") |> map(|aa| { aa_mass[aa] }) |> sum |> round(4)
}
let proteins = read_fasta("proteome.fa")
let mass_db = proteins
|> flat_map(|p| {
let peps = to_string(p.seq) |> regex_split("[KR](?!P)")
|> filter(|pep| { pep |> len >= 6 && pep |> len <= 30 })
peps |> map(|pep| { {
protein: p.id,
peptide: pep,
mass: peptide_mass(pep)
}})
})
|> sort_by(|a, b| a.mass - b.mass)
mass_db |> write_tsv("theoretical_masses.tsv")
print(f"Generated {mass_db |> len} theoretical peptide masses")
Matching observed masses to theoretical
# Match observed MS peaks to theoretical peptide masses
let observed = tsv("observed_peaks.tsv")
|> map(|row| { row["mz"] |> float })
let theoretical = tsv("theoretical_masses.tsv")
let tolerance_ppm = 10.0
let matches = observed |> flat_map(|obs_mz| {
theoretical
|> filter(|t| {
let ppm = ((obs_mz - t["mass"] |> float) / (t["mass"] |> float) * 1e6) |> abs
ppm < tolerance_ppm
})
|> map(|t| { {
observed_mz: obs_mz,
theoretical_mass: t["mass"],
protein: t["protein"],
peptide: t["peptide"],
ppm_error: ((obs_mz - t["mass"] |> float) / (t["mass"] |> float) * 1e6)
}})
})
print(f"Matched {matches |> len} peptides to {matches |> map(|x| x.protein) |> unique |> len} proteins")
matches |> write_tsv("peptide_matches.tsv")
Protein Comparison
Pairwise k-mer similarity
# Compare protein sequences using k-mer overlap
let proteins = read_fasta("family.fa") |> collect
for i in 0..(proteins |> len) {
for j in (i + 1)..(proteins |> len) {
let k1 = proteins[i].seq |> kmers(3) |> set
let k2 = proteins[j].seq |> kmers(3) |> set
let shared = intersection(k1, k2)
let similarity = len(shared) |> float / min(len(k1), len(k2)) |> float
print(f"{proteins[i].id} vs {proteins[j].id}: {(similarity * 100.0) |> round(1)}% k-mer similarity")
}
}
Multiple sequence alignment conservation
# Analyze per-column conservation from a pre-aligned FASTA
let seqs = read_fasta("aligned_proteins.fa") |> collect
let aligned = seqs |> map(|s| { to_string(s.seq) |> split("") })
let n_seqs = aligned |> len
let n_cols = aligned |> first |> len
# Compute conservation score per column (fraction of most common residue)
let conservation = range(0, n_cols) |> map(|col| {
let column = aligned |> map(|s| { s[col] })
let freqs = column |> filter(|aa| { aa != "-" }) |> frequencies
let max_count = freqs |> map(|f| { f.value }) |> max
let non_gap = column |> filter(|aa| { aa != "-" }) |> len
if non_gap > 0 { max_count |> float / non_gap |> float } else { 0.0 }
})
let n_conserved = conservation |> filter(|c| { c >= 0.9 }) |> len
let n_variable = conservation |> filter(|c| { c < 0.5 }) |> len
print(f"MSA columns: {n_cols}")
print(f"Highly conserved (>=90%): {n_conserved}")
print(f"Variable (<50%): {n_variable}")
print(f"Overall conservation: {conservation |> mean |> round(3)}")