Proteomics

BioLang provides tools for protein sequence analysis, mass spectrometry data processing, and structural feature extraction. These examples cover common proteomics workflows from basic sequence properties to peptide identification.

Protein Properties

Molecular weight and amino acid composition

# Amino acid average masses (monoisotopic, Da)
let aa_mass = {
  "A": 71.04, "R": 156.10, "N": 114.04, "D": 115.03, "C": 103.01,
  "E": 129.04, "Q": 128.06, "G": 57.02, "H": 137.06, "I": 113.08,
  "L": 113.08, "K": 128.09, "M": 131.04, "F": 147.07, "P": 97.05,
  "S": 87.03, "T": 101.05, "W": 186.08, "Y": 163.06, "V": 99.07
}

let prot = protein"MKWVTFISLLFLFSSAYSRGVFRR"
let residues = to_string(prot) |> split("")

# Calculate molecular weight
let mw = residues |> map(|aa| { aa_mass[aa] }) |> sum
let mw = mw + 18.02  # add water
print(f"Molecular weight: {mw |> round(2)} Da")
print(f"Sequence length: {prot |> seq_len} aa")

Amino acid composition

let prot = protein"MKWVTFISLLFLFSSAYSRGVFRRDAHKSEVAHRFK"
let residues = to_string(prot) |> split("")
let total = residues |> len

# Count each amino acid using frequencies()
let composition = residues |> frequencies
  |> sort_by(|a, b| b.value - a.value)

for entry in composition {
  let pct = entry.value |> float / total |> float * 100.0
  print(f"{entry.key}: {entry.value} ({pct |> round(1)}%)")
}

# Count hydrophobic residues (A, V, I, L, M, F, W, P)
let hydrophobic_aa = ["A", "V", "I", "L", "M", "F", "W", "P"]
let hydrophobic = residues |> filter(|aa| { hydrophobic_aa |> contains(aa) }) |> len
print(f"\nHydrophobic residues: {hydrophobic}/{total}")

Extinction coefficient at 280nm

# Extinction coefficient from Trp, Tyr, and Cys counts
# E280 = nW * 5500 + nY * 1490 + nC_pairs * 125
let seq = protein"MKWVTFISLLFLFSSAYSRGVFRRDAHKSEVAHRFKDLGE"
let residues = to_string(seq) |> split("")
let counts = residues |> frequencies

let n_trp = counts |> find(|x| { x.key == "W" }) |> map(|x| x.value) |> first
let n_tyr = counts |> find(|x| { x.key == "Y" }) |> map(|x| x.value) |> first
let n_cys = counts |> find(|x| { x.key == "C" }) |> map(|x| x.value) |> first

let ext_coeff = (n_trp * 5500) + (n_tyr * 1490) + ((n_cys / 2) * 125)
print(f"Extinction coefficient: {ext_coeff} M^-1 cm^-1")
print(f"  Trp: {n_trp}, Tyr: {n_tyr}, Cys: {n_cys}")

Protein Domain Search

Motif scanning

# Search for common protein motifs
let records = read_fasta("protein.fa")
let prot = records |> first

# N-glycosylation sites: N-X-S/T (where X is not P)
let seq_str = to_string(prot.seq)
let nglyc_sites = seq_str |> regex_find_all("N[^P][ST]")
print(f"N-glycosylation sites: {nglyc_sites |> len}")
for site in nglyc_sites {
  print(f"  Match: {site}")
}

# Phosphorylation sites (simplified: S/T followed by P)
let phos_sites = seq_str |> regex_find_all("[ST]P")
print(f"Potential phosphorylation sites: {phos_sites |> len}")

# N-terminal hydrophobicity check
let hydrophobic_aa = ["A", "V", "I", "L", "M", "F", "W", "P"]
let signal = substr(prot.seq, 0, 30) |> to_string |> split("")
let n_hydro = signal |> filter(|aa| { hydrophobic_aa |> contains(aa) }) |> len
let hydro_frac = n_hydro |> float / (signal |> len) |> float
print(f"N-terminal hydrophobic fraction: {hydro_frac |> round(3)} (>0.5 suggests signal peptide)")

Domain annotation from Pfam hits

# Parse Pfam domain hits
let domains = tsv("pfam_results.tsv")
  |> filter(|d| { d["e_value"] |> float < 1e-5 })
  |> sort_by(|a, b| a.start - b.start)

print("Significant domains found:")
for d in domains {
  print(f"  {d["domain_name"]} ({d["start"]}-{d["end"]}) E-value: {d["e_value"]}")
}

# Check for domain architecture
let has_kinase = domains |> any(|d| { d["domain_name"] |> contains("Kinase") })
let has_sh2 = domains |> any(|d| { d["domain_name"] |> contains("SH2") })
if has_kinase && has_sh2 {
  print("Receptor tyrosine kinase-like architecture detected")
}

Mass Spectrometry

In-silico tryptic digest

# Trypsin cleaves after K and R (except when followed by P)
let prot = protein"MKWVTFISLLFLFSSAYSRGVFRRDAHKSEVAHRFKDLGE"
let seq_str = to_string(prot)

# Split at K or R not followed by P using regex
let peptides = seq_str |> regex_split("[KR](?!P)")
  |> filter(|p| { p |> len > 0 })

# Amino acid masses for computing peptide mass
let aa_mass = {
  "A": 71.04, "R": 156.10, "N": 114.04, "D": 115.03, "C": 103.01,
  "E": 129.04, "Q": 128.06, "G": 57.02, "H": 137.06, "I": 113.08,
  "L": 113.08, "K": 128.09, "M": 131.04, "F": 147.07, "P": 97.05,
  "S": 87.03, "T": 101.05, "W": 186.08, "Y": 163.06, "V": 99.07
}

fn peptide_mass(pep) {
  let residues = pep |> split("")
  let mass = residues |> map(|aa| { aa_mass[aa] }) |> sum
  mass + 18.02  # add water
}

print("Tryptic peptides:")
for pep in peptides |> sort_by(|a, b| len(a) - len(b)) {
  let mass = peptide_mass(pep)
  let mz = mass + 1.00728  # [M+H]+
  print(f"  {pep} (m/z: {mz |> round(4)}, length: {pep |> len})")
}

Peptide mass fingerprint

# Generate theoretical peptide masses for database matching
let aa_mass = {
  "A": 71.04, "R": 156.10, "N": 114.04, "D": 115.03, "C": 103.01,
  "E": 129.04, "Q": 128.06, "G": 57.02, "H": 137.06, "I": 113.08,
  "L": 113.08, "K": 128.09, "M": 131.04, "F": 147.07, "P": 97.05,
  "S": 87.03, "T": 101.05, "W": 186.08, "Y": 163.06, "V": 99.07
}

fn peptide_mass(pep) {
  pep |> split("") |> map(|aa| { aa_mass[aa] }) |> sum |> round(4)
}

let proteins = read_fasta("proteome.fa")

let mass_db = proteins
  |> flat_map(|p| {
    let peps = to_string(p.seq) |> regex_split("[KR](?!P)")
      |> filter(|pep| { pep |> len >= 6 && pep |> len <= 30 })
    peps |> map(|pep| { {
      protein: p.id,
      peptide: pep,
      mass: peptide_mass(pep)
    }})
  })
  |> sort_by(|a, b| a.mass - b.mass)

mass_db |> write_tsv("theoretical_masses.tsv")
print(f"Generated {mass_db |> len} theoretical peptide masses")

Matching observed masses to theoretical

# Match observed MS peaks to theoretical peptide masses
let observed = tsv("observed_peaks.tsv")
  |> map(|row| { row["mz"] |> float })

let theoretical = tsv("theoretical_masses.tsv")

let tolerance_ppm = 10.0

let matches = observed |> flat_map(|obs_mz| {
  theoretical
    |> filter(|t| {
      let ppm = ((obs_mz - t["mass"] |> float) / (t["mass"] |> float) * 1e6) |> abs
      ppm < tolerance_ppm
    })
    |> map(|t| { {
      observed_mz: obs_mz,
      theoretical_mass: t["mass"],
      protein: t["protein"],
      peptide: t["peptide"],
      ppm_error: ((obs_mz - t["mass"] |> float) / (t["mass"] |> float) * 1e6)
    }})
})

print(f"Matched {matches |> len} peptides to {matches |> map(|x| x.protein) |> unique |> len} proteins")
matches |> write_tsv("peptide_matches.tsv")

Protein Comparison

Pairwise k-mer similarity

# Compare protein sequences using k-mer overlap
let proteins = read_fasta("family.fa") |> collect

for i in 0..(proteins |> len) {
  for j in (i + 1)..(proteins |> len) {
    let k1 = proteins[i].seq |> kmers(3) |> set
    let k2 = proteins[j].seq |> kmers(3) |> set
    let shared = intersection(k1, k2)
    let similarity = len(shared) |> float / min(len(k1), len(k2)) |> float
    print(f"{proteins[i].id} vs {proteins[j].id}: {(similarity * 100.0) |> round(1)}% k-mer similarity")
  }
}

Multiple sequence alignment conservation

# Analyze per-column conservation from a pre-aligned FASTA
let seqs = read_fasta("aligned_proteins.fa") |> collect
let aligned = seqs |> map(|s| { to_string(s.seq) |> split("") })
let n_seqs = aligned |> len
let n_cols = aligned |> first |> len

# Compute conservation score per column (fraction of most common residue)
let conservation = range(0, n_cols) |> map(|col| {
  let column = aligned |> map(|s| { s[col] })
  let freqs = column |> filter(|aa| { aa != "-" }) |> frequencies
  let max_count = freqs |> map(|f| { f.value }) |> max
  let non_gap = column |> filter(|aa| { aa != "-" }) |> len
  if non_gap > 0 { max_count |> float / non_gap |> float } else { 0.0 }
})

let n_conserved = conservation |> filter(|c| { c >= 0.9 }) |> len
let n_variable = conservation |> filter(|c| { c < 0.5 }) |> len

print(f"MSA columns: {n_cols}")
print(f"Highly conserved (>=90%): {n_conserved}")
print(f"Variable (<50%): {n_variable}")
print(f"Overall conservation: {conservation |> mean |> round(3)}")