Structural Biology

BioLang supports PDB and mmCIF file parsing for structural biology analysis. These examples cover common tasks from basic structure statistics to contact maps, Ramachandran analysis, and B-factor exploration.

PDB Parsing

Basic structure information

# Parse a local PDB file by reading ATOM/HETATM lines
let lines = read_lines("1ubq.pdb")

# Extract header info
let title = lines |> filter(|l| { l |> starts_with("TITLE") })
  |> map(|l| { l |> substr(10, 70) |> trim }) |> join(" ")
print(f"Title: {title}")

# Parse ATOM records
let atoms = lines
  |> filter(|l| { l |> starts_with("ATOM") })
  |> map(|l| { {
    name: l |> substr(12, 16) |> trim,
    resname: l |> substr(17, 20) |> trim,
    chain: l |> substr(21, 22) |> trim,
    resid: l |> substr(22, 26) |> trim |> int,
    x: l |> substr(30, 38) |> trim |> float,
    y: l |> substr(38, 46) |> trim |> float,
    z: l |> substr(46, 54) |> trim |> float,
    bfactor: l |> substr(60, 66) |> trim |> float
  }})

let chains = atoms |> map(|a| { a.chain }) |> unique
print(f"Chains: {chains |> join(", ")}")
print(f"Total atoms: {atoms |> len}")

for ch in chains {
  let ch_atoms = atoms |> filter(|a| { a.chain == ch })
  let n_residues = ch_atoms |> map(|a| { a.resid }) |> unique |> len
  print(f"  Chain {ch}: {n_residues} residues, {ch_atoms |> len} atoms")
}

Extracting C-alpha coordinates

# Extract C-alpha atoms from a PDB file
let lines = read_lines("protein.pdb")
let ca_atoms = lines
  |> filter(|l| { l |> starts_with("ATOM") })
  |> map(|l| { {
    name: l |> substr(12, 16) |> trim,
    resname: l |> substr(17, 20) |> trim,
    chain: l |> substr(21, 22) |> trim,
    resid: l |> substr(22, 26) |> trim |> int,
    x: l |> substr(30, 38) |> trim |> float,
    y: l |> substr(38, 46) |> trim |> float,
    z: l |> substr(46, 54) |> trim |> float,
    bfactor: l |> substr(60, 66) |> trim |> float
  }})
  |> filter(|a| { a.name == "CA" && a.chain == "A" })

ca_atoms |> write_tsv("ca_coordinates.tsv")
print(f"Extracted {ca_atoms |> len} C-alpha atoms")

Fetching structures from RCSB

# requires: internet connection (PDB API)
# Fetch structure metadata from RCSB PDB
let entry = pdb_entry("6LU7")  # SARS-CoV-2 main protease
print(f"Title: {entry.title}")
print(f"Method: {entry.method}")
print(f"Resolution: {entry.resolution}")

# Get entity information (chains/subunits)
let entities = pdb_chains("6LU7")
for ent in entities {
  print(f"  Entity {ent.entity_id} ({ent.entity_type}): {ent.description}")
}

# Get protein sequence for entity 1
let seq = pdb_sequence("6LU7", 1)
print(f"Entity 1 sequence ({seq |> seq_len} aa): {substr(seq, 0, 50)}...")

Contact Maps

Residue contact map

# Build a residue contact map from C-alpha distances
fn parse_atoms(path) {
  read_lines(path)
    |> filter(|l| { l |> starts_with("ATOM") })
    |> map(|l| { {
      name: l |> substr(12, 16) |> trim,
      resname: l |> substr(17, 20) |> trim,
      chain: l |> substr(21, 22) |> trim,
      resid: l |> substr(22, 26) |> trim |> int,
      x: l |> substr(30, 38) |> trim |> float,
      y: l |> substr(38, 46) |> trim |> float,
      z: l |> substr(46, 54) |> trim |> float,
      bfactor: l |> substr(60, 66) |> trim |> float
    }})
}

fn atom_dist(a, b) {
  sqrt(pow(a.x - b.x, 2) + pow(a.y - b.y, 2) + pow(a.z - b.z, 2))
}

let all_atoms = parse_atoms("protein.pdb")
let ca_atoms = all_atoms |> filter(|a| { a.name == "CA" && a.chain == "A" }) |> collect
let n = ca_atoms |> len
let cutoff = 8.0  # Angstroms

# Build contact list
let contacts = []
for i in 0..n {
  for j in (i + 1)..n {
    let dist = atom_dist(ca_atoms[i], ca_atoms[j])
    if dist <= cutoff {
      contacts = contacts |> push({
        res_i: ca_atoms[i].resid,
        res_j: ca_atoms[j].resid,
        distance: dist |> round(2)
      })
    }
  }
}

print(f"Contacts (< {cutoff} A): {contacts |> len}")
contacts |> write_tsv("contact_map.tsv")

# Sequence separation statistics
let seq_sep = contacts |> map(|c| { abs(c.res_j - c.res_i) })
print(f"Mean sequence separation: {seq_sep |> mean |> round(1)} residues")
print(f"Long-range contacts (>12 res): {seq_sep |> filter(|s| { s > 12 }) |> len}")

Inter-chain contacts

# Find contacts between two chains (protein-protein interface)
let all_atoms = parse_atoms("complex.pdb")
let chain_a = all_atoms |> filter(|a| { a.chain == "A" }) |> collect
let chain_b = all_atoms |> filter(|a| { a.chain == "B" }) |> collect

let interface_cutoff = 4.5  # Angstroms

let interface = chain_a |> flat_map(|atom_a| {
  chain_b
    |> filter(|atom_b| { atom_dist(atom_a, atom_b) <= interface_cutoff })
    |> map(|atom_b| { {
      chain_a_res: f"{atom_a.resname}{atom_a.resid}",
      chain_a_atom: atom_a.name,
      chain_b_res: f"{atom_b.resname}{atom_b.resid}",
      chain_b_atom: atom_b.name,
      distance: atom_dist(atom_a, atom_b) |> round(2)
    }})
})

let interface_residues_a = interface |> map(|x| { x.chain_a_res }) |> unique
let interface_residues_b = interface |> map(|x| { x.chain_b_res }) |> unique

print(f"Interface contacts: {interface |> len}")
print(f"Interface residues chain A: {interface_residues_a |> len}")
print(f"Interface residues chain B: {interface_residues_b |> len}")
interface |> write_tsv("interface_contacts.tsv")

Ramachandran Analysis

Phi/Psi angle calculation

# Calculate backbone dihedral angles from ATOM coordinates
# Requires N, CA, C atoms for each residue
let all_atoms = parse_atoms("protein.pdb")
let backbone = all_atoms |> filter(|a| { a.chain == "A" && (a.name == "N" || a.name == "CA" || a.name == "C") })

# Group backbone atoms by residue
let residue_ids = backbone |> map(|a| { a.resid }) |> unique |> sort

fn get_atom(atoms, resid, name) {
  atoms |> find(|a| { a.resid == resid && a.name == name })
}

# Dihedral angle from four points
fn dihedral(p1, p2, p3, p4) {
  let b1x = p2.x - p1.x; let b1y = p2.y - p1.y; let b1z = p2.z - p1.z
  let b2x = p3.x - p2.x; let b2y = p3.y - p2.y; let b2z = p3.z - p2.z
  let b3x = p4.x - p3.x; let b3y = p4.y - p3.y; let b3z = p4.z - p3.z
  # Cross products n1 = b1 x b2, n2 = b2 x b3
  let n1x = b1y * b2z - b1z * b2y
  let n1y = b1z * b2x - b1x * b2z
  let n1z = b1x * b2y - b1y * b2x
  let n2x = b2y * b3z - b2z * b3y
  let n2y = b2z * b3x - b2x * b3z
  let n2z = b2x * b3y - b2y * b3x
  let dot_nn = n1x * n2x + n1y * n2y + n1z * n2z
  let cross_x = n1y * n2z - n1z * n2y
  let b2_len = sqrt(b2x * b2x + b2y * b2y + b2z * b2z)
  atan(b2_len * (b1x * n2x + b1y * n2y + b1z * n2z), dot_nn) * 180.0 / 3.14159265
}

# Compute phi/psi for each residue (skip first and last)
let dihedrals = range(1, (residue_ids |> len) - 1) |> map(|idx| {
  let prev = residue_ids[idx - 1]
  let curr = residue_ids[idx]
  let next = residue_ids[idx + 1]
  let phi = dihedral(
    get_atom(backbone, prev, "C"), get_atom(backbone, curr, "N"),
    get_atom(backbone, curr, "CA"), get_atom(backbone, curr, "C")
  )
  let psi = dihedral(
    get_atom(backbone, curr, "N"), get_atom(backbone, curr, "CA"),
    get_atom(backbone, curr, "C"), get_atom(backbone, next, "N")
  )
  let res = get_atom(backbone, curr, "CA")
  { resid: curr, resname: res.resname, phi: phi |> round(1), psi: psi |> round(1) }
})

dihedrals |> write_tsv("ramachandran.tsv")
print(f"Computed dihedral angles for {dihedrals |> len} residues")

# Classify: most-favored regions (simplified)
fn is_favored(phi, psi) {
  # Alpha helix region
  (phi > -180.0 && phi < -20.0 && psi > -80.0 && psi < 0.0) ||
  # Beta sheet region
  (phi > -180.0 && phi < -40.0 && psi > 50.0 && psi < 180.0)
}

let favored = dihedrals |> filter(|d| { is_favored(d.phi, d.psi) }) |> len
let total = dihedrals |> len
print(f"Favored region: {favored} ({(favored |> float / total |> float * 100.0) |> round(1)}%)")
print(f"Other:          {total - favored} ({((total - favored) |> float / total |> float * 100.0) |> round(1)}%)")

B-factor Analysis

B-factor per residue

# Analyze B-factors as a proxy for flexibility
let all_atoms = parse_atoms("protein.pdb")
let chain_a = all_atoms |> filter(|a| { a.chain == "A" })
let residue_ids = chain_a |> map(|a| { a.resid }) |> unique |> sort

let bfactors = residue_ids |> map(|resid| {
  let res_atoms = chain_a |> filter(|a| { a.resid == resid })
  let bfs = res_atoms |> map(|a| { a.bfactor })
  let ca = res_atoms |> find(|a| { a.name == "CA" })
  {
    resid: resid,
    resname: res_atoms |> first |> map(|a| { a.resname }),
    mean_bfactor: bfs |> mean,
    ca_bfactor: if ca != nil { ca.bfactor } else { 0.0 },
    n_atoms: bfs |> len
  }
})

let all_means = bfactors |> map(|x| { x.mean_bfactor })
let overall_mean = all_means |> mean
let overall_std = all_means |> stdev

print("B-factor Statistics:")
print(f"  Mean: {overall_mean |> round(2)} A^2")
print(f"  Std:  {overall_std |> round(2)} A^2")
print(f"  Range: {all_means |> min |> round(2)} - {all_means |> max |> round(2)}")

# High B-factor residues (flexible regions)
let threshold = overall_mean + 2.0 * overall_std
let flexible = bfactors |> filter(|r| { r.mean_bfactor > threshold })
print(f"\nHighly flexible residues (>{threshold |> round(1)} A^2):")
for r in flexible {
  print(f"  {r.resname}{r.resid}: {r.mean_bfactor |> round(2)} A^2")
}

bfactors |> write_tsv("bfactor_per_residue.tsv")

Secondary structure from DSSP records

# Parse HELIX and SHEET records from PDB header
let lines = read_lines("protein.pdb")

let helices = lines |> filter(|l| { l |> starts_with("HELIX") })
  |> map(|l| { {
    type: "H",
    chain: l |> substr(19, 20) |> trim,
    start: l |> substr(21, 25) |> trim |> int,
    end: l |> substr(33, 37) |> trim |> int
  }})

let sheets = lines |> filter(|l| { l |> starts_with("SHEET") })
  |> map(|l| { {
    type: "E",
    chain: l |> substr(21, 22) |> trim,
    start: l |> substr(22, 26) |> trim |> int,
    end: l |> substr(33, 37) |> trim |> int
  }})

let helix_res = helices |> map(|h| { h.end - h.start + 1 }) |> sum
let sheet_res = sheets |> map(|s| { s.end - s.start + 1 }) |> sum
let total_res = residue_ids |> len

print("Secondary Structure (from PDB header):")
print(f"  Helices: {helices |> len} ({helix_res} residues, {(helix_res |> float / total_res |> float * 100.0) |> round(1)}%)")
print(f"  Sheets:  {sheets |> len} ({sheet_res} residues, {(sheet_res |> float / total_res |> float * 100.0) |> round(1)}%)")
print(f"  Coil:    ~{total_res - helix_res - sheet_res} residues")