Structural Biology
BioLang supports PDB and mmCIF file parsing for structural biology analysis. These examples cover common tasks from basic structure statistics to contact maps, Ramachandran analysis, and B-factor exploration.
PDB Parsing
Basic structure information
# Parse a local PDB file by reading ATOM/HETATM lines
let lines = read_lines("1ubq.pdb")
# Extract header info
let title = lines |> filter(|l| { l |> starts_with("TITLE") })
|> map(|l| { l |> substr(10, 70) |> trim }) |> join(" ")
print(f"Title: {title}")
# Parse ATOM records
let atoms = lines
|> filter(|l| { l |> starts_with("ATOM") })
|> map(|l| { {
name: l |> substr(12, 16) |> trim,
resname: l |> substr(17, 20) |> trim,
chain: l |> substr(21, 22) |> trim,
resid: l |> substr(22, 26) |> trim |> int,
x: l |> substr(30, 38) |> trim |> float,
y: l |> substr(38, 46) |> trim |> float,
z: l |> substr(46, 54) |> trim |> float,
bfactor: l |> substr(60, 66) |> trim |> float
}})
let chains = atoms |> map(|a| { a.chain }) |> unique
print(f"Chains: {chains |> join(", ")}")
print(f"Total atoms: {atoms |> len}")
for ch in chains {
let ch_atoms = atoms |> filter(|a| { a.chain == ch })
let n_residues = ch_atoms |> map(|a| { a.resid }) |> unique |> len
print(f" Chain {ch}: {n_residues} residues, {ch_atoms |> len} atoms")
}
Extracting C-alpha coordinates
# Extract C-alpha atoms from a PDB file
let lines = read_lines("protein.pdb")
let ca_atoms = lines
|> filter(|l| { l |> starts_with("ATOM") })
|> map(|l| { {
name: l |> substr(12, 16) |> trim,
resname: l |> substr(17, 20) |> trim,
chain: l |> substr(21, 22) |> trim,
resid: l |> substr(22, 26) |> trim |> int,
x: l |> substr(30, 38) |> trim |> float,
y: l |> substr(38, 46) |> trim |> float,
z: l |> substr(46, 54) |> trim |> float,
bfactor: l |> substr(60, 66) |> trim |> float
}})
|> filter(|a| { a.name == "CA" && a.chain == "A" })
ca_atoms |> write_tsv("ca_coordinates.tsv")
print(f"Extracted {ca_atoms |> len} C-alpha atoms")
Fetching structures from RCSB
# requires: internet connection (PDB API)
# Fetch structure metadata from RCSB PDB
let entry = pdb_entry("6LU7") # SARS-CoV-2 main protease
print(f"Title: {entry.title}")
print(f"Method: {entry.method}")
print(f"Resolution: {entry.resolution}")
# Get entity information (chains/subunits)
let entities = pdb_chains("6LU7")
for ent in entities {
print(f" Entity {ent.entity_id} ({ent.entity_type}): {ent.description}")
}
# Get protein sequence for entity 1
let seq = pdb_sequence("6LU7", 1)
print(f"Entity 1 sequence ({seq |> seq_len} aa): {substr(seq, 0, 50)}...")
Contact Maps
Residue contact map
# Build a residue contact map from C-alpha distances
fn parse_atoms(path) {
read_lines(path)
|> filter(|l| { l |> starts_with("ATOM") })
|> map(|l| { {
name: l |> substr(12, 16) |> trim,
resname: l |> substr(17, 20) |> trim,
chain: l |> substr(21, 22) |> trim,
resid: l |> substr(22, 26) |> trim |> int,
x: l |> substr(30, 38) |> trim |> float,
y: l |> substr(38, 46) |> trim |> float,
z: l |> substr(46, 54) |> trim |> float,
bfactor: l |> substr(60, 66) |> trim |> float
}})
}
fn atom_dist(a, b) {
sqrt(pow(a.x - b.x, 2) + pow(a.y - b.y, 2) + pow(a.z - b.z, 2))
}
let all_atoms = parse_atoms("protein.pdb")
let ca_atoms = all_atoms |> filter(|a| { a.name == "CA" && a.chain == "A" }) |> collect
let n = ca_atoms |> len
let cutoff = 8.0 # Angstroms
# Build contact list
let contacts = []
for i in 0..n {
for j in (i + 1)..n {
let dist = atom_dist(ca_atoms[i], ca_atoms[j])
if dist <= cutoff {
contacts = contacts |> push({
res_i: ca_atoms[i].resid,
res_j: ca_atoms[j].resid,
distance: dist |> round(2)
})
}
}
}
print(f"Contacts (< {cutoff} A): {contacts |> len}")
contacts |> write_tsv("contact_map.tsv")
# Sequence separation statistics
let seq_sep = contacts |> map(|c| { abs(c.res_j - c.res_i) })
print(f"Mean sequence separation: {seq_sep |> mean |> round(1)} residues")
print(f"Long-range contacts (>12 res): {seq_sep |> filter(|s| { s > 12 }) |> len}")
Inter-chain contacts
# Find contacts between two chains (protein-protein interface)
let all_atoms = parse_atoms("complex.pdb")
let chain_a = all_atoms |> filter(|a| { a.chain == "A" }) |> collect
let chain_b = all_atoms |> filter(|a| { a.chain == "B" }) |> collect
let interface_cutoff = 4.5 # Angstroms
let interface = chain_a |> flat_map(|atom_a| {
chain_b
|> filter(|atom_b| { atom_dist(atom_a, atom_b) <= interface_cutoff })
|> map(|atom_b| { {
chain_a_res: f"{atom_a.resname}{atom_a.resid}",
chain_a_atom: atom_a.name,
chain_b_res: f"{atom_b.resname}{atom_b.resid}",
chain_b_atom: atom_b.name,
distance: atom_dist(atom_a, atom_b) |> round(2)
}})
})
let interface_residues_a = interface |> map(|x| { x.chain_a_res }) |> unique
let interface_residues_b = interface |> map(|x| { x.chain_b_res }) |> unique
print(f"Interface contacts: {interface |> len}")
print(f"Interface residues chain A: {interface_residues_a |> len}")
print(f"Interface residues chain B: {interface_residues_b |> len}")
interface |> write_tsv("interface_contacts.tsv")
Ramachandran Analysis
Phi/Psi angle calculation
# Calculate backbone dihedral angles from ATOM coordinates
# Requires N, CA, C atoms for each residue
let all_atoms = parse_atoms("protein.pdb")
let backbone = all_atoms |> filter(|a| { a.chain == "A" && (a.name == "N" || a.name == "CA" || a.name == "C") })
# Group backbone atoms by residue
let residue_ids = backbone |> map(|a| { a.resid }) |> unique |> sort
fn get_atom(atoms, resid, name) {
atoms |> find(|a| { a.resid == resid && a.name == name })
}
# Dihedral angle from four points
fn dihedral(p1, p2, p3, p4) {
let b1x = p2.x - p1.x; let b1y = p2.y - p1.y; let b1z = p2.z - p1.z
let b2x = p3.x - p2.x; let b2y = p3.y - p2.y; let b2z = p3.z - p2.z
let b3x = p4.x - p3.x; let b3y = p4.y - p3.y; let b3z = p4.z - p3.z
# Cross products n1 = b1 x b2, n2 = b2 x b3
let n1x = b1y * b2z - b1z * b2y
let n1y = b1z * b2x - b1x * b2z
let n1z = b1x * b2y - b1y * b2x
let n2x = b2y * b3z - b2z * b3y
let n2y = b2z * b3x - b2x * b3z
let n2z = b2x * b3y - b2y * b3x
let dot_nn = n1x * n2x + n1y * n2y + n1z * n2z
let cross_x = n1y * n2z - n1z * n2y
let b2_len = sqrt(b2x * b2x + b2y * b2y + b2z * b2z)
atan(b2_len * (b1x * n2x + b1y * n2y + b1z * n2z), dot_nn) * 180.0 / 3.14159265
}
# Compute phi/psi for each residue (skip first and last)
let dihedrals = range(1, (residue_ids |> len) - 1) |> map(|idx| {
let prev = residue_ids[idx - 1]
let curr = residue_ids[idx]
let next = residue_ids[idx + 1]
let phi = dihedral(
get_atom(backbone, prev, "C"), get_atom(backbone, curr, "N"),
get_atom(backbone, curr, "CA"), get_atom(backbone, curr, "C")
)
let psi = dihedral(
get_atom(backbone, curr, "N"), get_atom(backbone, curr, "CA"),
get_atom(backbone, curr, "C"), get_atom(backbone, next, "N")
)
let res = get_atom(backbone, curr, "CA")
{ resid: curr, resname: res.resname, phi: phi |> round(1), psi: psi |> round(1) }
})
dihedrals |> write_tsv("ramachandran.tsv")
print(f"Computed dihedral angles for {dihedrals |> len} residues")
# Classify: most-favored regions (simplified)
fn is_favored(phi, psi) {
# Alpha helix region
(phi > -180.0 && phi < -20.0 && psi > -80.0 && psi < 0.0) ||
# Beta sheet region
(phi > -180.0 && phi < -40.0 && psi > 50.0 && psi < 180.0)
}
let favored = dihedrals |> filter(|d| { is_favored(d.phi, d.psi) }) |> len
let total = dihedrals |> len
print(f"Favored region: {favored} ({(favored |> float / total |> float * 100.0) |> round(1)}%)")
print(f"Other: {total - favored} ({((total - favored) |> float / total |> float * 100.0) |> round(1)}%)")
B-factor Analysis
B-factor per residue
# Analyze B-factors as a proxy for flexibility
let all_atoms = parse_atoms("protein.pdb")
let chain_a = all_atoms |> filter(|a| { a.chain == "A" })
let residue_ids = chain_a |> map(|a| { a.resid }) |> unique |> sort
let bfactors = residue_ids |> map(|resid| {
let res_atoms = chain_a |> filter(|a| { a.resid == resid })
let bfs = res_atoms |> map(|a| { a.bfactor })
let ca = res_atoms |> find(|a| { a.name == "CA" })
{
resid: resid,
resname: res_atoms |> first |> map(|a| { a.resname }),
mean_bfactor: bfs |> mean,
ca_bfactor: if ca != nil { ca.bfactor } else { 0.0 },
n_atoms: bfs |> len
}
})
let all_means = bfactors |> map(|x| { x.mean_bfactor })
let overall_mean = all_means |> mean
let overall_std = all_means |> stdev
print("B-factor Statistics:")
print(f" Mean: {overall_mean |> round(2)} A^2")
print(f" Std: {overall_std |> round(2)} A^2")
print(f" Range: {all_means |> min |> round(2)} - {all_means |> max |> round(2)}")
# High B-factor residues (flexible regions)
let threshold = overall_mean + 2.0 * overall_std
let flexible = bfactors |> filter(|r| { r.mean_bfactor > threshold })
print(f"\nHighly flexible residues (>{threshold |> round(1)} A^2):")
for r in flexible {
print(f" {r.resname}{r.resid}: {r.mean_bfactor |> round(2)} A^2")
}
bfactors |> write_tsv("bfactor_per_residue.tsv")
Secondary structure from DSSP records
# Parse HELIX and SHEET records from PDB header
let lines = read_lines("protein.pdb")
let helices = lines |> filter(|l| { l |> starts_with("HELIX") })
|> map(|l| { {
type: "H",
chain: l |> substr(19, 20) |> trim,
start: l |> substr(21, 25) |> trim |> int,
end: l |> substr(33, 37) |> trim |> int
}})
let sheets = lines |> filter(|l| { l |> starts_with("SHEET") })
|> map(|l| { {
type: "E",
chain: l |> substr(21, 22) |> trim,
start: l |> substr(22, 26) |> trim |> int,
end: l |> substr(33, 37) |> trim |> int
}})
let helix_res = helices |> map(|h| { h.end - h.start + 1 }) |> sum
let sheet_res = sheets |> map(|s| { s.end - s.start + 1 }) |> sum
let total_res = residue_ids |> len
print("Secondary Structure (from PDB header):")
print(f" Helices: {helices |> len} ({helix_res} residues, {(helix_res |> float / total_res |> float * 100.0) |> round(1)}%)")
print(f" Sheets: {sheets |> len} ({sheet_res} residues, {(sheet_res |> float / total_res |> float * 100.0) |> round(1)}%)")
print(f" Coil: ~{total_res - helix_res - sheet_res} residues")