Intermediate
~30 minutes
Protein Analysis
Analyze proteins using UniProt and PDB data. Fetch sequences and annotations, search for motifs, explore amino acid composition with k-mers and string ops, examine domain annotations, and visualize results.
What you will learn
- Fetching protein data from UniProt (entry, FASTA, features, GO terms)
- Creating Protein values and inspecting their properties
- Searching for sequence motifs with
find_motif() - Analyzing amino acid k-mers and composition
- Exploring domain annotations and GO terms
- Looking up PDB structures
- Comparing multiple proteins with tables and plots
Run this tutorial: Download
protein-analysis.bl
and run it with
bl run examples/tutorials/protein-analysis.bl
Step 1 — Fetching a Protein from UniProt
# requires: internet connection
# protein.bl — protein analysis
# Fetch TP53 (tumor protein p53) from UniProt
let entry = uniprot_entry("P04637")
print(f"Accession: {entry.accession}")
print(f"Name: {entry.name}")
print(f"Organism: {entry.organism}")
print(f"Length: {entry.sequence_length} aa")
print(f"Genes: {entry.gene_names}")
print(f"Function: {entry.function}")
# Get the FASTA sequence
let fasta = uniprot_fasta("P04637")
print(f"FASTA (first 80 chars): {substr(fasta, 0, 80)}...")
# Extract the raw sequence from FASTA (skip header line), wrap as Protein
let raw_seq = fasta |> split("\n") |> drop(1) |> join("")
let seq = protein(raw_seq)
print(f"Sequence type: {typeof(seq)}") # Protein
print(f"Length: {seq_len(seq)} aa")
# You can also create protein sequences directly
let my_peptide = protein("MASKFPGIDRS")
print(f"Peptide length: {seq_len(my_peptide)}")
Step 2 — Amino Acid Composition via K-mers
# Use kmers(seq, 1) to get every individual amino acid
let residues = kmers(seq, 1)
let total = len(residues)
# Count each amino acid using map/reduce on unique residues
let aa_types = unique(residues)
let counts = aa_types |> map(|aa| {
let n = residues |> count_if(|r| r == aa)
{ amino_acid: aa, count: n, pct: n / total * 100.0 }
})
# Sort by frequency (descending) and display
counts |> sort_by(|a, b| b.count - a.count) |> into sorted
print("\n=== Amino Acid Composition ===")
for item in sorted {
print(f" {item.amino_acid}: {item.count} ({round(item.pct, 1)}%)")
}
# Group into property classes manually
let charged_aa = ["D", "E", "K", "R", "H"]
let polar_aa = ["S", "T", "N", "Q", "Y", "C"]
let hydro_aa = ["A", "V", "I", "L", "M", "F", "W", "P"]
let charged = residues |> count_if(|r| any(charged_aa, |a| a == r))
let polar = residues |> count_if(|r| any(polar_aa, |a| a == r))
let hydro = residues |> count_if(|r| any(hydro_aa, |a| a == r))
print(f"\nCharged: {charged} ({round(charged / total * 100.0, 1)}%)")
print(f"Polar: {polar} ({round(polar / total * 100.0, 1)}%)")
print(f"Hydrophobic: {hydro} ({round(hydro / total * 100.0, 1)}%)")
Step 3 — Amino Acid Composition as a Table
# Build a table of amino acid counts for downstream analysis
let aa_table = sorted |> to_table()
print(aa_table)
# Visualize with a histogram
histogram(col(aa_table, "count"), {
title: "Amino Acid Frequency — TP53",
xlabel: "Count",
ylabel: "Number of amino acid types",
bins: 10,
color: "#8b5cf6",
output: "results/aa_histogram.svg",
})
# Bar-style plot of composition
plot(aa_table, {
x: "amino_acid",
y: "pct",
title: "TP53 Amino Acid Composition (%)",
xlabel: "Amino Acid",
ylabel: "Percentage",
color: "#06b6d4",
output: "results/aa_composition.svg",
})
Step 4 — Motif Searching
# find_motif(seq, pattern) returns a list of start positions (integers)
# Nuclear localization signals: runs of 4+ basic residues (K or R)
let nls_hits = find_motif(seq, "[KR]{4,}")
print(f"\n=== Nuclear Localization Signals ===")
print(f"Found {len(nls_hits)} NLS-like regions at positions: {nls_hits}")
# N-glycosylation sites: N-{P}-[ST]-{P}
let nglyc = find_motif(seq, "N[^P][ST][^P]")
print(f"\nPotential N-glycosylation sites: {len(nglyc)}")
for pos in nglyc {
# Extract the 4-residue motif at each position
let motif_seq = substr(seq, pos, 4)
print(f" Position {pos}: {motif_seq}")
}
# Phosphorylation sites: S or T followed by P (minimal CDK motif)
let phospho = find_motif(seq, "[ST]P")
print(f"\nPotential phosphorylation sites (SP/TP): {len(phospho)}")
for pos in phospho {
let motif_seq = substr(seq, pos, 2)
print(f" Position {pos}: {motif_seq}")
}
# Cysteine residues (important for disulfide bonds)
let cys_positions = find_motif(seq, "C")
print(f"\nCysteine residues: {len(cys_positions)} at positions {cys_positions}")
Step 5 — UniProt Feature Annotations
# requires: internet connection
# Fetch annotated domains and features from UniProt
let features = uniprot_features("P04637")
print(f"Total features: {len(features)}")
# Filter for domains
let domains = features |> filter(|f| f.type == "Domain")
print("\n=== Annotated Domains ===")
for d in domains {
print(f" {d.description}: {d.location}")
}
# Filter for active/binding sites
let sites = features |> filter(|f| {
f.type == "Binding site" or f.type == "Active site" or f.type == "Metal binding"
})
print("\n=== Binding / Active Sites ===")
for s in sites {
print(f" [{s.type}] {s.description}: {s.location}")
}
# Show all feature types present
let feature_types = features |> map(|f| f.type) |> unique()
print(f"\nFeature types: {feature_types}")
# Count features by type
let type_counts = feature_types |> map(|t| {
let n = features |> count_if(|f| f.type == t)
{ type: t, count: n }
}) |> sort_by(|a, b| b.count - a.count)
print("\n=== Feature Type Summary ===")
for tc in type_counts {
print(f" {tc.type}: {tc.count}")
}
Step 6 — Gene Ontology Terms
# requires: internet connection
# Fetch GO annotations for TP53
let go_terms = uniprot_go("P04637")
print(f"GO terms: {len(go_terms)}")
# Display by ontology category
for term in go_terms {
print(f" [{term.category}] {term.id}: {term.name}")
}
# Filter to just molecular function terms
let mol_func = go_terms |> filter(|t| t.category == "molecular_function")
print(f"\nMolecular functions ({len(mol_func)}):")
for mf in mol_func {
print(f" {mf.id}: {mf.name}")
}
# Biological process terms
let bio_proc = go_terms |> filter(|t| t.category == "biological_process")
print(f"\nBiological processes ({len(bio_proc)}):")
for bp in bio_proc {
print(f" {bp.id}: {bp.name}")
}
Step 7 — PDB Structure Lookup
# requires: internet connection
# Look up PDB structure for the TP53 DNA-binding domain
let pdb = pdb_entry("1TUP")
print(f"\n=== PDB Structure ===")
print(f"ID: {pdb.id}")
print(f"Title: {pdb.title}")
print(f"Resolution: {pdb.resolution} A")
print(f"Method: {pdb.method}")
# List entities (chains / molecules)
print(f"\nEntities ({len(pdb.entities)}):")
for entity in pdb.entities {
print(f" {entity}")
}
# List ligands
print(f"\nLigands ({len(pdb.ligands)}):")
for lig in pdb.ligands {
print(f" {lig}")
}
# Search PDB for more TP53 structures
let tp53_structures = pdb_search("TP53")
print(f"\nTP53 PDB hits: {len(tp53_structures)}")
for id in tp53_structures |> take(10) {
print(f" {id}")
}
Step 8 — Comparing Multiple Proteins
# requires: internet connection
# Fetch a family of related proteins
let accessions = ["P04637", "Q00987", "O15350"] # TP53, MDM2, TP73
let proteins = accessions |> map(|acc| uniprot_entry(acc))
# Fetch sequences
let sequences = accessions |> map(|acc| {
let fasta = uniprot_fasta(acc)
protein(fasta |> split("\n") |> drop(1) |> join(""))
})
# Build a comparison table
let comparison = proteins
|> enumerate()
|> map(|i, p| {
let seq = sequences[i]
let residues = kmers(seq, 1)
let total = len(residues)
let charged = residues |> count_if(|r| {
any(["D", "E", "K", "R", "H"], |a| a == r)
})
{
gene: p.gene_names,
accession: p.accession,
length: seq_len(seq),
charged_pct: round(charged / total * 100.0, 1),
organism: p.organism,
}
})
|> to_table()
print("\n=== Protein Family Comparison ===")
print(comparison)
# Compare motif counts across the family
let motif_comparison = proteins
|> enumerate()
|> map(|i, p| {
let seq = sequences[i]
{
gene: p.gene_names,
nls: len(find_motif(seq, "[KR]{4,}")),
phospho: len(find_motif(seq, "[ST]P")),
nglyc: len(find_motif(seq, "N[^P][ST][^P]")),
cys: len(find_motif(seq, "C")),
}
})
|> to_table()
print("\n=== Motif Comparison ===")
print(motif_comparison)
Step 9 — K-mer Analysis
# Analyze dipeptide (2-mer) frequencies in TP53
let dipeptides = kmers(seq, 2)
let dp_types = unique(dipeptides)
let dp_counts = dp_types |> map(|dp| {
let n = dipeptides |> count_if(|d| d == dp)
{ dipeptide: dp, count: n }
}) |> sort_by(|a, b| b.count - a.count)
print("\n=== Top 20 Dipeptides ===")
for item in dp_counts |> take(20) {
print(f" {item.dipeptide}: {item.count}")
}
# Tripeptide (3-mer) analysis
let tripeptides = kmers(seq, 3)
let tp_types = unique(tripeptides)
let tp_counts = tp_types |> map(|tp| {
let n = tripeptides |> count_if(|t| t == tp)
{ tripeptide: tp, count: n }
}) |> sort_by(|a, b| b.count - a.count)
print(f"\nUnique tripeptides: {len(tp_types)}")
print("Top 10 tripeptides:")
for item in tp_counts |> take(10) {
print(f" {item.tripeptide}: {item.count}")
}
# Visualize dipeptide frequency distribution
let dp_table = dp_counts |> to_table()
histogram(col(dp_table, "count"), {
title: "Dipeptide Frequency Distribution — TP53",
xlabel: "Occurrences",
ylabel: "Number of dipeptides",
bins: 15,
color: "#8b5cf6",
output: "results/dipeptide_dist.svg",
})
Step 10 — Complete Protein Report
# requires: internet connection
# protein_report.bl — full protein analysis pipeline
fn analyze_protein(accession) {
print(f"\n{'=' * 60}")
print(f"Analyzing {accession}")
print(f"{'=' * 60}")
# Fetch all data from UniProt
let entry = uniprot_entry(accession)
let fasta = uniprot_fasta(accession)
let features = uniprot_features(accession)
let go_terms = uniprot_go(accession)
# Build the protein sequence
let seq = protein(fasta |> split("\n") |> drop(1) |> join(""))
print(f"Gene: {entry.gene_names}")
print(f"Organism: {entry.organism}")
print(f"Length: {seq_len(seq)} aa")
# Amino acid composition
let residues = kmers(seq, 1)
let total = len(residues)
let aa_types = unique(residues)
let composition = aa_types |> map(|aa| {
let n = residues |> count_if(|r| r == aa)
{ amino_acid: aa, count: n, pct: round(n / total * 100.0, 1) }
}) |> sort_by(|a, b| b.count - a.count)
print(f"\nTop 5 residues:")
for item in composition |> take(5) {
print(f" {item.amino_acid}: {item.count} ({item.pct}%)")
}
# Motif analysis
let nls = find_motif(seq, "[KR]{4,}")
let phospho = find_motif(seq, "[ST]P")
let nglyc = find_motif(seq, "N[^P][ST][^P]")
print(f"\nMotifs:")
print(f" NLS-like regions: {len(nls)}")
print(f" Phosphorylation sites: {len(phospho)}")
print(f" N-glycosylation sites: {len(nglyc)}")
# Domain annotations
let domains = features |> filter(|f| f.type == "Domain")
print(f"\nDomains ({len(domains)}):")
for d in domains {
print(f" {d.description}: {d.location}")
}
# GO summary
let mf = go_terms |> filter(|t| t.category == "molecular_function")
let bp = go_terms |> filter(|t| t.category == "biological_process")
print(f"\nGO: {len(mf)} molecular functions, {len(bp)} biological processes")
# Save structured report
let report = {
accession: entry.accession,
gene: entry.gene_names,
organism: entry.organism,
length: seq_len(seq),
composition: composition,
motifs: {
nls: len(nls),
phospho: len(phospho),
nglyc: len(nglyc),
},
domains: domains |> map(|d| d.description),
go_terms: len(go_terms),
}
let filename = f"results/{accession}_report.json"
write_json(filename, report)
print(f"\nReport saved to {filename}")
report
}
# Analyze TP53 and a related protein
let tp53_report = analyze_protein("P04637")
let mdm2_report = analyze_protein("Q00987")
Next Steps
Learn how to handle large datasets efficiently in the Streaming Large Files tutorial.