Level: Intermediate · Time: ~30 minutes

Protein Analysis

Analyze proteins using UniProt and PDB data. Fetch sequences and annotations, search for motifs, explore amino acid composition with k-mers and string ops, examine domain annotations, and visualize results.

What you will learn

  • Fetching protein data from UniProt (entry, FASTA, features, GO terms)
  • Creating Protein values and inspecting their properties
  • Searching for sequence motifs with find_motif()
  • Analyzing amino acid k-mers and composition
  • Exploring domain annotations and GO terms
  • Looking up PDB structures
  • Comparing multiple proteins with tables and plots
Run this tutorial: Download protein-analysis.bl and run it with bl run examples/tutorials/protein-analysis.bl

Step 1 — Fetching a Protein from UniProt

# requires: internet connection
# protein.bl — protein analysis
#
# Step 1: fetch a UniProt entry and its FASTA sequence, then wrap the raw
# amino-acid string as a Protein value for downstream analysis.

# Fetch TP53 (tumor protein p53) from UniProt
let entry = uniprot_entry("P04637")

print(f"Accession:   {entry.accession}")
print(f"Name:        {entry.name}")
print(f"Organism:    {entry.organism}")
print(f"Length:      {entry.sequence_length} aa")
print(f"Genes:       {entry.gene_names}")
print(f"Function:    {entry.function}")

# Get the FASTA sequence
let fasta = uniprot_fasta("P04637")
print(f"FASTA (first 80 chars): {substr(fasta, 0, 80)}...")

# Extract the raw sequence from FASTA (skip header line), wrap as Protein
# drop(1) removes the ">..." FASTA header line; join("") re-joins the
# line-wrapped sequence into one continuous string.
let raw_seq = fasta |> split("\n") |> drop(1) |> join("")
let seq = protein(raw_seq)
print(f"Sequence type: {typeof(seq)}")    # Protein
print(f"Length:        {seq_len(seq)} aa")

# You can also create protein sequences directly
let my_peptide = protein("MASKFPGIDRS")
print(f"Peptide length: {seq_len(my_peptide)}")

Step 2 — Amino Acid Composition via K-mers

# Use kmers(seq, 1) to get every individual amino acid
# (`seq` is the TP53 Protein value built in Step 1)
let residues = kmers(seq, 1)
let total = len(residues)

# Count each amino acid using map/reduce on unique residues
let aa_types = unique(residues)
let counts = aa_types |> map(|aa| {
  let n = residues |> count_if(|r| r == aa)
  # NOTE(review): assumes `/` performs float division here — confirm for this DSL
  { amino_acid: aa, count: n, pct: n / total * 100.0 }
})

# Sort by frequency (descending) and display
# `|> into sorted` binds the piped result to a new variable `sorted`,
# which Step 3 reuses to build a table
counts |> sort_by(|a, b| b.count - a.count) |> into sorted

print("\n=== Amino Acid Composition ===")
for item in sorted {
  print(f"  {item.amino_acid}: {item.count} ({round(item.pct, 1)}%)")
}

# Group into property classes manually
# (simple three-way classification; H is counted as charged here)
let charged_aa = ["D", "E", "K", "R", "H"]
let polar_aa   = ["S", "T", "N", "Q", "Y", "C"]
let hydro_aa   = ["A", "V", "I", "L", "M", "F", "W", "P"]

# Membership test: any(list, predicate) is true if any element matches
let charged = residues |> count_if(|r| any(charged_aa, |a| a == r))
let polar   = residues |> count_if(|r| any(polar_aa, |a| a == r))
let hydro   = residues |> count_if(|r| any(hydro_aa, |a| a == r))

print(f"\nCharged:     {charged} ({round(charged / total * 100.0, 1)}%)")
print(f"Polar:       {polar} ({round(polar / total * 100.0, 1)}%)")
print(f"Hydrophobic: {hydro} ({round(hydro / total * 100.0, 1)}%)")

Step 3 — Amino Acid Composition as a Table

# Build a table of amino acid counts for downstream analysis
# (`sorted` is the composition list produced in Step 2)
let aa_table = sorted |> to_table()
print(aa_table)

# Visualize with a histogram
# Distribution of the "count" column: how many amino acid types occur N times
histogram(col(aa_table, "count"), {
  title:  "Amino Acid Frequency — TP53",
  xlabel: "Count",
  ylabel: "Number of amino acid types",
  bins:   10,
  color:  "#8b5cf6",
  output: "results/aa_histogram.svg",
})

# Bar-style plot of composition
# One bar per amino acid, height = percentage of the sequence
plot(aa_table, {
  x:      "amino_acid",
  y:      "pct",
  title:  "TP53 Amino Acid Composition (%)",
  xlabel: "Amino Acid",
  ylabel: "Percentage",
  color:  "#06b6d4",
  output: "results/aa_composition.svg",
})

Step 4 — Motif Searching

# find_motif(seq, pattern) returns a list of start positions (integers)
# NOTE(review): whether positions are 0- or 1-based is not shown here —
# confirm before reporting residue coordinates against UniProt numbering.

# Nuclear localization signals: runs of 4+ basic residues (K or R)
let nls_hits = find_motif(seq, "[KR]{4,}")
print(f"\n=== Nuclear Localization Signals ===")
print(f"Found {len(nls_hits)} NLS-like regions at positions: {nls_hits}")

# N-glycosylation sites: N-{P}-[ST]-{P}
let nglyc = find_motif(seq, "N[^P][ST][^P]")
print(f"\nPotential N-glycosylation sites: {len(nglyc)}")
for pos in nglyc {
  # Extract the 4-residue motif at each position
  # (substr is applied to the Protein value `seq` here, not a plain string)
  let motif_seq = substr(seq, pos, 4)
  print(f"  Position {pos}: {motif_seq}")
}

# Phosphorylation sites: S or T followed by P (minimal CDK motif)
let phospho = find_motif(seq, "[ST]P")
print(f"\nPotential phosphorylation sites (SP/TP): {len(phospho)}")
for pos in phospho {
  let motif_seq = substr(seq, pos, 2)
  print(f"  Position {pos}: {motif_seq}")
}

# Cysteine residues (important for disulfide bonds)
# A single-letter pattern matches every occurrence of that residue
let cys_positions = find_motif(seq, "C")
print(f"\nCysteine residues: {len(cys_positions)} at positions {cys_positions}")

Step 5 — UniProt Feature Annotations

# requires: internet connection
# Fetch annotated domains and features from UniProt
# Each feature record carries at least: type, description, location
let features = uniprot_features("P04637")
print(f"Total features: {len(features)}")

# Filter for domains
let domains = features |> filter(|f| f.type == "Domain")
print("\n=== Annotated Domains ===")
for d in domains {
  print(f"  {d.description}: {d.location}")
}

# Filter for active/binding sites
let sites = features |> filter(|f| {
  f.type == "Binding site" or f.type == "Active site" or f.type == "Metal binding"
})
print("\n=== Binding / Active Sites ===")
for s in sites {
  print(f"  [{s.type}] {s.description}: {s.location}")
}

# Show all feature types present
let feature_types = features |> map(|f| f.type) |> unique()
print(f"\nFeature types: {feature_types}")

# Count features by type, most common first
let type_counts = feature_types |> map(|t| {
  let n = features |> count_if(|f| f.type == t)
  { type: t, count: n }
}) |> sort_by(|a, b| b.count - a.count)

print("\n=== Feature Type Summary ===")
for tc in type_counts {
  print(f"  {tc.type}: {tc.count}")
}

Step 6 — Gene Ontology Terms

# requires: internet connection
# Fetch GO annotations for TP53
# Each term carries: category (GO aspect), id (GO:nnnnnnn), name
let go_terms = uniprot_go("P04637")
print(f"GO terms: {len(go_terms)}")

# Display by ontology category
for term in go_terms {
  print(f"  [{term.category}] {term.id}: {term.name}")
}

# Filter to just molecular function terms
let mol_func = go_terms |> filter(|t| t.category == "molecular_function")
print(f"\nMolecular functions ({len(mol_func)}):")
for mf in mol_func {
  print(f"  {mf.id}: {mf.name}")
}

# Biological process terms
let bio_proc = go_terms |> filter(|t| t.category == "biological_process")
print(f"\nBiological processes ({len(bio_proc)}):")
for bp in bio_proc {
  print(f"  {bp.id}: {bp.name}")
}

Step 7 — PDB Structure Lookup

# requires: internet connection
# Look up PDB structure for the TP53 DNA-binding domain
# 1TUP = p53 core domain bound to DNA (Cho et al., 1994)
let pdb = pdb_entry("1TUP")

print(f"\n=== PDB Structure ===")
print(f"ID:         {pdb.id}")
print(f"Title:      {pdb.title}")
print(f"Resolution: {pdb.resolution} A")
print(f"Method:     {pdb.method}")

# List entities (chains / molecules)
print(f"\nEntities ({len(pdb.entities)}):")
for entity in pdb.entities {
  print(f"  {entity}")
}

# List ligands
print(f"\nLigands ({len(pdb.ligands)}):")
for lig in pdb.ligands {
  print(f"  {lig}")
}

# Search PDB for more TP53 structures
# pdb_search returns a list of PDB IDs; show only the first 10
let tp53_structures = pdb_search("TP53")
print(f"\nTP53 PDB hits: {len(tp53_structures)}")
for id in tp53_structures |> take(10) {
  print(f"  {id}")
}

Step 8 — Comparing Multiple Proteins

# requires: internet connection
# Fetch a family of related proteins
let accessions = ["P04637", "Q00987", "O15350"]  # TP53, MDM2, TP73

let proteins = accessions |> map(|acc| uniprot_entry(acc))

# Fetch sequences
# Same FASTA-to-Protein recipe as Step 1: drop header, join wrapped lines
let sequences = accessions |> map(|acc| {
  let fasta = uniprot_fasta(acc)
  protein(fasta |> split("\n") |> drop(1) |> join(""))
})

# Build a comparison table
# enumerate() pairs each entry with its index; the two-parameter lambda
# receives (index, entry), and the index pairs entries with `sequences`
let comparison = proteins
  |> enumerate()
  |> map(|i, p| {
    let seq = sequences[i]
    let residues = kmers(seq, 1)
    let total = len(residues)
    let charged = residues |> count_if(|r| {
      any(["D", "E", "K", "R", "H"], |a| a == r)
    })
    {
      gene:        p.gene_names,
      accession:   p.accession,
      length:      seq_len(seq),
      charged_pct: round(charged / total * 100.0, 1),
      organism:    p.organism,
    }
  })
  |> to_table()

print("\n=== Protein Family Comparison ===")
print(comparison)

# Compare motif counts across the family
# Same patterns as Step 4: NLS, SP/TP phospho, N-glycosylation, cysteines
let motif_comparison = proteins
  |> enumerate()
  |> map(|i, p| {
    let seq = sequences[i]
    {
      gene:    p.gene_names,
      nls:     len(find_motif(seq, "[KR]{4,}")),
      phospho: len(find_motif(seq, "[ST]P")),
      nglyc:   len(find_motif(seq, "N[^P][ST][^P]")),
      cys:     len(find_motif(seq, "C")),
    }
  })
  |> to_table()

print("\n=== Motif Comparison ===")
print(motif_comparison)

Step 9 — K-mer Analysis

# Analyze dipeptide (2-mer) frequencies in TP53
# (`seq` is still the TP53 Protein from Step 1)
let dipeptides = kmers(seq, 2)
let dp_types = unique(dipeptides)

# Count occurrences of each distinct dipeptide, most frequent first
let dp_counts = dp_types |> map(|dp| {
  let n = dipeptides |> count_if(|d| d == dp)
  { dipeptide: dp, count: n }
}) |> sort_by(|a, b| b.count - a.count)

print("\n=== Top 20 Dipeptides ===")
for item in dp_counts |> take(20) {
  print(f"  {item.dipeptide}: {item.count}")
}

# Tripeptide (3-mer) analysis — same counting pattern with k = 3
let tripeptides = kmers(seq, 3)
let tp_types = unique(tripeptides)

let tp_counts = tp_types |> map(|tp| {
  let n = tripeptides |> count_if(|t| t == tp)
  { tripeptide: tp, count: n }
}) |> sort_by(|a, b| b.count - a.count)

print(f"\nUnique tripeptides: {len(tp_types)}")
print("Top 10 tripeptides:")
for item in tp_counts |> take(10) {
  print(f"  {item.tripeptide}: {item.count}")
}

# Visualize dipeptide frequency distribution
let dp_table = dp_counts |> to_table()
histogram(col(dp_table, "count"), {
  title:  "Dipeptide Frequency Distribution — TP53",
  xlabel: "Occurrences",
  ylabel: "Number of dipeptides",
  bins:   15,
  color:  "#8b5cf6",
  output: "results/dipeptide_dist.svg",
})

Step 10 — Complete Protein Report

# requires: internet connection
# protein_report.bl — full protein analysis pipeline

# Run the complete analysis for one UniProt accession: fetch the entry,
# sequence, features, and GO terms; summarize amino acid composition,
# motif hits, domains, and GO counts; save a structured JSON report to
# results/<accession>_report.json. Returns the report record so callers
# can aggregate results across proteins.
fn analyze_protein(accession) {
  print(f"\n{'=' * 60}")
  print(f"Analyzing {accession}")
  print(f"{'=' * 60}")

  # Fetch all data from UniProt
  let entry    = uniprot_entry(accession)
  let fasta    = uniprot_fasta(accession)
  let features = uniprot_features(accession)
  let go_terms = uniprot_go(accession)

  # Build the protein sequence (drop the FASTA header, join wrapped lines)
  let seq = protein(fasta |> split("\n") |> drop(1) |> join(""))

  print(f"Gene:     {entry.gene_names}")
  print(f"Organism: {entry.organism}")
  print(f"Length:   {seq_len(seq)} aa")

  # Amino acid composition (same counting pattern as Step 2)
  let residues = kmers(seq, 1)
  let total = len(residues)
  let aa_types = unique(residues)
  let composition = aa_types |> map(|aa| {
    let n = residues |> count_if(|r| r == aa)
    { amino_acid: aa, count: n, pct: round(n / total * 100.0, 1) }
  }) |> sort_by(|a, b| b.count - a.count)

  print(f"\nTop 5 residues:")
  for item in composition |> take(5) {
    print(f"  {item.amino_acid}: {item.count} ({item.pct}%)")
  }

  # Motif analysis (same patterns as Step 4)
  let nls     = find_motif(seq, "[KR]{4,}")
  let phospho = find_motif(seq, "[ST]P")
  let nglyc   = find_motif(seq, "N[^P][ST][^P]")

  print(f"\nMotifs:")
  print(f"  NLS-like regions:       {len(nls)}")
  print(f"  Phosphorylation sites:  {len(phospho)}")
  print(f"  N-glycosylation sites:  {len(nglyc)}")

  # Domain annotations
  let domains = features |> filter(|f| f.type == "Domain")
  print(f"\nDomains ({len(domains)}):")
  for d in domains {
    print(f"  {d.description}: {d.location}")
  }

  # GO summary
  let mf = go_terms |> filter(|t| t.category == "molecular_function")
  let bp = go_terms |> filter(|t| t.category == "biological_process")
  print(f"\nGO: {len(mf)} molecular functions, {len(bp)} biological processes")

  # Save structured report
  let report = {
    accession:   entry.accession,
    gene:        entry.gene_names,
    organism:    entry.organism,
    length:      seq_len(seq),
    composition: composition,
    motifs: {
      nls:     len(nls),
      phospho: len(phospho),
      nglyc:   len(nglyc),
    },
    domains:  domains |> map(|d| d.description),
    go_terms: len(go_terms),
  }

  let filename = f"results/{accession}_report.json"
  write_json(filename, report)
  # Fixed: interpolate the actual output path instead of the literal
  # placeholder text that was printed before
  print(f"\nReport saved to {filename}")
  # The final expression is the function's return value
  report
}

# Analyze TP53 and a related protein
let tp53_report = analyze_protein("P04637")
let mdm2_report = analyze_protein("Q00987")

Next Steps

Learn how to handle large datasets efficiently in the Streaming Large Files tutorial.