Level: Intermediate · Time: ~30 minutes

Protein Analysis

Analyze proteins using UniProt and PDB data. Fetch sequences and annotations, search for motifs, explore amino acid composition with k-mers and string ops, examine domain annotations, and visualize results.

What you will learn

  • Fetching protein data from UniProt (entry, FASTA, features, GO terms)
  • Creating Protein values and inspecting their properties
  • Searching for sequence motifs with find_motif()
  • Analyzing amino acid k-mers and composition
  • Exploring domain annotations and GO terms
  • Looking up PDB structures
  • Comparing multiple proteins with tables and plots
Run this tutorial: Download protein-analysis.bl and run it with bl run examples/tutorials/protein-analysis.bl

Step 1 — Fetching a Protein from UniProt

# requires: internet connection
# protein.bl — protein analysis
#
# Step 1: fetch a UniProt entry and its FASTA sequence, then wrap the raw
# amino-acid string as a Protein value for downstream analysis.

# Fetch TP53 (tumor protein p53) from UniProt
let entry = uniprot_entry("P04637")

print(f"Accession:   {entry.accession}")
print(f"Name:        {entry.name}")
print(f"Organism:    {entry.organism}")
print(f"Length:      {entry.sequence_length} aa")
print(f"Genes:       {entry.gene_names}")
print(f"Function:    {entry.function}")

# Get the FASTA sequence
let fasta = uniprot_fasta("P04637")
print(f"FASTA (first 80 chars): {substr(fasta, 0, 80)}...")

# Extract the raw sequence from FASTA (skip header line), wrap as Protein
# drop(1) removes the ">..." FASTA header line; join("") re-joins the
# line-wrapped sequence into one continuous string.
let raw_seq = fasta |> split("\n") |> drop(1) |> join("")
let seq = protein(raw_seq)
print(f"Sequence type: {typeof(seq)}")    # Protein
print(f"Length:        {seq_len(seq)} aa")

# You can also create protein sequences directly
let my_peptide = protein("MASKFPGIDRS")
print(f"Peptide length: {seq_len(my_peptide)}")

Step 2 — Amino Acid Composition via K-mers

# Use kmers(seq, 1) to get every individual amino acid
# (`seq` is the TP53 Protein value built in Step 1)
let residues = kmers(seq, 1)
let total = len(residues)

# Count each amino acid using map/reduce on unique residues
let aa_types = unique(residues)
let counts = aa_types |> map(|aa| {
  let n = residues |> count_if(|r| r == aa)
  # NOTE(review): assumes `/` performs float division here — confirm for this DSL
  { amino_acid: aa, count: n, pct: n / total * 100.0 }
})

# Sort by frequency (descending) and display
# `|> into sorted` binds the piped result to a new variable `sorted`,
# which Step 3 reuses to build a table
counts |> sort_by(|a, b| b.count - a.count) |> into sorted

print("\n=== Amino Acid Composition ===")
for item in sorted {
  print(f"  {item.amino_acid}: {item.count} ({round(item.pct, 1)}%)")
}

# Group into property classes manually
# (simple three-way classification; H is counted as charged here)
let charged_aa = ["D", "E", "K", "R", "H"]
let polar_aa   = ["S", "T", "N", "Q", "Y", "C"]
let hydro_aa   = ["A", "V", "I", "L", "M", "F", "W", "P"]

# Membership test: any(list, predicate) is true if any element matches
let charged = residues |> count_if(|r| any(charged_aa, |a| a == r))
let polar   = residues |> count_if(|r| any(polar_aa, |a| a == r))
let hydro   = residues |> count_if(|r| any(hydro_aa, |a| a == r))

print(f"\nCharged:     {charged} ({round(charged / total * 100.0, 1)}%)")
print(f"Polar:       {polar} ({round(polar / total * 100.0, 1)}%)")
print(f"Hydrophobic: {hydro} ({round(hydro / total * 100.0, 1)}%)")

Step 3 — Amino Acid Composition as a Table

# Build a table of amino acid counts for downstream analysis
# (`sorted` is the composition list produced in Step 2)
let aa_table = sorted |> to_table()
print(aa_table)

# Visualize with a histogram
# Distribution of the "count" column: how many amino acid types occur N times
histogram(col(aa_table, "count"), {
  title:  "Amino Acid Frequency — TP53",
  xlabel: "Count",
  ylabel: "Number of amino acid types",
  bins:   10,
  color:  "#8b5cf6",
  output: "results/aa_histogram.svg",
})

# Bar-style plot of composition
# One bar per amino acid, height = percentage of the sequence
plot(aa_table, {
  x:      "amino_acid",
  y:      "pct",
  title:  "TP53 Amino Acid Composition (%)",
  xlabel: "Amino Acid",
  ylabel: "Percentage",
  color:  "#06b6d4",
  output: "results/aa_composition.svg",
})

Step 4 — Motif Searching

# find_motif(seq, pattern) returns a list of start positions (integers)
# NOTE(review): whether positions are 0- or 1-based is not shown here —
# confirm before reporting residue coordinates against UniProt numbering.

# Nuclear localization signals: runs of 4+ basic residues (K or R)
let nls_hits = find_motif(seq, "[KR]{4,}")
print(f"\n=== Nuclear Localization Signals ===")
print(f"Found {len(nls_hits)} NLS-like regions at positions: {nls_hits}")

# N-glycosylation sites: N-{P}-[ST]-{P}
let nglyc = find_motif(seq, "N[^P][ST][^P]")
print(f"\nPotential N-glycosylation sites: {len(nglyc)}")
for pos in nglyc {
  # Extract the 4-residue motif at each position
  # (substr is applied to the Protein value `seq` here, not a plain string)
  let motif_seq = substr(seq, pos, 4)
  print(f"  Position {pos}: {motif_seq}")
}

# Phosphorylation sites: S or T followed by P (minimal CDK motif)
let phospho = find_motif(seq, "[ST]P")
print(f"\nPotential phosphorylation sites (SP/TP): {len(phospho)}")
for pos in phospho {
  let motif_seq = substr(seq, pos, 2)
  print(f"  Position {pos}: {motif_seq}")
}

# Cysteine residues (important for disulfide bonds)
# A single-letter pattern matches every occurrence of that residue
let cys_positions = find_motif(seq, "C")
print(f"\nCysteine residues: {len(cys_positions)} at positions {cys_positions}")

Step 5 — UniProt Feature Annotations

# requires: internet connection
# Fetch annotated domains and features from UniProt
# Each feature record carries at least: type, description, location
let features = uniprot_features("P04637")
print(f"Total features: {len(features)}")

# Filter for domains
let domains = features |> filter(|f| f.type == "Domain")
print("\n=== Annotated Domains ===")
for d in domains {
  print(f"  {d.description}: {d.location}")
}

# Filter for active/binding sites
let sites = features |> filter(|f| {
  f.type == "Binding site" or f.type == "Active site" or f.type == "Metal binding"
})
print("\n=== Binding / Active Sites ===")
for s in sites {
  print(f"  [{s.type}] {s.description}: {s.location}")
}

# Show all feature types present
let feature_types = features |> map(|f| f.type) |> unique()
print(f"\nFeature types: {feature_types}")

# Count features by type, most common first
let type_counts = feature_types |> map(|t| {
  let n = features |> count_if(|f| f.type == t)
  { type: t, count: n }
}) |> sort_by(|a, b| b.count - a.count)

print("\n=== Feature Type Summary ===")
for tc in type_counts {
  print(f"  {tc.type}: {tc.count}")
}

Step 6 — Gene Ontology Terms

# requires: internet connection
# Fetch GO annotations for TP53
# Each term carries: category (GO aspect), id (GO:nnnnnnn), name
let go_terms = uniprot_go("P04637")
print(f"GO terms: {len(go_terms)}")

# Display by ontology category
for term in go_terms {
  print(f"  [{term.category}] {term.id}: {term.name}")
}

# Filter to just molecular function terms
let mol_func = go_terms |> filter(|t| t.category == "molecular_function")
print(f"\nMolecular functions ({len(mol_func)}):")
for mf in mol_func {
  print(f"  {mf.id}: {mf.name}")
}

# Biological process terms
let bio_proc = go_terms |> filter(|t| t.category == "biological_process")
print(f"\nBiological processes ({len(bio_proc)}):")
for bp in bio_proc {
  print(f"  {bp.id}: {bp.name}")
}

Step 7 — PDB Structure Lookup

# requires: internet connection
# Look up PDB structure for the TP53 DNA-binding domain
# 1TUP = p53 core domain bound to DNA (Cho et al., 1994)
let pdb = pdb_entry("1TUP")

print(f"\n=== PDB Structure ===")
print(f"ID:         {pdb.id}")
print(f"Title:      {pdb.title}")
print(f"Resolution: {pdb.resolution} A")
print(f"Method:     {pdb.method}")

# List entities (chains / molecules)
print(f"\nEntities ({len(pdb.entities)}):")
for entity in pdb.entities {
  print(f"  {entity}")
}

# List ligands
print(f"\nLigands ({len(pdb.ligands)}):")
for lig in pdb.ligands {
  print(f"  {lig}")
}

# Search PDB for more TP53 structures
# pdb_search returns a list of PDB IDs; show only the first 10
let tp53_structures = pdb_search("TP53")
print(f"\nTP53 PDB hits: {len(tp53_structures)}")
for id in tp53_structures |> take(10) {
  print(f"  {id}")
}

Step 8 — Comparing Multiple Proteins

# requires: internet connection
# Fetch a family of related proteins
let accessions = ["P04637", "Q00987", "O15350"]  # TP53, MDM2, TP73

let proteins = accessions |> map(|acc| uniprot_entry(acc))

# Fetch sequences
# Same FASTA-to-Protein recipe as Step 1: drop header, join wrapped lines
let sequences = accessions |> map(|acc| {
  let fasta = uniprot_fasta(acc)
  protein(fasta |> split("\n") |> drop(1) |> join(""))
})

# Build a comparison table
# enumerate() pairs each entry with its index; the two-parameter lambda
# receives (index, entry), and the index pairs entries with `sequences`
let comparison = proteins
  |> enumerate()
  |> map(|i, p| {
    let seq = sequences[i]
    let residues = kmers(seq, 1)
    let total = len(residues)
    let charged = residues |> count_if(|r| {
      any(["D", "E", "K", "R", "H"], |a| a == r)
    })
    {
      gene:        p.gene_names,
      accession:   p.accession,
      length:      seq_len(seq),
      charged_pct: round(charged / total * 100.0, 1),
      organism:    p.organism,
    }
  })
  |> to_table()

print("\n=== Protein Family Comparison ===")
print(comparison)

# Compare motif counts across the family
# Same patterns as Step 4: NLS, SP/TP phospho, N-glycosylation, cysteines
let motif_comparison = proteins
  |> enumerate()
  |> map(|i, p| {
    let seq = sequences[i]
    {
      gene:    p.gene_names,
      nls:     len(find_motif(seq, "[KR]{4,}")),
      phospho: len(find_motif(seq, "[ST]P")),
      nglyc:   len(find_motif(seq, "N[^P][ST][^P]")),
      cys:     len(find_motif(seq, "C")),
    }
  })
  |> to_table()

print("\n=== Motif Comparison ===")
print(motif_comparison)

Step 9 — K-mer Analysis

# Analyze dipeptide (2-mer) frequencies in TP53
# (`seq` is still the TP53 Protein from Step 1)
let dipeptides = kmers(seq, 2)
let dp_types = unique(dipeptides)

# Count occurrences of each distinct dipeptide, most frequent first
let dp_counts = dp_types |> map(|dp| {
  let n = dipeptides |> count_if(|d| d == dp)
  { dipeptide: dp, count: n }
}) |> sort_by(|a, b| b.count - a.count)

print("\n=== Top 20 Dipeptides ===")
for item in dp_counts |> take(20) {
  print(f"  {item.dipeptide}: {item.count}")
}

# Tripeptide (3-mer) analysis — same counting pattern with k = 3
let tripeptides = kmers(seq, 3)
let tp_types = unique(tripeptides)

let tp_counts = tp_types |> map(|tp| {
  let n = tripeptides |> count_if(|t| t == tp)
  { tripeptide: tp, count: n }
}) |> sort_by(|a, b| b.count - a.count)

print(f"\nUnique tripeptides: {len(tp_types)}")
print("Top 10 tripeptides:")
for item in tp_counts |> take(10) {
  print(f"  {item.tripeptide}: {item.count}")
}

# Visualize dipeptide frequency distribution
let dp_table = dp_counts |> to_table()
histogram(col(dp_table, "count"), {
  title:  "Dipeptide Frequency Distribution — TP53",
  xlabel: "Occurrences",
  ylabel: "Number of dipeptides",
  bins:   15,
  color:  "#8b5cf6",
  output: "results/dipeptide_dist.svg",
})

Step 10 — Complete Protein Report

# requires: internet connection
# protein_report.bl — full protein analysis pipeline

# Run the complete analysis for one UniProt accession: fetch the entry,
# sequence, features, and GO terms; summarize amino acid composition,
# motif hits, domains, and GO counts; save a structured JSON report to
# results/<accession>_report.json. Returns the report record so callers
# can aggregate results across proteins.
fn analyze_protein(accession) {
  print(f"\n{'=' * 60}")
  print(f"Analyzing {accession}")
  print(f"{'=' * 60}")

  # Fetch all data from UniProt
  let entry    = uniprot_entry(accession)
  let fasta    = uniprot_fasta(accession)
  let features = uniprot_features(accession)
  let go_terms = uniprot_go(accession)

  # Build the protein sequence (drop the FASTA header, join wrapped lines)
  let seq = protein(fasta |> split("\n") |> drop(1) |> join(""))

  print(f"Gene:     {entry.gene_names}")
  print(f"Organism: {entry.organism}")
  print(f"Length:   {seq_len(seq)} aa")

  # Amino acid composition (same counting pattern as Step 2)
  let residues = kmers(seq, 1)
  let total = len(residues)
  let aa_types = unique(residues)
  let composition = aa_types |> map(|aa| {
    let n = residues |> count_if(|r| r == aa)
    { amino_acid: aa, count: n, pct: round(n / total * 100.0, 1) }
  }) |> sort_by(|a, b| b.count - a.count)

  print(f"\nTop 5 residues:")
  for item in composition |> take(5) {
    print(f"  {item.amino_acid}: {item.count} ({item.pct}%)")
  }

  # Motif analysis (same patterns as Step 4)
  let nls     = find_motif(seq, "[KR]{4,}")
  let phospho = find_motif(seq, "[ST]P")
  let nglyc   = find_motif(seq, "N[^P][ST][^P]")

  print(f"\nMotifs:")
  print(f"  NLS-like regions:       {len(nls)}")
  print(f"  Phosphorylation sites:  {len(phospho)}")
  print(f"  N-glycosylation sites:  {len(nglyc)}")

  # Domain annotations
  let domains = features |> filter(|f| f.type == "Domain")
  print(f"\nDomains ({len(domains)}):")
  for d in domains {
    print(f"  {d.description}: {d.location}")
  }

  # GO summary
  let mf = go_terms |> filter(|t| t.category == "molecular_function")
  let bp = go_terms |> filter(|t| t.category == "biological_process")
  print(f"\nGO: {len(mf)} molecular functions, {len(bp)} biological processes")

  # Save structured report
  let report = {
    accession:   entry.accession,
    gene:        entry.gene_names,
    organism:    entry.organism,
    length:      seq_len(seq),
    composition: composition,
    motifs: {
      nls:     len(nls),
      phospho: len(phospho),
      nglyc:   len(nglyc),
    },
    domains:  domains |> map(|d| d.description),
    go_terms: len(go_terms),
  }

  let filename = f"results/{accession}_report.json"
  write_json(filename, report)
  # Fixed: interpolate the actual output path instead of the literal
  # placeholder text that was printed before
  print(f"\nReport saved to {filename}")
  # The final expression is the function's return value
  report
}

# Analyze TP53 and a related protein
let tp53_report = analyze_protein("P04637")
let mdm2_report = analyze_protein("Q00987")

Next Steps

Learn how to handle large datasets efficiently in the Streaming Large Files tutorial.