Advanced ~35 minutes

Multi-species Comparative Analysis

Compare genes and proteins across species using BioLang's API clients, sequence analysis functions, and visualization tools. This tutorial covers cross-species gene lookup, sequence property comparison, k-mer analysis, dotplot visualization, and rendering phylogenetic trees from Newick strings.

What you will learn

  • Looking up orthologous genes across species with ensembl_symbol()
  • Fetching and comparing protein/CDS sequences across species
  • Comparing sequence properties: length, GC content, amino acid composition
  • K-mer analysis to measure sequence similarity
  • Pairwise sequence comparison with dotplot()
  • Rendering phylogenetic trees from Newick strings with phylo_tree()
  • Building comparison tables and visualizing differences
Run this tutorial: Download multispecies.bl and run it with bl run examples/tutorials/multispecies.bl

Prerequisites

This tutorial assumes familiarity with the Querying Databases and Protein Analysis tutorials. You should be comfortable with sequences, tables, and database queries.

Step 1 — Looking Up Genes Across Species

Orthologs are genes in different species that evolved from a common ancestral gene through speciation. We can look up the same gene in multiple species using ensembl_symbol(), which queries the Ensembl REST API.

# multispecies.bl — cross-species comparative analysis
# requires: NCBI_API_KEY (optional, increases rate limit)

# Define species and their gene symbols for TP53
# (gene symbols differ by species naming conventions)
let gene_lookups = [
  { species: "homo_sapiens",             symbol: "TP53" },
  { species: "mus_musculus",             symbol: "Trp53" },
  { species: "rattus_norvegicus",        symbol: "Tp53" },
  { species: "gallus_gallus",            symbol: "TP53" },
  { species: "danio_rerio",             symbol: "tp53" },
]

# Look up TP53 orthologs in each species
let orthologs = gene_lookups |> map(|entry| {
  let gene = ensembl_symbol(entry.species, entry.symbol)
  { species: entry.species, gene_id: gene.id, symbol: gene.symbol,
    description: gene.description }
})

print("=== TP53 Orthologs ===")
for orth in orthologs {
  print(f"{orth.species}: {orth.gene_id} ({orth.symbol})")
}
# homo_sapiens: ENSG00000141510 (TP53)
# mus_musculus: ENSMUSG00000059552 (Trp53)
# ...

Step 2 — Fetching and Comparing Protein Sequences

With the Ensembl gene IDs in hand, we can fetch protein sequences for each ortholog and compare basic properties like length and amino acid composition.

# Fetch protein sequences for each ortholog
let proteins = orthologs |> map(|o| {
  let seq_data = ensembl_sequence(o.gene_id, "protein")
  let seq = protein"{seq_data.seq}"
  { species: o.species, symbol: o.symbol, sequence: seq }
})

# Build a comparison table of protein properties
let protein_table = proteins |> map(|p| {
  let counts = base_counts(p.sequence)
  {
    species:  p.species,
    symbol:   p.symbol,
    length:   len(p.sequence),
  }
}) |> to_table()

print("\n=== Protein Sequence Lengths ===")
print(protein_table)
# species              symbol  length
# homo_sapiens         TP53    393
# mus_musculus         Trp53   387
# rattus_norvegicus    Tp53    391
# gallus_gallus        TP53    367
# danio_rerio          tp53    373

Step 3 — CDS Comparison and GC Content

Coding DNA sequences reveal differences in codon usage and GC content that can reflect evolutionary pressures. Let's fetch the CDS for each ortholog and compare them.

# Fetch CDS (coding DNA) sequences
let cds_data = orthologs |> map(|o| {
  let seq_data = ensembl_sequence(o.gene_id, "cdna")
  let seq = dna"{seq_data.seq}"
  { species: o.species, symbol: o.symbol, sequence: seq }
})

# Compare GC content and length across species
let cds_table = cds_data |> map(|c| {
  let counts = base_counts(c.sequence)
  {
    species:    c.species,
    cds_length: len(c.sequence),
    gc_content: round(gc_content(c.sequence), 3),
    a_count:    counts.A,
    t_count:    counts.T,
    g_count:    counts.G,
    c_count:    counts.C,
  }
}) |> to_table()

print("\n=== CDS Properties ===")
print(cds_table)

# Visualize GC content across species
bar_chart(cds_table, {
  x:      "species",
  y:      "gc_content",
  color:  "#8b5cf6",
  title:  "TP53 GC Content Across Species",
  ylabel: "GC Content",
  output: "results/gc_content_comparison.svg",
})

Step 4 — K-mer Analysis for Sequence Similarity

K-mer frequency profiles provide a quick, alignment-free way to estimate how similar two sequences are. Sequences from closely related species will share more k-mers than distantly related ones.

# Generate k-mer profiles for each CDS
let k = 4
let kmer_profiles = cds_data |> map(|c| {
  let kmer_list = kmers(c.sequence, k)
  let freqs = frequencies(kmer_list)
  { species: c.species, freqs: freqs, total: len(kmer_list) }
})

# Compute shared k-mer fractions between human and each species
let human_profile = kmer_profiles[0]
let human_kmers = keys(human_profile.freqs)

let kmer_similarity = kmer_profiles |> map(|p| {
  let shared = human_kmers |> filter(|km| p.freqs[km] != nil) |> len()
  {
    species:          p.species,
    total_kmers:      p.total,
    unique_kmers:     len(keys(p.freqs)),
    shared_with_human: shared,
    jaccard:          round(shared / len(unique(flatten([human_kmers, keys(p.freqs)]))), 3),
  }
}) |> to_table()

print("\n=== K-mer Similarity to Human ===")
print(kmer_similarity)

# Visualize k-mer similarity
bar_chart(kmer_similarity, {
  x:      "species",
  y:      "jaccard",
  color:  "#22d3ee",
  title:  f"TP53 {k}-mer Jaccard Similarity to Human",
  ylabel: "Jaccard Index",
  output: "results/kmer_similarity.svg",
})

Step 5 — Pairwise Dotplot Comparison

Dotplots visually reveal regions of similarity (and divergence) between two sequences. Diagonal lines indicate conserved regions; gaps in the diagonal suggest insertions, deletions, or rearrangements.

# Compare human TP53 protein with each other species using dotplots
let human_protein = proteins[0].sequence

# Human vs Mouse
dotplot(human_protein, proteins[1].sequence, {
  title:   "TP53 Dotplot: Human vs Mouse",
  xlabel:  "Human TP53",
  ylabel:  "Mouse Trp53",
  word:    3,
  output:  "results/dotplot_human_mouse.svg",
})

# Human vs Zebrafish (more divergent)
dotplot(human_protein, proteins[4].sequence, {
  title:   "TP53 Dotplot: Human vs Zebrafish",
  xlabel:  "Human TP53",
  ylabel:  "Zebrafish tp53",
  word:    3,
  output:  "results/dotplot_human_zebrafish.svg",
})

print("\nDotplots saved — compare the diagonals to see")
print("how human-mouse conservation differs from human-zebrafish")

Step 6 — Gene Family Presence Across Species

# The p53 family includes TP53, TP63, and TP73
# Check which family members exist in each species
let family_queries = [
  { species: "homo_sapiens",       genes: ["TP53", "TP63", "TP73"] },
  { species: "mus_musculus",       genes: ["Trp53", "Trp63", "Trp73"] },
  { species: "rattus_norvegicus",  genes: ["Tp53", "Tp63", "Tp73"] },
  { species: "gallus_gallus",      genes: ["TP53", "TP63", "TP73"] },
  { species: "danio_rerio",       genes: ["tp53", "tp63", "tp73"] },
]

let family_labels = ["TP53", "TP63", "TP73"]

# Try to look up each gene; missing genes return nil
let family_data = family_queries |> map(|entry| {
  let results = entry.genes |> enumerate() |> map(|pair| {
    let i = pair[0]
    let symbol = pair[1]
    let result = try { ensembl_symbol(entry.species, symbol) }
    { species: entry.species, gene: family_labels[i],
      present: if result != nil then "Yes" else "No" }
  })
  results
}) |> flatten() |> to_table()

print("\n=== p53 Family Presence ===")
let family_wide = family_data
  |> pivot_wider(names_from: "gene", values_from: "present")
print(family_wide)

Step 7 — Comparing Genomic Loci

# Compare the TP53 genomic locus between species
let locus_table = orthologs |> map(|o| {
  let gene = ensembl_symbol(o.species, o.symbol)
  let genomic = ensembl_sequence(gene.id, "genomic")
  {
    species:      o.species,
    chromosome:   gene.chromosome,
    start:        gene.start,
    end_pos:      gene.end,
    locus_size:   gene.end - gene.start,
    genomic_len:  len(genomic.seq),
  }
}) |> to_table()

print("\n=== TP53 Locus Comparison ===")
print(locus_table)

# Which species has the largest/smallest TP53 locus?
let sorted_by_size = locus_table |> arrange("locus_size", "desc")
print(f"\nLargest locus:  {sorted_by_size[0].species} ({sorted_by_size[0].locus_size} bp)")
let n = len(sorted_by_size) - 1
print(f"Smallest locus: {sorted_by_size[n].species} ({sorted_by_size[n].locus_size} bp)")

# Visualize locus sizes
bar_chart(locus_table, {
  x:      "species",
  y:      "locus_size",
  color:  "#f59e0b",
  title:  "TP53 Genomic Locus Size by Species",
  ylabel: "Locus Size (bp)",
  output: "results/locus_sizes.svg",
})

Step 8 — Motif Search Across Species

Searching for known functional motifs across orthologous sequences can reveal which motifs are conserved and which have been lost or gained during evolution.

# Search for the p53 DNA-binding domain core motif
# The RCATG motif is part of the p53 response element
let motif_pattern = "CATG"

let motif_results = cds_data |> map(|c| {
  let hits = find_motif(c.sequence, motif_pattern)
  { species: c.species, motif_count: len(hits), cds_length: len(c.sequence),
    density: round(len(hits) / len(c.sequence) * 1000, 2) }
}) |> to_table()

print("\n=== CATG Motif Density in TP53 CDS ===")
print(motif_results)

# Look for start codon context (Kozak-like sequence)
print("\n=== Start Codon Context ===")
for c in cds_data {
  let context = substr(c.sequence, 0, 12)
  print(f"  {c.species}: {context}")
}

Step 9 — Visualizing a Phylogenetic Tree

BioLang can render phylogenetic trees from Newick-format strings using phylo_tree(). The tree itself must be computed externally (e.g., using MEGA, RAxML, IQ-TREE, or FastTree) or obtained from a published source.

# Known phylogeny of our study species (from published data)
# This Newick string encodes the accepted vertebrate phylogeny
# with approximate divergence-based branch lengths
let newick = "((homo_sapiens:0.01,(mus_musculus:0.09,rattus_norvegicus:0.09):0.04):0.1,gallus_gallus:0.2,danio_rerio:0.4);"

# Render the tree
phylo_tree(newick, {
  title:     "TP53 Species Phylogeny",
  scale_bar: true,
  output:    "results/tp53_phylogeny.svg",
})

print("\nPhylogenetic tree saved to results/tp53_phylogeny.svg")

Note: Phylogenetic inference

BioLang's phylo_tree() function visualizes trees from Newick strings — it does not compute phylogenetic inference. For full phylogenetic analysis (neighbor-joining, maximum likelihood, Bayesian), export your sequences with write_fasta() and use dedicated tools like MEGA, RAxML, IQ-TREE, or MrBayes. You can then load the resulting Newick file back into BioLang for visualization.

Step 10 — Exporting for External Analysis

For analyses that require multiple sequence alignment, dN/dS computation, or tree inference, export your collected sequences in standard formats for use with external tools.

# Export protein sequences as FASTA for external alignment
let fasta_records = proteins |> map(|p| {
  { id: p.species, description: f"TP53 ortholog ({p.symbol})", seq: p.sequence }
})
write_fasta(fasta_records, "results/tp53_proteins.fasta")
print("Protein sequences written to results/tp53_proteins.fasta")

# Export CDS sequences for codon-based analysis (dN/dS)
let cds_records = cds_data |> map(|c| {
  { id: c.species, description: f"TP53 CDS ({c.symbol})", seq: c.sequence }
})
write_fasta(cds_records, "results/tp53_cds.fasta")
print("CDS sequences written to results/tp53_cds.fasta")

# Export the comparison data as CSV
write_csv(cds_table, "results/tp53_cds_comparison.csv")
write_csv(kmer_similarity, "results/tp53_kmer_similarity.csv")
print("Comparison tables written to results/")

# Export summary as JSON
let summary = {
  gene:           "TP53",
  n_species:      len(orthologs),
  species:        orthologs |> map(|o| o.species),
  mean_protein_len: round(mean(proteins |> map(|p| len(p.sequence))), 1),
  gc_range:       [
    min(cds_data |> map(|c| gc_content(c.sequence))),
    max(cds_data |> map(|c| gc_content(c.sequence))),
  ],
}
write_json(summary, "results/tp53_summary.json")
print("Summary written to results/tp53_summary.json")

Step 11 — Complete Multi-Gene Comparison

Let's wrap everything into a reusable function that compares a gene across species and produces a comprehensive report.

# requires: NCBI_API_KEY (optional, increases rate limit)

fn compare_gene_across_species(gene_symbols, gene_label, output_dir) {
  print(f"\n=== Comparing {gene_label} ===")

  # 1. Look up genes
  let species_list = keys(gene_symbols)
  let genes = species_list |> map(|sp| {
    let info = ensembl_symbol(sp, gene_symbols[sp])
    { species: sp, gene_id: info.id, symbol: info.symbol }
  })
  print(f"  Found in {len(genes)} species")

  # 2. Fetch protein sequences
  let prots = genes |> map(|g| {
    let seq = ensembl_sequence(g.gene_id, "protein")
    { species: g.species, symbol: g.symbol, sequence: protein"{seq.seq}" }
  })

  # 3. Fetch CDS and compute GC content
  let cds = genes |> map(|g| {
    let seq = ensembl_sequence(g.gene_id, "cdna")
    let s = dna"{seq.seq}"
    { species: g.species, gc: round(gc_content(s), 3), cds_len: len(s) }
  })

  # 4. Build comparison table
  let comparison = zip(prots, cds) |> map(|pair| {
    { species: pair[0].species, symbol: pair[0].symbol,
      protein_len: len(pair[0].sequence), cds_len: pair[1].cds_len,
      gc_content: pair[1].gc }
  }) |> to_table()
  print(comparison)

  # 5. Export sequences for external phylogenetic analysis
  let records = prots |> map(|p| {
    { id: p.species, description: f"{gene_label} ({p.symbol})", seq: p.sequence }
  })
  write_fasta(records, f"{output_dir}/{gene_label}_proteins.fasta")

  # 6. Pairwise dotplot: first species vs last (most divergent)
  dotplot(prots[0].sequence, prots[len(prots) - 1].sequence, {
    title:  f"{gene_label}: {prots[0].species} vs {prots[len(prots) - 1].species}",
    xlabel: prots[0].species,
    ylabel: prots[len(prots) - 1].species,
    word:   3,
    output: f"{output_dir}/{gene_label}_dotplot.svg",
  })

  write_csv(comparison, f"{output_dir}/{gene_label}_comparison.csv")
  print(f"  Results saved to {output_dir}/")
}

# Run for multiple tumor suppressor genes
let genes_to_compare = [
  { label: "TP53", symbols: { homo_sapiens: "TP53", mus_musculus: "Trp53",
    rattus_norvegicus: "Tp53", gallus_gallus: "TP53", danio_rerio: "tp53" } },
  { label: "BRCA1", symbols: { homo_sapiens: "BRCA1", mus_musculus: "Brca1",
    rattus_norvegicus: "Brca1", gallus_gallus: "BRCA1", danio_rerio: "brca1" } },
  { label: "RB1", symbols: { homo_sapiens: "RB1", mus_musculus: "Rb1",
    rattus_norvegicus: "Rb1", gallus_gallus: "RB1", danio_rerio: "rb1" } },
]

for gene in genes_to_compare {
  compare_gene_across_species(gene.symbols, gene.label, "results/comparative")
}

# Visualize the known species tree for context
let species_tree = "((homo_sapiens:0.01,(mus_musculus:0.09,rattus_norvegicus:0.09):0.04):0.1,gallus_gallus:0.2,danio_rerio:0.4);"

phylo_tree(species_tree, {
  title:     "Species Phylogeny (3-gene comparison)",
  scale_bar: true,
  output:    "results/comparative/species_tree.svg",
})

print("\n=== All comparisons complete ===")

Next Steps

  • For phylogenetic inference, export your FASTA files and use IQ-TREE (iqtree2 -s proteins.fasta -m JTT+G -bb 1000), then load the resulting .treefile back into BioLang with phylo_tree() for visualization.
  • For multiple sequence alignment, use MUSCLE or MAFFT on the exported FASTA.
  • Learn how to extend BioLang with custom functionality in the Building Custom Plugins tutorial.