Multi-species Comparative Analysis
Compare genes and proteins across species using BioLang's API clients, sequence analysis functions, and visualization tools. This tutorial covers cross-species gene lookup, sequence property comparison, k-mer analysis, dotplot visualization, and rendering phylogenetic trees from Newick strings.
What you will learn
- Looking up orthologous genes across species with
ensembl_symbol() - Fetching and comparing protein/CDS sequences across species
- Comparing sequence properties: length, GC content, amino acid composition
- K-mer analysis to measure sequence similarity
- Pairwise sequence comparison with
dotplot() - Rendering phylogenetic trees from Newick strings with
phylo_tree() - Building comparison tables and visualizing differences
bl run examples/tutorials/multispecies.bl
Prerequisites
This tutorial assumes familiarity with the Querying Databases and Protein Analysis tutorials. You should be comfortable with sequences, tables, and database queries.
Step 1 — Looking Up Genes Across Species
Orthologs are genes in different species that evolved from a common ancestral gene
through speciation. We can look up the same gene in multiple species using
ensembl_symbol(), which queries the Ensembl REST API.
# multispecies.bl — cross-species comparative analysis
# requires: NCBI_API_KEY (optional, increases rate limit)
# Define species and their gene symbols for TP53
# (gene symbols differ by species naming conventions)
let gene_lookups = [
{ species: "homo_sapiens", symbol: "TP53" },
{ species: "mus_musculus", symbol: "Trp53" },
{ species: "rattus_norvegicus", symbol: "Tp53" },
{ species: "gallus_gallus", symbol: "TP53" },
{ species: "danio_rerio", symbol: "tp53" },
]
# Look up TP53 orthologs in each species
let orthologs = gene_lookups |> map(|entry| {
let gene = ensembl_symbol(entry.species, entry.symbol)
{ species: entry.species, gene_id: gene.id, symbol: gene.symbol,
description: gene.description }
})
print("=== TP53 Orthologs ===")
for orth in orthologs {
print(f"{orth.species}: {orth.gene_id} ({orth.symbol})")
}
# homo_sapiens: ENSG00000141510 (TP53)
# mus_musculus: ENSMUSG00000059552 (Trp53)
# ...
Step 2 — Fetching and Comparing Protein Sequences
With the Ensembl gene IDs in hand, we can fetch protein sequences for each ortholog and compare basic properties like length and amino acid composition.
# Fetch protein sequences for each ortholog
let proteins = orthologs |> map(|o| {
let seq_data = ensembl_sequence(o.gene_id, "protein")
let seq = protein"{seq_data.seq}"
{ species: o.species, symbol: o.symbol, sequence: seq }
})
# Build a comparison table of protein properties
let protein_table = proteins |> map(|p| {
let counts = base_counts(p.sequence)
{
species: p.species,
symbol: p.symbol,
length: len(p.sequence),
}
}) |> to_table()
print("\n=== Protein Sequence Lengths ===")
print(protein_table)
# species symbol length
# homo_sapiens TP53 393
# mus_musculus Trp53 387
# rattus_norvegicus Tp53 391
# gallus_gallus TP53 367
# danio_rerio tp53 373
Step 3 — CDS Comparison and GC Content
Coding DNA sequences reveal differences in codon usage and GC content that can reflect evolutionary pressures. Let's fetch the CDS for each ortholog and compare them.
# Fetch CDS (coding DNA) sequences
let cds_data = orthologs |> map(|o| {
let seq_data = ensembl_sequence(o.gene_id, "cdna")
let seq = dna"{seq_data.seq}"
{ species: o.species, symbol: o.symbol, sequence: seq }
})
# Compare GC content and length across species
let cds_table = cds_data |> map(|c| {
let counts = base_counts(c.sequence)
{
species: c.species,
cds_length: len(c.sequence),
gc_content: round(gc_content(c.sequence), 3),
a_count: counts.A,
t_count: counts.T,
g_count: counts.G,
c_count: counts.C,
}
}) |> to_table()
print("\n=== CDS Properties ===")
print(cds_table)
# Visualize GC content across species
bar_chart(cds_table, {
x: "species",
y: "gc_content",
color: "#8b5cf6",
title: "TP53 GC Content Across Species",
ylabel: "GC Content",
output: "results/gc_content_comparison.svg",
})
Step 4 — K-mer Analysis for Sequence Similarity
K-mer frequency profiles provide a quick, alignment-free way to estimate how similar two sequences are. Sequences from closely related species will share more k-mers than distantly related ones.
# Generate k-mer profiles for each CDS
let k = 4
let kmer_profiles = cds_data |> map(|c| {
let kmer_list = kmers(c.sequence, k)
let freqs = frequencies(kmer_list)
{ species: c.species, freqs: freqs, total: len(kmer_list) }
})
# Compute shared k-mer fractions between human and each species
let human_profile = kmer_profiles[0]
let human_kmers = keys(human_profile.freqs)
let kmer_similarity = kmer_profiles |> map(|p| {
let shared = human_kmers |> filter(|km| p.freqs[km] != nil) |> len()
{
species: p.species,
total_kmers: p.total,
unique_kmers: len(keys(p.freqs)),
shared_with_human: shared,
jaccard: round(shared / len(unique(flatten([human_kmers, keys(p.freqs)]))), 3),
}
}) |> to_table()
print("\n=== K-mer Similarity to Human ===")
print(kmer_similarity)
# Visualize k-mer similarity
bar_chart(kmer_similarity, {
x: "species",
y: "jaccard",
color: "#22d3ee",
title: f"TP53 {k}-mer Jaccard Similarity to Human",
ylabel: "Jaccard Index",
output: "results/kmer_similarity.svg",
})
Step 5 — Pairwise Dotplot Comparison
Dotplots visually reveal regions of similarity (and divergence) between two sequences. Diagonal lines indicate conserved regions; gaps in the diagonal suggest insertions, deletions, or rearrangements.
# Compare human TP53 protein with each other species using dotplots
let human_protein = proteins[0].sequence
# Human vs Mouse
dotplot(human_protein, proteins[1].sequence, {
title: "TP53 Dotplot: Human vs Mouse",
xlabel: "Human TP53",
ylabel: "Mouse Trp53",
word: 3,
output: "results/dotplot_human_mouse.svg",
})
# Human vs Zebrafish (more divergent)
dotplot(human_protein, proteins[4].sequence, {
title: "TP53 Dotplot: Human vs Zebrafish",
xlabel: "Human TP53",
ylabel: "Zebrafish tp53",
word: 3,
output: "results/dotplot_human_zebrafish.svg",
})
print("\nDotplots saved — compare the diagonals to see")
print("how human-mouse conservation differs from human-zebrafish")
Step 6 — Gene Family Presence Across Species
# The p53 family includes TP53, TP63, and TP73
# Check which family members exist in each species
let family_queries = [
{ species: "homo_sapiens", genes: ["TP53", "TP63", "TP73"] },
{ species: "mus_musculus", genes: ["Trp53", "Trp63", "Trp73"] },
{ species: "rattus_norvegicus", genes: ["Tp53", "Tp63", "Tp73"] },
{ species: "gallus_gallus", genes: ["TP53", "TP63", "TP73"] },
{ species: "danio_rerio", genes: ["tp53", "tp63", "tp73"] },
]
let family_labels = ["TP53", "TP63", "TP73"]
# Try to look up each gene; missing genes return nil
let family_data = family_queries |> map(|entry| {
let results = entry.genes |> enumerate() |> map(|pair| {
let i = pair[0]
let symbol = pair[1]
let result = try { ensembl_symbol(entry.species, symbol) }
{ species: entry.species, gene: family_labels[i],
present: if result != nil then "Yes" else "No" }
})
results
}) |> flatten() |> to_table()
print("\n=== p53 Family Presence ===")
let family_wide = family_data
|> pivot_wider(names_from: "gene", values_from: "present")
print(family_wide)
Step 7 — Comparing Genomic Loci
# Compare the TP53 genomic locus between species
let locus_table = orthologs |> map(|o| {
let gene = ensembl_symbol(o.species, o.symbol)
let genomic = ensembl_sequence(gene.id, "genomic")
{
species: o.species,
chromosome: gene.chromosome,
start: gene.start,
end_pos: gene.end,
locus_size: gene.end - gene.start,
genomic_len: len(genomic.seq),
}
}) |> to_table()
print("\n=== TP53 Locus Comparison ===")
print(locus_table)
# Which species has the largest/smallest TP53 locus?
let sorted_by_size = locus_table |> arrange("locus_size", "desc")
print(f"\nLargest locus: {sorted_by_size[0].species} ({sorted_by_size[0].locus_size} bp)")
let n = len(sorted_by_size) - 1
print(f"Smallest locus: {sorted_by_size[n].species} ({sorted_by_size[n].locus_size} bp)")
# Visualize locus sizes
bar_chart(locus_table, {
x: "species",
y: "locus_size",
color: "#f59e0b",
title: "TP53 Genomic Locus Size by Species",
ylabel: "Locus Size (bp)",
output: "results/locus_sizes.svg",
})
Step 8 — Motif Search Across Species
Searching for known functional motifs across orthologous sequences can reveal which motifs are conserved and which have been lost or gained during evolution.
# Search for the p53 DNA-binding domain core motif
# The RCATG motif is part of the p53 response element
let motif_pattern = "CATG"
let motif_results = cds_data |> map(|c| {
let hits = find_motif(c.sequence, motif_pattern)
{ species: c.species, motif_count: len(hits), cds_length: len(c.sequence),
density: round(len(hits) / len(c.sequence) * 1000, 2) }
}) |> to_table()
print("\n=== CATG Motif Density in TP53 CDS ===")
print(motif_results)
# Look for start codon context (Kozak-like sequence)
print("\n=== Start Codon Context ===")
for c in cds_data {
let context = substr(c.sequence, 0, 12)
print(f" {c.species}: {context}")
}
Step 9 — Visualizing a Phylogenetic Tree
BioLang can render phylogenetic trees from Newick-format strings using
phylo_tree(). The tree itself must be computed externally
(e.g., using MEGA, RAxML, IQ-TREE, or FastTree) or obtained from a
published source.
# Known phylogeny of our study species (from published data)
# This Newick string encodes the accepted vertebrate phylogeny
# with approximate divergence-based branch lengths
let newick = "((homo_sapiens:0.01,(mus_musculus:0.09,rattus_norvegicus:0.09):0.04):0.1,gallus_gallus:0.2,danio_rerio:0.4);"
# Render the tree
phylo_tree(newick, {
title: "TP53 Species Phylogeny",
scale_bar: true,
output: "results/tp53_phylogeny.svg",
})
print("\nPhylogenetic tree saved to results/tp53_phylogeny.svg")
Note: Phylogenetic inference
BioLang's phylo_tree() function visualizes trees from
Newick strings — it does not compute phylogenetic inference. For full
phylogenetic analysis (neighbor-joining, maximum likelihood, Bayesian), export
your sequences with write_fasta() and use dedicated tools like
MEGA, RAxML, IQ-TREE, or
MrBayes. You can then load the resulting Newick file back
into BioLang for visualization.
Step 10 — Exporting for External Analysis
For analyses that require multiple sequence alignment, dN/dS computation, or tree inference, export your collected sequences in standard formats for use with external tools.
# Export protein sequences as FASTA for external alignment
let fasta_records = proteins |> map(|p| {
{ id: p.species, description: f"TP53 ortholog ({p.symbol})", seq: p.sequence }
})
write_fasta(fasta_records, "results/tp53_proteins.fasta")
print("Protein sequences written to results/tp53_proteins.fasta")
# Export CDS sequences for codon-based analysis (dN/dS)
let cds_records = cds_data |> map(|c| {
{ id: c.species, description: f"TP53 CDS ({c.symbol})", seq: c.sequence }
})
write_fasta(cds_records, "results/tp53_cds.fasta")
print("CDS sequences written to results/tp53_cds.fasta")
# Export the comparison data as CSV
write_csv(cds_table, "results/tp53_cds_comparison.csv")
write_csv(kmer_similarity, "results/tp53_kmer_similarity.csv")
print("Comparison tables written to results/")
# Export summary as JSON
let summary = {
gene: "TP53",
n_species: len(orthologs),
species: orthologs |> map(|o| o.species),
mean_protein_len: round(mean(proteins |> map(|p| len(p.sequence))), 1),
gc_range: [
min(cds_data |> map(|c| gc_content(c.sequence))),
max(cds_data |> map(|c| gc_content(c.sequence))),
],
}
write_json(summary, "results/tp53_summary.json")
print("Summary written to results/tp53_summary.json")
Step 11 — Complete Multi-Gene Comparison
Let's wrap everything into a reusable function that compares a gene across species and produces a comprehensive report.
# requires: NCBI_API_KEY (optional, increases rate limit)
fn compare_gene_across_species(gene_symbols, gene_label, output_dir) {
print(f"\n=== Comparing {gene_label} ===")
# 1. Look up genes
let species_list = keys(gene_symbols)
let genes = species_list |> map(|sp| {
let info = ensembl_symbol(sp, gene_symbols[sp])
{ species: sp, gene_id: info.id, symbol: info.symbol }
})
print(f" Found in {len(genes)} species")
# 2. Fetch protein sequences
let prots = genes |> map(|g| {
let seq = ensembl_sequence(g.gene_id, "protein")
{ species: g.species, symbol: g.symbol, sequence: protein"{seq.seq}" }
})
# 3. Fetch CDS and compute GC content
let cds = genes |> map(|g| {
let seq = ensembl_sequence(g.gene_id, "cdna")
let s = dna"{seq.seq}"
{ species: g.species, gc: round(gc_content(s), 3), cds_len: len(s) }
})
# 4. Build comparison table
let comparison = zip(prots, cds) |> map(|pair| {
{ species: pair[0].species, symbol: pair[0].symbol,
protein_len: len(pair[0].sequence), cds_len: pair[1].cds_len,
gc_content: pair[1].gc }
}) |> to_table()
print(comparison)
# 5. Export sequences for external phylogenetic analysis
let records = prots |> map(|p| {
{ id: p.species, description: f"{gene_label} ({p.symbol})", seq: p.sequence }
})
write_fasta(records, f"{output_dir}/{gene_label}_proteins.fasta")
# 6. Pairwise dotplot: first species vs last (most divergent)
dotplot(prots[0].sequence, prots[len(prots) - 1].sequence, {
title: f"{gene_label}: {prots[0].species} vs {prots[len(prots) - 1].species}",
xlabel: prots[0].species,
ylabel: prots[len(prots) - 1].species,
word: 3,
output: f"{output_dir}/{gene_label}_dotplot.svg",
})
write_csv(comparison, f"{output_dir}/{gene_label}_comparison.csv")
print(f" Results saved to {output_dir}/")
}
# Run for multiple tumor suppressor genes
let genes_to_compare = [
{ label: "TP53", symbols: { homo_sapiens: "TP53", mus_musculus: "Trp53",
rattus_norvegicus: "Tp53", gallus_gallus: "TP53", danio_rerio: "tp53" } },
{ label: "BRCA1", symbols: { homo_sapiens: "BRCA1", mus_musculus: "Brca1",
rattus_norvegicus: "Brca1", gallus_gallus: "BRCA1", danio_rerio: "brca1" } },
{ label: "RB1", symbols: { homo_sapiens: "RB1", mus_musculus: "Rb1",
rattus_norvegicus: "Rb1", gallus_gallus: "RB1", danio_rerio: "rb1" } },
]
for gene in genes_to_compare {
compare_gene_across_species(gene.symbols, gene.label, "results/comparative")
}
# Visualize the known species tree for context
let species_tree = "((homo_sapiens:0.01,(mus_musculus:0.09,rattus_norvegicus:0.09):0.04):0.1,gallus_gallus:0.2,danio_rerio:0.4);"
phylo_tree(species_tree, {
title: "Species Phylogeny (3-gene comparison)",
scale_bar: true,
output: "results/comparative/species_tree.svg",
})
print("\n=== All comparisons complete ===")
Next Steps
-
For phylogenetic inference, export your FASTA files and use
IQ-TREE (
iqtree2 -s proteins.fasta -m JTT+G -bb 1000), then load the resulting.treefileback into BioLang withphylo_tree()for visualization. - For multiple sequence alignment, use MUSCLE or MAFFT on the exported FASTA.
- Learn how to extend BioLang with custom functionality in the Building Custom Plugins tutorial.