Intermediate ~30 minutes

Querying Biological Databases

BioLang has built-in functions for major biological databases: NCBI, Ensembl, UniProt, KEGG, PDB, and more. This tutorial shows how to query each one and combine results across databases.

What you will learn

Querying NCBI (Entrez) for genes, sequences, and publications
Using the Ensembl REST API for gene annotations and variant effects
Fetching protein data from UniProt
Exploring pathways with KEGG
Looking up protein structures from PDB
Cross-database queries that combine multiple sources

Run this tutorial: Download databases.bl and run it with bl run examples/tutorials/databases.bl

Network access: All database functions make HTTP requests to public APIs. Set NCBI_API_KEY for higher NCBI rate limits (10 req/s vs 3 req/s). See the API Reference for full details on each function.

Step 1 — NCBI: Gene Search

Use ncbi_gene() to search for a gene by symbol. If a single gene matches, it returns a record with gene details. For multiple matches, it returns a list of IDs.

# requires: internet connection (optional: NCBI_API_KEY for higher rate limits)
# databases.bl — querying biological databases

# Resolve a single gene from its symbol and show the fields of the returned record
let tp53 = ncbi_gene("TP53")
print(f"Gene ID:     {tp53.id}")
print(f"Symbol:      {tp53.symbol}")
print(f"Name:        {tp53.name}")
print(f"Organism:    {tp53.organism}")
print(f"Chromosome:  {tp53.chromosome}")
print(f"Location:    {tp53.location}")
print(f"Description: {tp53.description}")

# ncbi_search(db, query) is the broader search; it yields a list of IDs
let brca1_ids = ncbi_search("gene", "BRCA1 human")
let n_brca1_ids = len(brca1_ids)
print(f"Found {n_brca1_ids} gene IDs")
print(f"First ID: {brca1_ids[0]}")

# Turn the IDs into summaries and print one line per summary
let gene_summaries = ncbi_summary(brca1_ids, "gene")
for gene_summary in gene_summaries {
  print(f"  {gene_summary.uid}: {gene_summary}")
}

Step 2 — NCBI: Sequences and PubMed

# requires: internet connection (optional: NCBI_API_KEY for higher rate limits)
# ncbi_sequence(accession) returns the record as FASTA text
let fasta_text = ncbi_sequence("NM_000546.6")
let fasta_head = fasta_text[0..200]
print(f"FASTA (first 200 chars): {fasta_head}...")

# ncbi_fetch(ids, db, [rettype]) offers more control; here it fetches gene XML by ID
let gene_xml = ncbi_fetch(["7157"], "gene")
let gene_xml_len = len(gene_xml)
print(f"Gene XML length: {gene_xml_len} chars")

# Nucleotide fetch — a single ID string is accepted in place of a list
let nucleotide_fasta = ncbi_fetch("NM_000546.6", "nucleotide")
print(nucleotide_fasta[0..100])

# PubMed search — the result is a list of PMIDs
let pubmed_ids = ncbi_pubmed("TP53 mutation cancer", 5)
print(f"Found {len(pubmed_ids)} PubMed IDs: {pubmed_ids}")

# Summarize the publications and print one line per paper
let paper_summaries = ncbi_summary(pubmed_ids, "pubmed")
for paper in paper_summaries {
  print(f"  PMID {paper.uid}: {paper}")
}

Step 3 — Ensembl: Gene Annotations

Use ensembl_symbol(species, symbol) to look up a gene by symbol, or ensembl_gene(id) to look up by Ensembl ID. Both return a record with gene location and metadata.

# requires: internet connection
# Look up a gene by symbol
# ensembl_symbol takes the species first, then the gene symbol; the record it
# returns supplies every field printed below.
let gene = ensembl_symbol("human", "TP53")
print(f"Ensembl ID:  {gene.id}")
print(f"Symbol:      {gene.symbol}")
print(f"Biotype:     {gene.biotype}")
print(f"Chromosome:  {gene.chromosome}")
# Location is assembled from the chromosome, start and end fields
print(f"Location:    {gene.chromosome}:{gene.start}-{gene.end}")
print(f"Strand:      {gene.strand}")
print(f"Description: {gene.description}")

# Look up by Ensembl ID directly
# ensembl_gene takes only the Ensembl ID; the symbol is printed to compare with the lookup above
let gene2 = ensembl_gene("ENSG00000141510")
print(f"Same gene: {gene2.symbol}")

# Get the coding sequence for a transcript
# The second argument ("cds") selects which sequence type to return
let seq = ensembl_sequence("ENST00000269305", "cds")
print(f"CDS ID:     {seq.id}")
print(f"Molecule:   {seq.molecule}")
# seq.seq holds the sequence itself; its length is reported and only a leading slice is shown
print(f"CDS length: {len(seq.seq)} nt")
print(f"Sequence:   {seq.seq[0..60]}...")

Step 4 — Ensembl: Variant Effect Predictor

Use ensembl_vep(hgvs) to predict the effect of a variant using HGVS notation. This queries Ensembl's VEP REST API and returns transcript consequences with impact predictions.

# requires: internet connection
# Ask Ensembl VEP for the predicted effect of a variant written in HGVS notation
let vep_hits = ensembl_vep("17:g.7675088C>T")

for vep_hit in vep_hits {
  print(f"Alleles: {vep_hit.allele_string}")
  print(f"Most severe: {vep_hit.most_severe_consequence}")

  # Walk the per-transcript consequences attached to this result
  let transcript_effects = vep_hit.transcript_consequences
  for effect in transcript_effects {
    print(f"  Gene:        {effect.gene_id}")
    print(f"  Transcript:  {effect.transcript_id}")
    print(f"  Impact:      {effect.impact}")
    print(f"  Consequences: {effect.consequences}")
  }
}

# Second example — a BRCA1 pathogenic variant; only the headline fields are printed
let brca1_hits = ensembl_vep("17:g.43091434C>T")
for brca1_hit in brca1_hits {
  print(f"\nBRCA1 variant: {brca1_hit.allele_string}")
  print(f"Consequence: {brca1_hit.most_severe_consequence}")
}

Step 5 — UniProt: Protein Data

Use uniprot_search(query, limit?) to search and uniprot_entry(accession) to fetch a full protein record. Additional functions provide FASTA sequences, features, and GO annotations.

# requires: internet connection
# Search UniProt for human TP53 (second argument 5 is the optional limit)
let tp53_hits = uniprot_search("TP53 AND organism_id:9606", 5)
for tp53_hit in tp53_hits {
  print(f"{tp53_hit.accession}: {tp53_hit.name} — {tp53_hit.organism}")
}

# Full UniProt record for p53, fetched by accession
let p53_entry = uniprot_entry("P04637")
print(f"Protein:   {p53_entry.name}")
print(f"Accession: {p53_entry.accession}")
print(f"Length:    {p53_entry.sequence_length} aa")
print(f"Organism:  {p53_entry.organism}")
print(f"Genes:     {p53_entry.gene_names}")
print(f"Function:  {p53_entry.function}")

# FASTA text for the same accession; only a leading slice is printed
let p53_fasta = uniprot_fasta("P04637")
let p53_fasta_head = p53_fasta[0..200]
print(f"\nFASTA:\n{p53_fasta_head}...")

# Protein features (domains, modifications, etc.) — report the count, list the first 10
let p53_features = uniprot_features("P04637")
let n_features = len(p53_features)
print(f"\nFeatures: {n_features}")
let first_features = take(p53_features, 10)
for feat in first_features {
  print(f"  {feat.type}: {feat.location} — {feat.description}")
}

# GO annotations — report the count, list the first 10
let p53_go_terms = uniprot_go("P04637")
let n_go_terms = len(p53_go_terms)
print(f"\nGO terms: {n_go_terms}")
let first_go_terms = take(p53_go_terms, 10)
for go_term in first_go_terms {
  print(f"  {go_term.id} [{go_term.aspect}]: {go_term.term}")
}

Step 6 — KEGG: Pathways

KEGG provides three main functions: kegg_find(db, query) to search, kegg_get(entry) to fetch details, and kegg_link(target, source) to find relationships between databases.

# requires: internet connection
# kegg_find(db, query): search the pathway database by keyword
let pathway_hits = kegg_find("pathway", "p53 signaling")
for pathway_hit in pathway_hits {
  print(f"{pathway_hit.id}: {pathway_hit.description}")
}

# kegg_get(entry): full pathway details come back as a text record
let p53_pathway_text = kegg_get("hsa04115")
let p53_pathway_head = p53_pathway_text[0..500]
print(f"Pathway details:\n{p53_pathway_head}...")

# kegg_link(target, source): genes linked to the pathway
let pathway_gene_links = kegg_link("hsa", "hsa04115")
let n_pathway_genes = len(pathway_gene_links)
print(f"\nGenes in p53 pathway: {n_pathway_genes}")
let first_gene_links = take(pathway_gene_links, 10)
for gene_link in first_gene_links {
  print(f"  {gene_link.source} -> {gene_link.target}")
}

# Reverse direction: every pathway linked to one gene (TP53 = hsa:7157)
let tp53_pathway_links = kegg_link("pathway", "hsa:7157")
let n_tp53_pathways = len(tp53_pathway_links)
print(f"\nTP53 participates in {n_tp53_pathways} pathways:")
for pathway_link in tp53_pathway_links {
  print(f"  {pathway_link.target}")
}

# The same search function works on the compound database
let compound_hits = kegg_find("compound", "ATP")
for compound_hit in compound_hits {
  print(f"{compound_hit.id}: {compound_hit.description}")
}

Step 7 — PDB: Protein Structures

Use pdb_search(query) to find structures and pdb_entry(id) to get details about a specific PDB entry.

# requires: internet connection
# Search PDB for TP53 structures
# Only the number of results is reported here; the individual results are not inspected
let pdb_ids = pdb_search("TP53")
print(f"Found {len(pdb_ids)} PDB structures for TP53")

# Get details for a specific structure
# pdb_entry takes a PDB ID; the record it returns supplies every field printed below
let entry = pdb_entry("1TUP")
print(f"PDB ID:     {entry.id}")
print(f"Title:      {entry.title}")
print(f"Method:     {entry.method}")
print(f"Resolution: {entry.resolution}")
print(f"Released:   {entry.release_date}")
print(f"Organism:   {entry.organism}")

Step 8 — Cross-Database Queries

# requires: internet connection (optional: NCBI_API_KEY for higher rate limits)
# Cross-database report: one gene symbol, five data sources

# gene_report(symbol) prints a combined report for one human gene symbol.
# It queries NCBI, Ensembl, UniProt, KEGG and PDB in that order and prints
# each section as soon as its data arrives; nothing is returned explicitly.
# NOTE(review): assumes ncbi_gene resolves the symbol to a single record; a
# multi-match result would not carry the fields read below — confirm.
fn gene_report(symbol: String) {
  print(f"\n=== Report for {symbol} ===\n")

  # Section 1 — NCBI: identity and a truncated summary
  let ncbi_rec = ncbi_gene(symbol)
  print(f"NCBI Gene ID: {ncbi_rec.id}")
  print(f"Name:         {ncbi_rec.name}")
  print(f"Chromosome:   {ncbi_rec.chromosome}")
  let summary_head = ncbi_rec.summary[0..200]
  print(f"Summary:      {summary_head}...")

  # Section 2 — Ensembl: genomic coordinates and biotype
  let ensembl_rec = ensembl_symbol("human", symbol)
  print(f"\nEnsembl ID:   {ensembl_rec.id}")
  print(f"Location:     {ensembl_rec.chromosome}:{ensembl_rec.start}-{ensembl_rec.end}")
  print(f"Biotype:      {ensembl_rec.biotype}")

  # Section 3 — UniProt: take the top search hit (limit 1), then load its full entry.
  # The whole section is skipped when the search finds nothing.
  let uniprot_hits = uniprot_search(f"gene:{symbol} AND organism_id:9606", 1)
  if len(uniprot_hits) > 0 {
    let protein = uniprot_entry(uniprot_hits[0].accession)
    print(f"\nUniProt:      {protein.accession}")
    print(f"Protein:      {protein.name}")
    print(f"Length:       {protein.sequence_length} aa")

    # Feature count for the protein
    let protein_features = uniprot_features(protein.accession)
    print(f"Features:     {len(protein_features)}")

    # GO term count for the protein
    let protein_go = uniprot_go(protein.accession)
    print(f"GO terms:     {len(protein_go)}")
  }

  # Section 4 — KEGG: pathways linked to the gene; the KEGG gene ID is built
  # from the NCBI gene ID, and only the first 5 pathways are listed
  let pathway_links = kegg_link("pathway", f"hsa:{ncbi_rec.id}")
  print(f"\nKEGG pathways: {len(pathway_links)}")
  let first_pathway_links = take(pathway_links, 5)
  for pathway_link in first_pathway_links {
    print(f"  {pathway_link.target}")
  }

  # Section 5 — PDB: number of structures found for the symbol
  let pdb_hits = pdb_search(symbol)
  print(f"PDB structures: {len(pdb_hits)}")
}

# Example run for a cancer-related gene
gene_report("TP53")

Step 9 — Batch Queries

When working with multiple genes, you can build batch pipelines that query each database and combine results into a table.

# requires: internet connection (optional: NCBI_API_KEY for higher rate limits)
# Build one annotation record per gene from several databases, then tabulate them
let gene_symbols = ["TP53", "BRCA1", "EGFR", "KRAS", "MYC"]

let annotations = gene_symbols |> map(|gene_symbol| {
  # Ensembl supplies the ID, coordinates and biotype
  let ensembl_rec = ensembl_symbol("human", gene_symbol)

  # UniProt supplies the protein length from the top hit; 0 when nothing matched
  let uniprot_hits = uniprot_search(f"gene:{gene_symbol} AND organism_id:9606", 1)
  let protein_length = if len(uniprot_hits) > 0 {
    uniprot_hits[0].sequence_length
  } else {
    0
  }

  # KEGG pathway links are looked up through the NCBI gene ID
  let ncbi_rec = ncbi_gene(gene_symbol)
  let pathway_links = kegg_link("pathway", f"hsa:{ncbi_rec.id}")
  let pathway_count = len(pathway_links)

  # One row of the final table
  {
    symbol:     gene_symbol,
    ensembl_id: ensembl_rec.id,
    chromosome: ensembl_rec.chromosome,
    start:      ensembl_rec.start,
    end:        ensembl_rec.end,
    biotype:    ensembl_rec.biotype,
    protein_aa: protein_length,
    n_pathways: pathway_count,
  }
}) |> to_table()

print("\n=== Gene Annotation Summary ===")
print(annotations)

Step 10 — Tips and Best Practices

Rate Limits

NCBI: 3 requests/second without an API key, 10 requests/second with one. Set NCBI_API_KEY in your environment.
Ensembl: 15 requests/second. BioLang automatically throttles to stay within limits.
UniProt: No hard rate limit, but be considerate with batch queries.
KEGG: No authentication required. Moderate request rates are expected.

# requires: internet connection (optional: NCBI_API_KEY for higher rate limits)
# To raise the NCBI rate limit, export your API key in the shell first:
#   export NCBI_API_KEY="your-key-here"

# Large batch job: fetch up to 100 IDs, then summarize them chunk by chunk
let cancer_gene_ids = ncbi_search("gene", "cancer human", 100)
let n_cancer_genes = len(cancer_gene_ids)
print(f"Found {n_cancer_genes} genes")

# Small chunks plus a pause after each one keep the request rate low
let ids_per_batch = 10
let id_batches = chunk(cancer_gene_ids, ids_per_batch)

for batch_index, id_batch in enumerate(id_batches) {
  let batch_summaries = ncbi_summary(id_batch, "gene")
  let batch_number = batch_index + 1
  print(f"Batch {batch_number}: processed {len(batch_summaries)} genes")
  sleep(500)  # half-second pause between batches
}

Available API Functions

NCBI: ncbi_search(db, query, limit?), ncbi_fetch(ids, db, rettype?), ncbi_summary(ids, db), ncbi_gene(symbol), ncbi_pubmed(query, limit?), ncbi_sequence(accession)

Ensembl: ensembl_gene(id), ensembl_symbol(species, symbol), ensembl_sequence(id, type?), ensembl_vep(hgvs)

UniProt: uniprot_search(query, limit?), uniprot_entry(accession), uniprot_fasta(accession), uniprot_features(accession), uniprot_go(accession)

KEGG: kegg_find(db, query), kegg_get(entry), kegg_link(target, source)

PDB: pdb_entry(id), pdb_search(query)

Next Steps

Dive into statistical methods in the Statistical Analysis tutorial.