Intermediate ~30 minutes

Querying Biological Databases

BioLang has built-in functions for major biological databases: NCBI, Ensembl, UniProt, KEGG, PDB, and more. This tutorial shows how to query each one and combine results across databases.

What you will learn

  • Querying NCBI (Entrez) for genes, sequences, and publications
  • Using the Ensembl REST API for gene annotations and variant effects
  • Fetching protein data from UniProt
  • Exploring pathways with KEGG
  • Looking up protein structures from PDB
  • Cross-database queries that combine multiple sources
Run this tutorial: Download databases.bl and run it with bl run examples/tutorials/databases.bl

Network access: All database functions make HTTP requests to public APIs. Set NCBI_API_KEY for higher NCBI rate limits (10 req/s vs 3 req/s). See the API Reference for full details on each function.

Step 1 — NCBI: Gene Search

Use ncbi_gene() to search for a gene by symbol. If a single gene matches, it returns a record with gene details. For multiple matches, it returns a list of IDs.

# requires: internet connection (optional: NCBI_API_KEY for higher rate limits)
# databases.bl — querying biological databases

# Search for a gene by symbol.
# ncbi_gene() returns a record when exactly one gene matches; the field
# accesses below rely on that (multiple matches yield a list of IDs instead).
let gene = ncbi_gene("TP53")
print(f"Gene ID:     {gene.id}")
print(f"Symbol:      {gene.symbol}")
print(f"Name:        {gene.name}")
print(f"Organism:    {gene.organism}")
print(f"Chromosome:  {gene.chromosome}")
print(f"Location:    {gene.location}")
print(f"Description: {gene.description}")

# For broader searches, use ncbi_search(db, query)
# Returns a list of IDs
let ids = ncbi_search("gene", "BRCA1 human")
print(f"Found {len(ids)} gene IDs")
# NOTE(review): ids[0] assumes the search returned at least one ID —
# confirm how indexing an empty list behaves before reusing this pattern.
print(f"First ID: {ids[0]}")

# Get summaries for those IDs — ncbi_summary(ids, db)
let summaries = ncbi_summary(ids, "gene")
for s in summaries {
  # {s} interpolates the whole summary value after its uid
  print(f"  {s.uid}: {s}"  )
}

Step 2 — NCBI: Sequences and PubMed

# requires: internet connection (optional: NCBI_API_KEY for higher rate limits)
# Fetch a sequence by accession (returns FASTA text)
let fasta = ncbi_sequence("NM_000546.6")
# Only a leading slice is printed so the full sequence does not flood the output
print(f"FASTA (first 200 chars): {fasta[0..200]}...")

# Or use ncbi_fetch for more control — ncbi_fetch(ids, db, [rettype])
let xml = ncbi_fetch(["7157"], "gene")  # fetch gene XML by ID
print(f"Gene XML length: {len(xml)} chars")

# Fetch nucleotide sequences
let seq_fasta = ncbi_fetch("NM_000546.6", "nucleotide")  # single ID string works too
print(seq_fasta[0..100])

# Search PubMed — returns list of PMIDs
# NOTE(review): the second argument (5) is presumably the maximum number of
# PMIDs to return — confirm against the API Reference.
let pmids = ncbi_pubmed("TP53 mutation cancer", 5)
print(f"Found {len(pmids)} PubMed IDs: {pmids}")

# Get summaries for the publications — same ncbi_summary(ids, db) call as for
# genes, with the database switched to "pubmed"
let papers = ncbi_summary(pmids, "pubmed")
for p in papers {
  print(f"  PMID {p.uid}: {p}")
}

Step 3 — Ensembl: Gene Annotations

Use ensembl_symbol(species, symbol) to look up a gene by symbol, or ensembl_gene(id) to look up by Ensembl ID. Both return a record with gene location and metadata.

# requires: internet connection
# Look up a gene by symbol — argument order is (species, symbol)
# NOTE(review): `gene` is also bound in Step 1; if all steps run as a single
# script this rebinds the name — confirm BioLang allows re-declaring with `let`.
let gene = ensembl_symbol("human", "TP53")
print(f"Ensembl ID:  {gene.id}")
print(f"Symbol:      {gene.symbol}")
print(f"Biotype:     {gene.biotype}")
print(f"Chromosome:  {gene.chromosome}")
# Location is assembled from three separate fields of the record
print(f"Location:    {gene.chromosome}:{gene.start}-{gene.end}")
print(f"Strand:      {gene.strand}")
print(f"Description: {gene.description}")

# Look up by Ensembl ID directly
let gene2 = ensembl_gene("ENSG00000141510")
print(f"Same gene: {gene2.symbol}")

# Get the coding sequence for a transcript — ensembl_sequence(id, type?),
# here with the sequence type set to "cds"
let seq = ensembl_sequence("ENST00000269305", "cds")
print(f"CDS ID:     {seq.id}")
print(f"Molecule:   {seq.molecule}")
# The sequence text itself lives in the record's `seq` field
print(f"CDS length: {len(seq.seq)} nt")
print(f"Sequence:   {seq.seq[0..60]}...")

Step 4 — Ensembl: Variant Effect Predictor

Use ensembl_vep(hgvs) to predict the effect of a variant using HGVS notation. This queries Ensembl's VEP REST API and returns transcript consequences with impact predictions.

# requires: internet connection
# Predict variant effect using HGVS notation
# (genomic form: chromosome 17, position 7675088, C changed to T)
let results = ensembl_vep("17:g.7675088C>T")

# ensembl_vep() yields a collection; each element describes one variant result
for r in results {
  print(f"Alleles: {r.allele_string}")
  print(f"Most severe: {r.most_severe_consequence}")

  # Each result has transcript consequences
  for tc in r.transcript_consequences {
    print(f"  Gene:        {tc.gene_id}")
    print(f"  Transcript:  {tc.transcript_id}")
    print(f"  Impact:      {tc.impact}")
    print(f"  Consequences: {tc.consequences}")
  }
}

# Try another variant — BRCA1 pathogenic variant
let brca1_vep = ensembl_vep("17:g.43091434C>T")
# Only the top-level summary fields are printed here; the per-transcript
# consequences are skipped for brevity
for r in brca1_vep {
  print(f"\nBRCA1 variant: {r.allele_string}")
  print(f"Consequence: {r.most_severe_consequence}")
}

Step 5 — UniProt: Protein Data

Use uniprot_search(query, limit?) to search and uniprot_entry(accession) to fetch a full protein record. Additional functions provide FASTA sequences, features, and GO annotations.

# requires: internet connection
# Search UniProt for human TP53 (organism_id:9606 restricts hits to human);
# the second argument limits the search to 5 results
# NOTE(review): `results` is also bound in Step 4 and `fasta` (below) in
# Step 2; confirm re-declaring with `let` is allowed if all steps run as one script.
let results = uniprot_search("TP53 AND organism_id:9606", 5)
for entry in results {
  print(f"{entry.accession}: {entry.name} — {entry.organism}")
}

# Fetch full entry for p53
let p53 = uniprot_entry("P04637")
print(f"Protein:   {p53.name}")
print(f"Accession: {p53.accession}")
print(f"Length:    {p53.sequence_length} aa")
print(f"Organism:  {p53.organism}")
print(f"Genes:     {p53.gene_names}")
print(f"Function:  {p53.function}")

# Get the FASTA sequence
let fasta = uniprot_fasta("P04637")
print(f"\nFASTA:\n{fasta[0..200]}...")

# Get protein features (domains, modifications, etc.)
let features = uniprot_features("P04637")
print(f"\nFeatures: {len(features)}")
# take(features, 10) keeps the listing short; the total count is printed above
for f in take(features, 10) {
  print(f"  {f.type}: {f.location} — {f.description}")
}

# Get GO annotations
let go_terms = uniprot_go("P04637")
print(f"\nGO terms: {len(go_terms)}")
# Again only the first 10 entries are listed
for t in take(go_terms, 10) {
  print(f"  {t.id} [{t.aspect}]: {t.term}")
}

Step 6 — KEGG: Pathways

KEGG provides three main functions: kegg_find(db, query) to search, kegg_get(entry) to fetch details, and kegg_link(target, source) to find relationships between databases.

# requires: internet connection
# Search KEGG pathways — kegg_find(db, query)
let pathways = kegg_find("pathway", "p53 signaling")
for p in pathways {
  print(f"{p.id}: {p.description}")
}

# Get full pathway details (returns text record)
let p53_pathway = kegg_get("hsa04115")
# Print only the leading part of the text record
print(f"Pathway details:\n{p53_pathway[0..500]}...")

# Find genes linked to a pathway
# kegg_link(target, source): target is the "hsa" gene database, source is the
# pathway entry, so each link goes from the pathway to one of its genes
let genes = kegg_link("hsa", "hsa04115")
print(f"\nGenes in p53 pathway: {len(genes)}")
for g in take(genes, 10) {
  print(f"  {g.source} -> {g.target}")
}

# Find all pathways for a specific gene (TP53 = hsa:7157)
# Same call with the roles swapped: target is "pathway", source is the gene
let tp53_pathways = kegg_link("pathway", "hsa:7157")
print(f"\nTP53 participates in {len(tp53_pathways)} pathways:")
for p in tp53_pathways {
  print(f"  {p.target}")
}

# Search KEGG compounds
let compounds = kegg_find("compound", "ATP")
for c in compounds {
  print(f"{c.id}: {c.description}")
}

Step 7 — PDB: Protein Structures

Use pdb_search(query) to find structures and pdb_entry(id) to get details about a specific PDB entry.

# requires: internet connection
# Search PDB for TP53 structures
# Only the number of hits is reported here; the hits themselves are not printed
let pdb_ids = pdb_search("TP53")
print(f"Found {len(pdb_ids)} PDB structures for TP53")

# Get details for a specific structure (looked up by its PDB ID)
let entry = pdb_entry("1TUP")
print(f"PDB ID:     {entry.id}")
print(f"Title:      {entry.title}")
print(f"Method:     {entry.method}")
print(f"Resolution: {entry.resolution}")
print(f"Released:   {entry.release_date}")
print(f"Organism:   {entry.organism}")

Step 8 — Cross-Database Queries

# requires: internet connection (optional: NCBI_API_KEY for higher rate limits)
# Gather information about a gene from multiple databases

# gene_report(symbol): print a report for one gene, combining NCBI, Ensembl,
# UniProt, KEGG and PDB lookups.
#   symbol — gene symbol such as "TP53"; passed as-is to the NCBI, Ensembl and
#            PDB lookups and embedded in the UniProt query.
# The function only prints; it has no explicit return value.
# All lookups are hard-coded to human ("human", organism_id:9606, KEGG "hsa:").
fn gene_report(symbol: String) {
  print(f"\n=== Report for {symbol} ===\n")

  # NCBI: basic gene info
  let ncbi = ncbi_gene(symbol)
  print(f"NCBI Gene ID: {ncbi.id}")
  print(f"Name:         {ncbi.name}")
  print(f"Chromosome:   {ncbi.chromosome}")
  # NOTE(review): Step 1 reads gene.description from the same kind of record;
  # confirm that ncbi_gene() results also expose a `summary` field.
  print(f"Summary:      {ncbi.summary[0..200]}...")

  # Ensembl: genomic coordinates
  let ens = ensembl_symbol("human", symbol)
  print(f"\nEnsembl ID:   {ens.id}")
  print(f"Location:     {ens.chromosome}:{ens.start}-{ens.end}")
  print(f"Biotype:      {ens.biotype}")

  # UniProt: protein info
  # Ask for a single hit, then fetch its full entry by accession
  let up_results = uniprot_search(f"gene:{symbol} AND organism_id:9606", 1)
  # The whole UniProt section is skipped when the search returns nothing
  if len(up_results) > 0 {
    let up = uniprot_entry(up_results[0].accession)
    print(f"\nUniProt:      {up.accession}")
    print(f"Protein:      {up.name}")
    print(f"Length:       {up.sequence_length} aa")

    # Features
    let feats = uniprot_features(up.accession)
    print(f"Features:     {len(feats)}")

    # GO terms
    let go = uniprot_go(up.accession)
    print(f"GO terms:     {len(go)}")
  }

  # KEGG: pathways
  # The KEGG gene identifier is built from the NCBI gene ID ("hsa:<id>")
  let kegg_paths = kegg_link("pathway", f"hsa:{ncbi.id}")
  print(f"\nKEGG pathways: {len(kegg_paths)}")
  # List at most the first 5 pathways
  for p in take(kegg_paths, 5) {
    print(f"  {p.target}")
  }

  # PDB: structures
  let structures = pdb_search(symbol)
  print(f"PDB structures: {len(structures)}")
}

# Run report for a cancer-related gene
gene_report("TP53")

Step 9 — Batch Queries

When working with multiple genes, you can build batch pipelines that query each database and combine results into a table.

# requires: internet connection (optional: NCBI_API_KEY for higher rate limits)
# Annotate a list of genes from multiple databases
let gene_symbols = ["TP53", "BRCA1", "EGFR", "KRAS", "MYC"]

# Each symbol is mapped to one record; the list of records is then piped into
# to_table(). Every symbol triggers four remote lookups (Ensembl, UniProt,
# NCBI, KEGG).
let annotations = gene_symbols |> map(|symbol| {
  # Ensembl: location
  let ens = ensembl_symbol("human", symbol)

  # UniProt: protein info
  let up_hits = uniprot_search(f"gene:{symbol} AND organism_id:9606", 1)
  # Falls back to 0 when UniProt returns no hit for the symbol.
  # NOTE(review): Steps 5 and 8 read sequence_length from uniprot_entry()
  # records; confirm that search hits carry this field as well.
  let prot_len = if len(up_hits) > 0 {
    up_hits[0].sequence_length
  } else {
    0
  }

  # KEGG: pathway count
  # ncbi_gene() is called only to obtain the gene ID used in the KEGG identifier
  let ncbi = ncbi_gene(symbol)
  let paths = kegg_link("pathway", f"hsa:{ncbi.id}")

  # Record produced for this symbol (the last expression of the lambda body)
  {
    symbol:     symbol,
    ensembl_id: ens.id,
    chromosome: ens.chromosome,
    start:      ens.start,
    end:        ens.end,
    biotype:    ens.biotype,
    protein_aa: prot_len,
    n_pathways: len(paths),
  }
}) |> to_table()

print("\n=== Gene Annotation Summary ===")
print(annotations)

Step 10 — Tips and Best Practices

Rate Limits

  • NCBI: 3 requests/second without an API key, 10 requests/second with one. Set NCBI_API_KEY in your environment.
  • Ensembl: 15 requests/second. BioLang automatically throttles to stay within limits.
  • UniProt: No hard rate limit, but be considerate with batch queries.
  • KEGG: No authentication required. Moderate request rates are expected.
# requires: internet connection (optional: NCBI_API_KEY for higher rate limits)
# Set your NCBI API key for higher rate limits
# In your shell: export NCBI_API_KEY="your-key-here"

# For large batch queries, use sleep() between batches of requests
# NOTE(review): the third argument (100) is presumably the maximum number of
# IDs to return — confirm against the API Reference.
let gene_ids = ncbi_search("gene", "cancer human", 100)
print(f"Found {len(gene_ids)} genes")

# Process in chunks to be respectful of rate limits
let chunk_size = 10
let chunks = chunk(gene_ids, chunk_size)

# One ncbi_summary() request per chunk; `i` is the chunk index from enumerate()
for i, batch in enumerate(chunks) {
  let summaries = ncbi_summary(batch, "gene")
  # i + 1 so the printed batch numbers start at 1
  print(f"Batch {i + 1}: processed {len(summaries)} genes")
  sleep(500)  # half-second pause between batches
}

Available API Functions

NCBI: ncbi_search(db, query, limit?), ncbi_fetch(ids, db, rettype?), ncbi_summary(ids, db), ncbi_gene(symbol), ncbi_pubmed(query, limit?), ncbi_sequence(accession)

Ensembl: ensembl_gene(id), ensembl_symbol(species, symbol), ensembl_sequence(id, type?), ensembl_vep(hgvs)

UniProt: uniprot_search(query, limit?), uniprot_entry(accession), uniprot_fasta(accession), uniprot_features(accession), uniprot_go(accession)

KEGG: kegg_find(db, query), kegg_get(entry), kegg_link(target, source)

PDB: pdb_entry(id), pdb_search(query)

Next Steps

Dive into statistical methods in the Statistical Analysis tutorial.