Querying Biological Databases
BioLang has built-in functions for major biological databases: NCBI, Ensembl, UniProt, KEGG, PDB, and more. This tutorial shows how to query each one and combine results across databases.
What you will learn
- Querying NCBI (Entrez) for genes, sequences, and publications
- Using the Ensembl REST API for gene annotations and variant effects
- Fetching protein data from UniProt
- Exploring pathways with KEGG
- Looking up protein structures from PDB
- Cross-database queries that combine multiple sources
bl run examples/tutorials/databases.bl
Network access: All database functions make HTTP requests to public APIs. Set NCBI_API_KEY in your environment for higher NCBI rate limits (10 req/s vs 3 req/s). See the API Reference for full details on each function.
Step 1 — NCBI: Gene Search
Use ncbi_gene() to search for a gene by symbol. If a single gene matches,
it returns a record with gene details. For multiple matches, it returns a list of IDs.
# requires: internet connection (optional: NCBI_API_KEY for higher rate limits)
# databases.bl — querying biological databases
# Step 1: fetch one gene record by symbol, then run a broader ID search and
# print a summary for each hit.
# Search for a gene by symbol.
# NOTE: the field accesses below assume ncbi_gene() returned a single record;
# the tutorial text says multiple matches produce a list of IDs instead.
let gene = ncbi_gene("TP53")
print(f"Gene ID: {gene.id}")
print(f"Symbol: {gene.symbol}")
print(f"Name: {gene.name}")
print(f"Organism: {gene.organism}")
print(f"Chromosome: {gene.chromosome}")
print(f"Location: {gene.location}")
print(f"Description: {gene.description}")
# For broader searches, use ncbi_search(db, query)
# Returns a list of IDs
let ids = ncbi_search("gene", "BRCA1 human")
print(f"Found {len(ids)} gene IDs")
# NOTE(review): ids[0] assumes the search returned at least one ID — confirm
# how indexing an empty list behaves before reusing this with other queries.
print(f"First ID: {ids[0]}")
# Get summaries for those IDs (the ID list comes first, then the database name)
let summaries = ncbi_summary(ids, "gene")
for s in summaries {
# Prints the uid followed by the whole summary record, not a single field.
print(f" {s.uid}: {s}" )
}
Step 2 — NCBI: Sequences and PubMed
# requires: internet connection (optional: NCBI_API_KEY for higher rate limits)
# Step 2: download sequence/gene records by accession or ID, then search PubMed
# and print a summary for each returned PMID.
# Fetch a sequence by accession (returns FASTA text)
let fasta = ncbi_sequence("NM_000546.6")
# Only a 200-character preview is printed.
# NOTE(review): assumes the text is at least 200 characters long or that range
# slicing clamps to the string length — confirm.
print(f"FASTA (first 200 chars): {fasta[0..200]}...")
# Or use ncbi_fetch for more control — ncbi_fetch(ids, db, [rettype])
# The optional rettype argument is not passed in either call below.
let xml = ncbi_fetch(["7157"], "gene") # fetch gene XML by ID
print(f"Gene XML length: {len(xml)} chars")
# Fetch nucleotide sequences
let seq_fasta = ncbi_fetch("NM_000546.6", "nucleotide") # single ID string works too
print(seq_fasta[0..100])
# Search PubMed — returns list of PMIDs
# NOTE(review): the second argument (5) is presumably the maximum number of
# results — confirm against the API reference.
let pmids = ncbi_pubmed("TP53 mutation cancer", 5)
print(f"Found {len(pmids)} PubMed IDs: {pmids}")
# Get summaries for the publications (same ncbi_summary call as for genes,
# with "pubmed" as the database name)
let papers = ncbi_summary(pmids, "pubmed")
for p in papers {
# Prints the uid followed by the whole summary record.
print(f" PMID {p.uid}: {p}")
}
Step 3 — Ensembl: Gene Annotations
Use ensembl_symbol(species, symbol) to look up a gene by symbol, or
ensembl_gene(id) to look up by Ensembl ID. Both return a record with
gene location and metadata.
# requires: internet connection
# Step 3: look up the same gene in Ensembl by symbol and by stable ID, then
# fetch the coding sequence of one transcript.
# Look up a gene by symbol (species comes first, then the symbol)
let gene = ensembl_symbol("human", "TP53")
print(f"Ensembl ID: {gene.id}")
print(f"Symbol: {gene.symbol}")
print(f"Biotype: {gene.biotype}")
print(f"Chromosome: {gene.chromosome}")
# The location string is assembled from the chromosome, start and end fields.
print(f"Location: {gene.chromosome}:{gene.start}-{gene.end}")
print(f"Strand: {gene.strand}")
print(f"Description: {gene.description}")
# Look up by Ensembl ID directly
let gene2 = ensembl_gene("ENSG00000141510")
print(f"Same gene: {gene2.symbol}")
# Get the coding sequence for a transcript
# The second argument selects the sequence type; "cds" is requested here.
let seq = ensembl_sequence("ENST00000269305", "cds")
print(f"CDS ID: {seq.id}")
print(f"Molecule: {seq.molecule}")
# The sequence text itself is in the record's seq field.
print(f"CDS length: {len(seq.seq)} nt")
# Preview only the first 60 characters of the sequence.
print(f"Sequence: {seq.seq[0..60]}...")
Step 4 — Ensembl: Variant Effect Predictor
Use ensembl_vep(hgvs) to predict the effect of a variant using HGVS
notation. This queries Ensembl's VEP REST API and returns transcript consequences
with impact predictions.
# requires: internet connection
# Step 4: run two genomic HGVS variants through ensembl_vep() and print the
# predicted consequences.
# Predict variant effect using HGVS notation
let results = ensembl_vep("17:g.7675088C>T")
# ensembl_vep() yields a collection of results; each one is printed in turn.
for r in results {
print(f"Alleles: {r.allele_string}")
print(f"Most severe: {r.most_severe_consequence}")
# Each result has transcript consequences
for tc in r.transcript_consequences {
print(f" Gene: {tc.gene_id}")
print(f" Transcript: {tc.transcript_id}")
print(f" Impact: {tc.impact}")
print(f" Consequences: {tc.consequences}")
}
}
# Try another variant — BRCA1 pathogenic variant
let brca1_vep = ensembl_vep("17:g.43091434C>T")
# Only the top-level fields are printed here; the per-transcript consequences
# are not expanded for this variant.
for r in brca1_vep {
print(f"\nBRCA1 variant: {r.allele_string}")
print(f"Consequence: {r.most_severe_consequence}")
}
Step 5 — UniProt: Protein Data
Use uniprot_search(query, limit?) to search and uniprot_entry(accession)
to fetch a full protein record. Additional functions provide FASTA sequences, features,
and GO annotations.
# requires: internet connection
# Step 5: search UniProt, fetch the full entry for one accession, then pull its
# FASTA text, features and GO annotations.
# Search UniProt for human TP53
# The second argument (5) is the optional limit on the number of hits.
let results = uniprot_search("TP53 AND organism_id:9606", 5)
for entry in results {
print(f"{entry.accession}: {entry.name} — {entry.organism}")
}
# Fetch full entry for p53
let p53 = uniprot_entry("P04637")
print(f"Protein: {p53.name}")
print(f"Accession: {p53.accession}")
print(f"Length: {p53.sequence_length} aa")
print(f"Organism: {p53.organism}")
print(f"Genes: {p53.gene_names}")
print(f"Function: {p53.function}")
# Get the FASTA sequence
let fasta = uniprot_fasta("P04637")
# Only a 200-character preview is printed.
# NOTE(review): assumes the text is at least 200 characters long or that range
# slicing clamps to the string length — confirm.
print(f"\nFASTA:\n{fasta[0..200]}...")
# Get protein features (domains, modifications, etc.)
let features = uniprot_features("P04637")
print(f"\nFeatures: {len(features)}")
# The total count is printed above; take() limits the listing to the first 10.
for f in take(features, 10) {
print(f" {f.type}: {f.location} — {f.description}")
}
# Get GO annotations
let go_terms = uniprot_go("P04637")
print(f"\nGO terms: {len(go_terms)}")
# As with features, only the first 10 terms are listed.
for t in take(go_terms, 10) {
print(f" {t.id} [{t.aspect}]: {t.term}")
}
Step 6 — KEGG: Pathways
KEGG provides three main functions: kegg_find(db, query) to search,
kegg_get(entry) to fetch details, and kegg_link(target, source)
to find relationships between databases.
# requires: internet connection
# Step 6: search KEGG, fetch one pathway record, and follow gene <-> pathway
# links in both directions.
# Search KEGG pathways (database name first, then the query text)
let pathways = kegg_find("pathway", "p53 signaling")
for p in pathways {
print(f"{p.id}: {p.description}")
}
# Get full pathway details (returns text record)
let p53_pathway = kegg_get("hsa04115")
# Only a 500-character preview is printed.
# NOTE(review): assumes the record is at least 500 characters long or that
# range slicing clamps to the string length — confirm.
print(f"Pathway details:\n{p53_pathway[0..500]}...")
# Find genes linked to a pathway
# kegg_link(target, source): the target here is "hsa" and the source is the
# pathway entry.
let genes = kegg_link("hsa", "hsa04115")
print(f"\nGenes in p53 pathway: {len(genes)}")
# Each link record has source and target fields; only the first 10 are listed.
for g in take(genes, 10) {
print(f" {g.source} -> {g.target}")
}
# Find all pathways for a specific gene (TP53 = hsa:7157)
# Same call with the direction reversed: target "pathway", source the gene.
let tp53_pathways = kegg_link("pathway", "hsa:7157")
print(f"\nTP53 participates in {len(tp53_pathways)} pathways:")
for p in tp53_pathways {
print(f" {p.target}")
}
# Search KEGG compounds
let compounds = kegg_find("compound", "ATP")
for c in compounds {
print(f"{c.id}: {c.description}")
}
Step 7 — PDB: Protein Structures
Use pdb_search(query) to find structures and pdb_entry(id)
to get details about a specific PDB entry.
# requires: internet connection
# Step 7: count the PDB search hits for a query, then print the details of one
# entry fetched by its ID.
# Search PDB for TP53 structures
let pdb_ids = pdb_search("TP53")
# Only the number of hits is reported; the IDs themselves are not printed.
print(f"Found {len(pdb_ids)} PDB structures for TP53")
# Get details for a specific structure
# The ID is hard-coded; it is not taken from the search result above.
let entry = pdb_entry("1TUP")
print(f"PDB ID: {entry.id}")
print(f"Title: {entry.title}")
print(f"Method: {entry.method}")
print(f"Resolution: {entry.resolution}")
print(f"Released: {entry.release_date}")
print(f"Organism: {entry.organism}")
Step 8 — Cross-Database Queries
# requires: internet connection (optional: NCBI_API_KEY for higher rate limits)
# gene_report(symbol): print a one-gene report assembled from NCBI, Ensembl,
# UniProt, KEGG and PDB, queried in that order.
#   symbol — gene symbol (for example "TP53"); it is passed unchanged to the
#            NCBI, Ensembl and PDB calls and embedded in the UniProt query.
# NOTE(review): ncbi_gene() is described as returning a list of IDs when several
# genes match; this function assumes the single-record case.
fn gene_report(symbol: String) {
    print(f"\n=== Report for {symbol} ===\n")

    # NCBI section: identifier, name, chromosome and a 200-character summary preview.
    let gene_rec = ncbi_gene(symbol)
    print(f"NCBI Gene ID: {gene_rec.id}")
    print(f"Name: {gene_rec.name}")
    print(f"Chromosome: {gene_rec.chromosome}")
    let summary_head = gene_rec.summary[0..200]
    print(f"Summary: {summary_head}...")

    # Ensembl section: stable ID, coordinates and biotype.
    let ens_rec = ensembl_symbol("human", symbol)
    print(f"\nEnsembl ID: {ens_rec.id}")
    print(f"Location: {ens_rec.chromosome}:{ens_rec.start}-{ens_rec.end}")
    print(f"Biotype: {ens_rec.biotype}")

    # UniProt section: take at most one human hit; skip the section when there is none.
    let uniprot_query = f"gene:{symbol} AND organism_id:9606"
    let hits = uniprot_search(uniprot_query, 1)
    if len(hits) > 0 {
        let protein = uniprot_entry(hits[0].accession)
        print(f"\nUniProt: {protein.accession}")
        print(f"Protein: {protein.name}")
        print(f"Length: {protein.sequence_length} aa")

        # Feature and GO lookups use the accession from the fetched entry.
        let prot_features = uniprot_features(protein.accession)
        print(f"Features: {len(prot_features)}")
        let prot_go = uniprot_go(protein.accession)
        print(f"GO terms: {len(prot_go)}")
    }

    # KEGG section: the gene key is "hsa:" followed by the NCBI gene ID.
    let kegg_gene = f"hsa:{gene_rec.id}"
    let linked_paths = kegg_link("pathway", kegg_gene)
    print(f"\nKEGG pathways: {len(linked_paths)}")
    # List only the first five linked pathways.
    for link in take(linked_paths, 5) {
        print(f" {link.target}")
    }

    # PDB section: report the number of structures found for the symbol.
    let pdb_hits = pdb_search(symbol)
    print(f"PDB structures: {len(pdb_hits)}")
}

# Produce the report for a cancer-related gene.
gene_report("TP53")
Step 9 — Batch Queries
When working with multiple genes, you can build batch pipelines that query each database and combine results into a table.
# requires: internet connection (optional: NCBI_API_KEY for higher rate limits)
# Step 9: build one annotation record per gene symbol and collect the records
# into a table.
# Annotate a list of genes from multiple databases
let gene_symbols = ["TP53", "BRCA1", "EGFR", "KRAS", "MYC"]
# Each symbol triggers four API calls inside the lambda: ensembl_symbol,
# uniprot_search, ncbi_gene and kegg_link.
let annotations = gene_symbols |> map(|symbol| {
# Ensembl: location
let ens = ensembl_symbol("human", symbol)
# UniProt: protein info
let up_hits = uniprot_search(f"gene:{symbol} AND organism_id:9606", 1)
# Falls back to 0 when the search returns no hits.
# NOTE(review): this reads sequence_length straight from a search hit, whereas
# Step 8 reads it from uniprot_entry(); confirm that search hits carry the field.
let prot_len = if len(up_hits) > 0 {
up_hits[0].sequence_length
} else {
0
}
# KEGG: pathway count
# The NCBI gene ID is needed to build the "hsa:<id>" key for kegg_link.
# NOTE(review): assumes ncbi_gene() returned a single record, not a list of IDs.
let ncbi = ncbi_gene(symbol)
let paths = kegg_link("pathway", f"hsa:{ncbi.id}")
# The record below is the last expression in the lambda — presumably its
# value, i.e. one row per gene; confirm against the language reference.
{
symbol: symbol,
ensembl_id: ens.id,
chromosome: ens.chromosome,
start: ens.start,
end: ens.end,
biotype: ens.biotype,
protein_aa: prot_len,
n_pathways: len(paths),
}
}) |> to_table()
print("\n=== Gene Annotation Summary ===")
print(annotations)
Step 10 — Tips and Best Practices
Rate Limits
- NCBI: 3 requests/second without an API key, 10 requests/second with one. Set NCBI_API_KEY in your environment.
- Ensembl: 15 requests/second. BioLang automatically throttles to stay within limits.
- UniProt: No hard rate limit, but be considerate with batch queries.
- KEGG: No authentication required. Moderate request rates are expected.
# requires: internet connection (optional: NCBI_API_KEY for higher rate limits)
# Step 10: fetch summaries for a large ID list in fixed-size batches, pausing
# between batches.
# Set your NCBI API key for higher rate limits
# In your shell: export NCBI_API_KEY="your-key-here"
# For large batch queries, use sleep() between requests
# NOTE(review): the third argument (100) is presumably the maximum number of
# IDs to return — confirm against the API reference.
let gene_ids = ncbi_search("gene", "cancer human", 100)
print(f"Found {len(gene_ids)} genes")
# Process in chunks to be respectful of rate limits
let chunk_size = 10
let chunks = chunk(gene_ids, chunk_size)
# enumerate() supplies a zero-based index, hence the i + 1 in the message.
for i, batch in enumerate(chunks) {
# One ncbi_summary request per batch rather than one per ID.
let summaries = ncbi_summary(batch, "gene")
print(f"Batch {i + 1}: processed {len(summaries)} genes")
sleep(500) # half-second pause between batches
}
Available API Functions
NCBI:
ncbi_search(db, query, limit?),
ncbi_fetch(ids, db, rettype?),
ncbi_summary(ids, db),
ncbi_gene(symbol),
ncbi_pubmed(query, limit?),
ncbi_sequence(accession)
Ensembl:
ensembl_gene(id),
ensembl_symbol(species, symbol),
ensembl_sequence(id, type?),
ensembl_vep(hgvs)
UniProt:
uniprot_search(query, limit?),
uniprot_entry(accession),
uniprot_fasta(accession),
uniprot_features(accession),
uniprot_go(accession)
KEGG:
kegg_find(db, query),
kegg_get(entry),
kegg_link(target, source)
PDB:
pdb_entry(id),
pdb_search(query)
Next Steps
Dive into statistical methods in the Statistical Analysis tutorial.