Keyboard shortcuts

Press ← or → to navigate between chapters

Press S or / to search in the book

Press ? to show this help

Press Esc to hide this help

Chapter 13: Biological Database APIs

Bioinformatics analysis rarely lives in isolation. You query NCBI for gene annotations, check UniProt for protein function, pull variant consequences from Ensembl VEP, and look up pathways in KEGG. BioLang provides built-in database client functions — this chapter covers 20 of them — so you can fetch, cross-reference, and integrate biological data without leaving your script.

All bio API functions are builtins. No imports are needed.

NCBI

The National Center for Biotechnology Information hosts PubMed, Gene, Nucleotide, and dozens of other databases. This section covers four NCBI functions: ncbi_search, ncbi_summary, ncbi_gene, and ncbi_sequence.

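ncbi_search searches any NCBI database (PubMed, Gene, Nucleotide, Protein, etc.) and returns a list of matching IDs; pass those IDs to ncbi_summary to fetch their summaries.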

# Search PubMed for recent CRISPR-Cas9 papers
# ncbi_search(db, query, max_results?)
let ids = ncbi_search("pubmed", "CRISPR-Cas9 delivery 2024", 20)
# ids => ["39012345", "39012346", ...]

# Get summaries for the IDs
let summaries = ncbi_summary(ids, "pubmed")
summaries |> each(|s| {
  print(s.uid)
})

ncbi_gene

Fetch detailed gene information by gene ID or symbol.

# ncbi_gene(symbol_or_query, max_results?)
let tp53 = ncbi_gene("TP53")
# If a single gene matches, returns a record:
# {id, symbol, name, description, organism, chromosome, location, summary}

print(tp53.name + " on chr" + tp53.chromosome + ": " + tp53.location)

ncbi_sequence

Retrieve nucleotide or protein sequences from NCBI.

# ncbi_sequence(accession) — returns FASTA text
let fasta = ncbi_sequence("NM_000546.6")
print("FASTA (first 100 chars): " + fasta[0..100])

Set the NCBI_API_KEY environment variable to get 10 requests/second instead of the default 3.

Ensembl

ensembl_gene

Look up a gene via the Ensembl REST API, either by gene symbol (ensembl_symbol) or by Ensembl ID (ensembl_gene).

# ensembl_symbol(species, symbol) — lookup by symbol
let brca1 = ensembl_symbol("human", "BRCA1")
# Returns: {id, symbol, description, species, biotype, start, end, strand, chromosome}

print("Ensembl ID: " + brca1.id)
print("Location: " + brca1.chromosome + ":" + str(brca1.start) + "-" + str(brca1.end))

# ensembl_gene(ensembl_id) — lookup by Ensembl ID
let same = ensembl_gene("ENSG00000012048")
print("Symbol: " + same.symbol)

ensembl_vep

Predict the functional consequences of variants using the Variant Effect Predictor.

# ensembl_vep(hgvs) — predict variant consequences
let variants = [
  "17:g.43091434C>T",   # BRCA1 splice donor
  "7:g.140753336A>T",   # BRAF V600E
  "12:g.25245350C>A",   # KRAS G12V
]

let predictions = variants |> map(|v| ensembl_vep(v))

# Each result is a list of records with:
# {allele_string, most_severe_consequence, transcript_consequences: [...]}
predictions |> each(|pred| {
  if len(pred) > 0 then {
    let r = pred[0]
    print(r.allele_string + " => " + r.most_severe_consequence)
  }
})

UniProt

Search the UniProt protein database.

# uniprot_search(query, limit?)
let kinases = uniprot_search("kinase AND organism_id:9606", 50)
# Returns list of records: {accession, name, organism, sequence_length, gene_names, function}

print(str(len(kinases)) + " human kinases found")
kinases |> take(5) |> each(|k| print(k.accession + " " + k.gene_names))

uniprot_entry

Get full details for a single UniProt accession.

let entry = uniprot_entry("P04637")  # TP53
# Returns: {accession, name, organism, sequence_length, gene_names, function}

print(entry.name + ": " + str(entry.sequence_length) + " aa")
print("Genes: " + entry.gene_names)
print("Function: " + entry.function)

# Get protein features separately
let features = uniprot_features("P04637")
# Returns list of: {type, location, description}
let domains = features |> filter(|f| f.type == "Domain")
domains |> each(|d| print("  " + d.description + ": " + d.location))

UCSC Genome Browser

ucsc_sequence

Retrieve genomic sequences from the UCSC Genome Browser.

# Get the sequence of the BRCA1 promoter region
# ucsc_sequence(genome, chrom, start, end)
let promoter = ucsc_sequence("hg38", "chr17", 43170245, 43172245)
# Returns DNA sequence as a string

print("BRCA1 promoter length: " + str(len(promoter)) + " bp")
let gc = gc_content(dna(promoter))
print("GC content: " + str(gc))

KEGG

kegg_find

Search KEGG databases (pathway, enzyme, compound, etc.).

# kegg_find(db, query) — search KEGG databases
let pathways = kegg_find("pathway", "apoptosis human")
# Returns list of: {id, description}

kegg_get

Retrieve a specific KEGG entry.

# kegg_get(entry_id) — returns raw KEGG text
let apoptosis = kegg_get("hsa04210")
# Returns the KEGG flat-file text for the entry
print("Entry text length: " + str(len(apoptosis)) + " chars")

STRING

string_network

Query protein-protein interaction networks from the STRING database.

# string_network(identifiers, species)
# First argument must be a list of protein names
let network = string_network(["TP53"], 9606)
# Returns list of: {protein_a, protein_b, score}

network |> each(|i| print(i.protein_a + " <-> " + i.protein_b + " (" + str(i.score) + ")"))

PDB

pdb_entry

Retrieve protein structure metadata from the Protein Data Bank.

let structure = pdb_entry("1TUP")  # TP53 DNA-binding domain
# structure => {
#   id: "1TUP",
#   title: "Crystal structure of the p53 core domain...",
#   resolution: 2.2,
#   method: "X-RAY DIFFRACTION",
#   chains: [{id: "A", entity: "Tumor suppressor p53", length: 195}, ...],
#   ligands: [...],
# }

print(structure.title)
print("Resolution: " + str(structure.resolution) + " A")

Reactome

reactome_pathways

Find pathways associated with a gene or set of genes.

# reactome_pathways(gene_or_genes)
let pathways = reactome_pathways("BRCA1")
# Returns pathway records

pathways |> each(|p| print(p))

Gene Ontology

go_term

Look up a GO term by its identifier.

let term = go_term("GO:0006915")
# term => {id: "GO:0006915", name: "apoptotic process",
#           namespace: "biological_process",
#           definition: "A programmed cell death process..."}

go_annotations

Get GO annotations for a gene.

# go_annotations(gene_or_accession)
let annotations = go_annotations("TP53")
# Returns list of: {go_id, term, aspect}

annotations |> take(10) |> each(|a| print("  " + a.go_id + " [" + a.aspect + "]: " + a.term))

COSMIC

cosmic_gene

Query the Catalogue Of Somatic Mutations In Cancer. Requires COSMIC_API_KEY.

# cosmic_gene(gene) — requires COSMIC_API_KEY env var
let cosmic = cosmic_gene("BRAF")
# Returns mutation data for the gene

print(cosmic)

NCBI Datasets

datasets_gene

Use the NCBI Datasets API for gene metadata.

# datasets_gene(symbol_or_id)
let info = datasets_gene("EGFR")
print(info)

Environment Variables

Some APIs require or benefit from API keys:

| Variable | Effect |
|---|---|
| NCBI_API_KEY | NCBI: 10 req/sec instead of 3 |
| COSMIC_API_KEY | Required for COSMIC queries |

Set these in your shell or in a .env file before running your script.

Example: Gene Annotation Pipeline

Fetch gene info from NCBI, cross-reference with UniProt, and pull pathways from KEGG.

let gene_list = ["TP53", "BRCA1", "EGFR", "KRAS", "PIK3CA"]

let annotated = gene_list |> map(|symbol| {
  let ncbi = ncbi_gene(symbol)
  let up_hits = uniprot_search("gene:" + symbol + " AND organism_id:9606", 1)
  let prot_len = if len(up_hits) > 0 then up_hits[0].sequence_length else 0

  let pathways = kegg_find("pathway", symbol + " human")
    |> take(3)

  {
    symbol:         symbol,
    name:           ncbi.name,
    chromosome:     ncbi.chromosome,
    location:       ncbi.location,
    protein_length: prot_len,
    pathways:       pathways |> map(|p| p.description),
  }
})

annotated |> each(|g| {
  print(g.symbol + " (" + g.name + ") - chr" + g.chromosome)
  print("  Protein: " + str(g.protein_length) + " aa")
  print("  Pathways: " + str(g.pathways))
})

annotated |> write_json("gene_annotations.json")

Example: Variant Interpretation

Predict variant effects with Ensembl VEP and check for known cancer mutations in COSMIC.

let variants = tsv("candidate_variants.tsv")
  |> map(|r| r.chrom + ":" + str(r.pos) + ":" + r.ref + ":" + r.alt)

let interpreted = variants |> map(|v| {
  let vep = ensembl_vep(v)

  let worst = vep.consequences
    |> sort_by(|c| c.impact_rank)
    |> first()

  let gene = worst.gene_symbol

  # Check COSMIC if it is a missense or nonsense variant
  let cosmic_info = if worst.consequence == "missense_variant" or
                       worst.consequence == "stop_gained" then
    cosmic_gene(gene)
      |> |cg| cg.mutations
      |> find(|m| m.aa_change == worst.amino_acid_change)
  else
    None

  {
    variant: v,
    gene: gene,
    consequence: worst.consequence,
    impact: worst.impact,
    sift: worst.sift_prediction,
    polyphen: worst.polyphen_prediction,
    cosmic_count: if cosmic_info != None then cosmic_info.count else 0,
    cosmic_known: cosmic_info != None,
  }
})

# Flag high-impact or COSMIC-known variants
let flagged = interpreted
  |> filter(|v| v.impact == "HIGH" or v.cosmic_known)
  |> sort_by(|v| -v.cosmic_count)

print(str(len(flagged)) + " variants flagged for review")
flagged |> write_tsv("flagged_variants.tsv")

Example: Protein Interaction Network

Build a STRING interaction network for a set of differentially expressed genes and annotate with Reactome pathways.

let de_genes = tsv("de_results.tsv")
  |> filter(|r| r.q_value < 0.01 and abs(r.log2fc) > 2.0)
  |> map(|r| r.gene)

# Get STRING interactions for the top 50 DE genes
let top_genes = de_genes |> take(50)
let network = string_network(top_genes, 9606)
# Returns list of: {protein_a, protein_b, score}

print(str(len(network)) + " interactions")

# Find hub genes (most connections)
let all_proteins = network |> map(|i| i.protein_a) + network |> map(|i| i.protein_b)
let unique_proteins = all_proteins |> unique()
let degree = unique_proteins |> map(|p| {
  let edges = network |> filter(|i| i.protein_a == p || i.protein_b == p)
  {gene: p, degree: len(edges)}
})
  |> sort(|a, b| b.degree - a.degree)

print("Hub genes:")
degree |> take(10) |> each(|d| print("  " + d.gene + ": " + str(d.degree) + " interactions"))

# Pathway enrichment for hub genes
let hub_genes = degree |> take(10) |> map(|d| d.gene)
let pathways = reactome_pathways(hub_genes)
  |> filter(|p| p.p_value < 0.05)
  |> sort_by(|p| p.p_value)

print("Enriched pathways:")
pathways |> take(10) |> each(|p| print("  " + p.name + " (p=" + str(p.p_value) + ")"))

Summary

BioLang’s built-in bio API functions let you query 12 major biological databases directly from your scripts. Combine them with pipes, maps, and filters to build annotation pipelines that cross-reference genes, variants, proteins, and pathways in a few lines of code. Set API keys via environment variables where they are required (COSMIC) or where they increase rate limits (NCBI).