Tables

Tables are a first-class data type in BioLang, designed for the tabular data that dominates bioinformatics workflows. BioLang provides dplyr-inspired verbs — select, filter, arrange, group_by, summarize, join, pivot, and window — that compose naturally with pipes.

Creating Tables

# From named columns
let variants = table(
  gene = ["BRCA1", "TP53", "EGFR", "KRAS", "BRAF"],
  chrom = ["chr17", "chr17", "chr7", "chr12", "chr7"],
  position = [43044295, 7577120, 55259515, 25398284, 140453136],
  impact = ["HIGH", "HIGH", "MODERATE", "HIGH", "MODERATE"],
  frequency = [0.012, 0.034, 0.089, 0.156, 0.023]
)

# From a CSV file
let data = csv("results.csv")

# From a TSV file
let bed = tsv("regions.bed", header = false,
  col_names = ["chrom", "start", "end", "name", "score"])

# From VCF
let vcf_data = read_vcf("variants.vcf")

# Empty table with schema
let empty = table(gene: List[String] = [], score: List[Float] = [])

Table Information

let t = csv("experiment.csv")

t.num_rows       # Number of rows
t.num_cols       # Number of columns
t.columns        # List of column names
describe(t)      # Summary statistics with types and sample values
head(t, 5)       # First 5 rows
tail(t, 5)       # Last 5 rows

Select

Choose columns to keep, rename columns, or reorder them:

# Keep specific columns
let subset = variants |> select("gene", "impact", "frequency")

# Drop columns (negative selection)
let trimmed = variants |> select(-"position", -"chrom")

# Rename while selecting
let renamed = variants |> select(
  gene_name = "gene",
  chr = "chrom",
  freq = "frequency"
)

# Select with a filter
let qual_cols = data |> select("qual_a", "qual_b", "qual_c")

Filter

Keep rows matching a predicate:

# Simple filter
let high_impact = variants |> filter(|r| r.impact == "HIGH")

# Multiple conditions
let significant = variants
  |> filter(|r| r.impact == "HIGH" && r.frequency > 0.01)

# Filter with string operations
let chr17 = variants |> filter(|r| r.chrom == "chr17")

# Filter with 'in' operator
let target_genes = ["BRCA1", "TP53", "KRAS"]
let targets = variants |> filter(|r| r.gene in target_genes)

# Remove None values
let clean = data |> filter(|r| r.score != None)

Arrange

Sort rows by one or more columns:

# Sort ascending (default)
let sorted = variants |> arrange("frequency")

# Sort descending
let top_freq = variants |> arrange(desc("frequency"))

# Multi-column sort
let ordered = variants |> arrange("chrom", desc("position"))

# Sort by computed value
let by_name_len = variants |> arrange(|r| len(r.gene))

Mutate

Add new columns or transform existing ones:

# Add a new column
let enriched = variants
  |> mutate(log_freq = |r| log10(r.frequency))

# Multiple new columns
let annotated = variants
  |> mutate(
    is_rare = |r| r.frequency < 0.01,
    label = |r| f"{r.gene}:{r.chrom}:{r.position}",
    severity = |r| if r.impact == "HIGH" { 3 } else { 1 }
  )

# Overwrite an existing column
let updated = variants
  |> mutate(frequency = |r| round(r.frequency, 4))

Group By and Summarize

Group rows by one or more columns, then compute aggregate statistics per group:

# Group and summarize
let by_impact = variants
  |> group_by("impact")
  |> summarize(
    count = n(),
    avg_freq = mean(frequency),
    max_freq = max(frequency),
    genes = collect(gene)
  )

# Multiple grouping columns
let by_chrom_impact = variants
  |> group_by("chrom", "impact")
  |> summarize(
    count = n(),
    total_freq = sum(frequency)
  )

# Available aggregation functions:
# n(), sum(), mean(), median(), min(), max(), stdev(), variance(),
# first(), last(), collect(), count(), quantile()

Join

Combine two tables based on matching columns:

let annotations = table(
  gene = ["BRCA1", "TP53", "EGFR"],
  pathway = ["DNA repair", "Cell cycle", "Growth signaling"],
  druggable = [false, false, true]
)

# Inner join — only matching rows
let joined = variants |> inner_join(annotations, by = "gene")

# Left join — keep all rows from left table
let enriched = variants |> left_join(annotations, by = "gene")

# Full outer join — keep all rows from both (not built-in, use left_join both ways)
# let all_data = ...

# Join on different column names
let merged = table_a |> inner_join(table_b, left = "gene_id", right = "id")

Pivot

Reshape tables between long and wide formats:

# Long to wide (pivot_wider)
let expression = table(
  gene = ["BRCA1", "BRCA1", "TP53", "TP53"],
  sample = ["tumor", "normal", "tumor", "normal"],
  fpkm = [12.5, 8.3, 45.2, 42.1]
)

let wide = expression |> pivot_wider(
  names_from = "sample",
  values_from = "fpkm"
)
# Result: gene | tumor | normal
#         BRCA1  12.5    8.3
#         TP53   45.2    42.1

# Wide to long (pivot_longer)
let long = wide |> pivot_longer(
  cols = ["tumor", "normal"],
  names_to = "sample",
  values_to = "fpkm"
)

Window Functions

Compute values across a sliding window of rows without collapsing groups:

# Running average
let smoothed = signal_data
  |> arrange("position")
  |> mutate(
    rm = |r| rolling_mean(r.value, 5),
    rs = |r| rolling_sum(r.value, 5)
  )

# Rank within groups
let ranked = variants
  |> group_by("chrom")
  |> mutate(
    rk = |r| row_number(),
    cs = |r| cumsum(r.frequency)
  )

# Lead and lag
let time_series = data
  |> arrange("timepoint")
  |> mutate(
    prev_value = lag(value, 1),
    next_value = lead(value, 1),
    delta = value - lag(value, 1)
  )

Distinct and Count

# Unique rows
let unique_genes = variants |> distinct("gene")

# Unique combinations
let unique_pairs = variants |> distinct("chrom", "impact")

# Count occurrences
let counts = variants |> count("impact")
# impact   | n
# HIGH     | 3
# MODERATE | 2

# Count with sort
let top_chroms = variants |> count("chrom", sort = true)

Slice Operations

# First/last n rows
let top5 = variants |> head(5)
let bottom5 = variants |> tail(5)

# Slice by index range
let middle = variants |> slice(10..20)

# Top n by value (arrange + head)
let top_freq = variants |> arrange(desc("frequency")) |> head(3)
let bottom_freq = variants |> arrange("frequency") |> head(3)

# Random sample
let sampled = variants |> sample(100)

Chaining Table Operations

The real power of table verbs is in composing them into complete analysis pipelines:

# Complete analysis pipeline
let report = read_vcf("cohort.vcf")
  |> filter(|v| v.qual >= 30.0 && v.depth >= 10)
  |> mutate(
    af = |v| v.alt_count / v.total_count,
    is_rare = |v| v.af < 0.01
  )
  |> left_join(gene_annotations, by = "gene")
  |> filter(|v| v.impact == "HIGH" || v.impact == "MODERATE")
  |> group_by("gene", "impact")
  |> summarize(
    variant_count = n(),
    avg_af = mean(af),
    rare_count = sum(is_rare),
    samples = count(distinct(sample_id))
  )
  |> arrange(desc(variant_count))
  |> head(50)

Writing Tables

# Write to CSV
report |> write_csv("report.csv")

# Write to TSV
report |> write_tsv("report.tsv")

# Write with options
report |> write_csv("report.csv", quote = true, na = "NA")

# Pretty-print to console
print(report)