Tables
Tables are a first-class data type in BioLang, designed for the tabular data that
dominates bioinformatics workflows. BioLang provides dplyr-inspired verbs —
select, filter, arrange, group_by,
summarize, join, pivot, and window
— that compose naturally with pipes.
Creating Tables
# From named columns
let variants = table(
gene = ["BRCA1", "TP53", "EGFR", "KRAS", "BRAF"],
chrom = ["chr17", "chr17", "chr7", "chr12", "chr7"],
position = [43044295, 7577120, 55259515, 25398284, 140453136],
impact = ["HIGH", "HIGH", "MODERATE", "HIGH", "MODERATE"],
frequency = [0.012, 0.034, 0.089, 0.156, 0.023]
)
# From a CSV file
let data = csv("results.csv")
# From a TSV file
let bed = tsv("regions.bed", header = false,
col_names = ["chrom", "start", "end", "name", "score"])
# From VCF
let vcf_data = read_vcf("variants.vcf")
# Empty table with schema
let empty = table(gene: List[String] = [], score: List[Float] = [])
Table Information
let t = csv("experiment.csv")
t.num_rows # Number of rows
t.num_cols # Number of columns
t.columns # List of column names
describe(t) # Summary statistics with types and sample values
head(t, 5) # First 5 rows
tail(t, 5) # Last 5 rows
Select
Choose columns to keep, rename columns, or reorder them:
# Keep specific columns
let subset = variants |> select("gene", "impact", "frequency")
# Drop columns (negative selection)
let trimmed = variants |> select(-"position", -"chrom")
# Rename while selecting
let renamed = variants |> select(
gene_name = "gene",
chr = "chrom",
freq = "frequency"
)
# Select with a filter
let qual_cols = data |> select("qual_a", "qual_b", "qual_c")
Filter
Keep rows matching a predicate:
# Simple filter
let high_impact = variants |> filter(|r| r.impact == "HIGH")
# Multiple conditions
let significant = variants
|> filter(|r| r.impact == "HIGH" && r.frequency > 0.01)
# Filter with string operations
let chr17 = variants |> filter(|r| r.chrom == "chr17")
# Filter with 'in' operator
let target_genes = ["BRCA1", "TP53", "KRAS"]
let targets = variants |> filter(|r| r.gene in target_genes)
# Remove None values
let clean = data |> filter(|r| r.score != None)
Arrange
Sort rows by one or more columns:
# Sort ascending (default)
let sorted = variants |> arrange("frequency")
# Sort descending
let top_freq = variants |> arrange(desc("frequency"))
# Multi-column sort
let ordered = variants |> arrange("chrom", desc("position"))
# Sort by computed value
let by_name_len = variants |> arrange(|r| len(r.gene))
Mutate
Add new columns or transform existing ones:
# Add a new column
let enriched = variants
|> mutate(log_freq = |r| log10(r.frequency))
# Multiple new columns
let annotated = variants
|> mutate(
is_rare = |r| r.frequency < 0.01,
label = |r| f"{r.gene}:{r.chrom}:{r.position}",
severity = |r| if r.impact == "HIGH" { 3 } else { 1 }
)
# Overwrite an existing column
let updated = variants
|> mutate(frequency = |r| round(r.frequency, 4))
Group By and Summarize
Group rows by one or more columns, then compute aggregate statistics per group:
# Group and summarize
let by_impact = variants
|> group_by("impact")
|> summarize(
count = n(),
avg_freq = mean(frequency),
max_freq = max(frequency),
genes = collect(gene)
)
# Multiple grouping columns
let by_chrom_impact = variants
|> group_by("chrom", "impact")
|> summarize(
count = n(),
total_freq = sum(frequency)
)
# Available aggregation functions:
# n(), sum(), mean(), median(), min(), max(), stdev(), variance(),
# first(), last(), collect(), count(), quantile()
Join
Combine two tables based on matching columns:
let annotations = table(
gene = ["BRCA1", "TP53", "EGFR"],
pathway = ["DNA repair", "Cell cycle", "Growth signaling"],
druggable = [false, false, true]
)
# Inner join — only matching rows
let joined = variants |> inner_join(annotations, by = "gene")
# Left join — keep all rows from left table
let enriched = variants |> left_join(annotations, by = "gene")
# Full outer join — keep all rows from both (not built-in, use left_join both ways)
# let all_data = ...
# Join on different column names
let merged = table_a |> inner_join(table_b, left = "gene_id", right = "id")
Pivot
Reshape tables between long and wide formats:
# Long to wide (pivot_wider)
let expression = table(
gene = ["BRCA1", "BRCA1", "TP53", "TP53"],
sample = ["tumor", "normal", "tumor", "normal"],
fpkm = [12.5, 8.3, 45.2, 42.1]
)
let wide = expression |> pivot_wider(
names_from = "sample",
values_from = "fpkm"
)
# Result: gene | tumor | normal
# BRCA1 12.5 8.3
# TP53 45.2 42.1
# Wide to long (pivot_longer)
let long = wide |> pivot_longer(
cols = ["tumor", "normal"],
names_to = "sample",
values_to = "fpkm"
)
Window Functions
Compute values across a sliding window of rows without collapsing groups:
# Running average
let smoothed = signal_data
|> arrange("position")
|> mutate(
rm = |r| rolling_mean(r.value, 5),
rs = |r| rolling_sum(r.value, 5)
)
# Rank within groups
let ranked = variants
|> group_by("chrom")
|> mutate(
rk = |r| row_number(),
cs = |r| cumsum(r.frequency)
)
# Lead and lag
let time_series = data
|> arrange("timepoint")
|> mutate(
prev_value = lag(value, 1),
next_value = lead(value, 1),
delta = value - lag(value, 1)
)
Distinct and Count
# Unique rows
let unique_genes = variants |> distinct("gene")
# Unique combinations
let unique_pairs = variants |> distinct("chrom", "impact")
# Count occurrences
let counts = variants |> count("impact")
# impact | n
# HIGH | 3
# MODERATE | 2
# Count with sort
let top_chroms = variants |> count("chrom", sort = true)
Slice Operations
# First/last n rows
let top5 = variants |> head(5)
let bottom5 = variants |> tail(5)
# Slice by index range
let middle = variants |> slice(10..20)
# Top n by value (arrange + head)
let top_freq = variants |> arrange(desc("frequency")) |> head(3)
let bottom_freq = variants |> arrange("frequency") |> head(3)
# Random sample
let sampled = variants |> sample(100)
Chaining Table Operations
The real power of table verbs is in composing them into complete analysis pipelines:
# Complete analysis pipeline
let report = read_vcf("cohort.vcf")
|> filter(|v| v.qual >= 30.0 && v.depth >= 10)
|> mutate(
af = |v| v.alt_count / v.total_count,
is_rare = |v| v.af < 0.01
)
|> left_join(gene_annotations, by = "gene")
|> filter(|v| v.impact == "HIGH" || v.impact == "MODERATE")
|> group_by("gene", "impact")
|> summarize(
variant_count = n(),
avg_af = mean(af),
rare_count = sum(is_rare),
samples = count(distinct(sample_id))
)
|> arrange(desc(variant_count))
|> head(50)
Writing Tables
# Write to CSV
report |> write_csv("report.csv")
# Write to TSV
report |> write_tsv("report.tsv")
# Write with options
report |> write_csv("report.csv", quote = true, na = "NA")
# Pretty-print to console
print(report)