Table Operations

32 dplyr-inspired functions for columnar data. Tables are BioLang's primary data structure for CSV files, variant calls, and expression matrices. Operations never modify their input table, and every function can be chained with the pipe operator (|>).

table

Create a table from a column map. Each key is a column name and each value is a list of that column's values; all lists must have the same length.

table(columns) -> table

let t = table({
  "gene":   ["BRCA1", "TP53", "EGFR", "KRAS"],
  "log2fc": [2.3, -1.5, 3.1, -0.8],
  "pvalue": [0.001, 0.05, 0.0001, 0.3]
})

read_csv / write_csv

Read a CSV/TSV file into a table, or write a table to a file.

read_csv(path, opts?) -> table
write_csv(tbl, path, opts?) -> string

Option	Type	Default	Description
separator	string	","	Field delimiter
header	bool	true	First row is header
comment	string	nil	Comment prefix to skip lines
skip	int	0	Lines to skip before header

# Read CSV
let de = read_csv("data/expression.csv")

# Read TSV (e.g., VCF or BED)
let bed = read_csv("data/regions.bed", {separator: "\t", header: false})

# Write filtered results
de |> filter(|row| row.padj < 0.05)
   |> write_csv("significant.csv")

select

Select a subset of columns by name.

select(tbl, columns...) -> table

de |> select("gene", "log2fc", "padj")

filter (table)

Keep only the rows for which the predicate returns true. The callback receives each row as a map.

filter(tbl, fn) -> table

de |> filter(|row| row.padj < 0.05 and abs(row.log2fc) > 1.0)

arrange

Sort table rows by one or more columns. Columns sort in ascending order by default; prefix a column name with "-" to sort it in descending order.

arrange(tbl, columns...) -> table

de |> arrange("padj")           # ascending by p-value
de |> arrange("-log2fc")        # descending by fold change
de |> arrange("chrom", "start") # multi-column sort

group_by / summarize

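Group rows and compute aggregate statistics per group. In the example below, summarize is given a callback that receives each group's key and its rows and returns a map of output column names to aggregated values.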

group_by(tbl, columns...) -> grouped_table
summarize(grouped, aggregations) -> table

let counts = read_csv("data/counts.csv")

counts
  |> group_by("sample")
  |> summarize(|key, rows| {
    sample: key,
    total_reads: sum(col(rows, "count")),
    num_genes: len(unique(col(rows, "gene"))),
    mean_count: mean(col(rows, "count"))
  })

mutate

Add or modify a column using a row-wise function.

mutate(tbl, name, fn) -> table

de |> mutate("neg_log10p", |row| -log10(row.padj))
   |> mutate("significant", |row| row.padj < 0.05 and abs(row.log2fc) > 1)
   |> mutate("direction", |row| if row.log2fc > 0 { "up" } else { "down" })

Joins

SQL-style table joins. The on parameter specifies the join key column(s).

left_join(a, b, on) -> table
inner_join(a, b, on) -> table
right_join(a, b, on) -> table
outer_join(a, b, on) -> table
semi_join(a, b, on) -> table    # rows in a that have a match in b
anti_join(a, b, on) -> table    # rows in a that have NO match in b
cross_join(a, b) -> table       # cartesian product

let expression = read_csv("data/expression.csv")   # gene, log2fc, padj
let annotation = read_csv("data/counts.csv")    # gene, chrom, start, end, biotype

# Annotate DE results with genomic coordinates
let annotated = left_join(expression, annotation, "gene")

# Keep only protein-coding genes with significant changes
annotated
  |> filter(|row| row.biotype == "protein_coding" and row.padj < 0.05)

# Anti-join: find genes in expression but NOT in annotation
let missing = anti_join(expression, annotation, "gene")

pivot_wider / pivot_longer

Reshape between long and wide formats.

pivot_wider(tbl, names_from, values_from) -> table
pivot_longer(tbl, cols, names_to, values_to) -> table

# Long to wide: one column per sample
let long_counts = read_csv("data/counts.csv")   # gene, sample, count
let wide = pivot_wider(long_counts, "sample", "count")
# gene | sample_A | sample_B | sample_C

# Wide to long for plotting
let wide_expr = read_csv("data/expression.csv")
let long = pivot_longer(wide_expr, ["S1", "S2", "S3"], "sample", "expression")

Window Functions

Ordered operations computed within groups or the full table.

window(tbl, name, fn, opts?) -> table
row_number(tbl, name, order_by) -> table
rank(tbl, name, order_by) -> table
lead(tbl, name, col, n?) -> table
lag(tbl, name, col, n?) -> table

# Rank genes by p-value per chromosome
de |> group_by("chrom")
   |> rank("rank", "padj")

# Compute running difference
de |> arrange("padj")
   |> lag("prev_padj", "padj", 1)
   |> mutate("delta", |row| row.padj - (row.prev_padj ?? 0))

Utility Functions

Function	Signature	Description
rename	rename(tbl, old, new) -> table	Rename a column
distinct	distinct(tbl, cols?) -> table	Remove duplicate rows
head	head(tbl, n?) -> table	First n rows (default 10)
tail	tail(tbl, n?) -> table	Last n rows (default 10)
nrow	nrow(tbl) -> int	Number of rows
ncol	ncol(tbl) -> int	Number of columns
colnames	colnames(tbl) -> list	List of column names
to_records	to_records(tbl) -> list	Convert to list of row maps

let de = read_csv("data/expression.csv")
println("Rows:", nrow(de), "Cols:", ncol(de))
println("Columns:", colnames(de))
de |> head(5)   # preview first 5 rows

where

Alias for filter with a more SQL-like feel: keeps only the rows for which the condition is true.

# Filter rows where score exceeds threshold
data |> where(|r| r.score > 30)

# Equivalent to filter
data |> filter(|r| r.score > 30)

# Chain multiple where clauses
variants
  |> where(|r| r.quality > 20)
  |> where(|r| r.af < 0.01)
  |> where(|r| r.impact == "HIGH")

case_when

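Conditional value mapping — returns the value paired with the first condition that evaluates to true. Takes alternating arguments: a condition lambda followed by the value to return when it matches. Use a final `|_| true` condition as the default case.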

# Classify variants by allele frequency
data |> mutate("impact", |r| case_when(
  |_| r.af > 0.05, "common",
  |_| r.af > 0.01, "low_freq",
  |_| true, "rare"
))

# Assign risk levels
patients |> mutate("risk", |r| case_when(
  |_| r.score > 90, "critical",
  |_| r.score > 60, "high",
  |_| r.score > 30, "moderate",
  |_| true, "low"
))