Working with Tables
BioLang's Table type is a built-in dataframe for structured data
analysis. Think of it as a spreadsheet with columns and rows, optimized for
bioinformatics workflows. This tutorial covers everything from reading CSVs to
complex joins and aggregations.
What you will learn
- Reading CSV and TSV files into tables
- Selecting, filtering, and sorting columns
- Group-by aggregations and summaries
- Joining tables together
- Pivoting and reshaping data
- Writing tables to files
bl run examples/tutorials/tables.bl
Step 1 — Reading a CSV File
The csv() function loads a CSV file into a Table.
Column types are automatically inferred.
# requires: data/expression.csv in working directory
# tables.bio — working with tabular data
# Read a gene expression file
let expr = csv("data/expression.csv")
# Preview the table
print(expr)
# ┌──────────┬────────┬────────┬────────┬────────┐
# │ gene │ sample1│ sample2│ sample3│ group │
# ├──────────┼────────┼────────┼────────┼────────┤
# │ TP53 │ 142.5 │ 138.2 │ 155.8 │ control│
# │ BRCA1 │ 87.3 │ 92.1 │ 84.6 │ control│
# │ MYC │ 512.4 │ 498.7 │ 534.1 │ case │
# │ ... │ ... │ ... │ ... │ ... │
# └──────────┴────────┴────────┴────────┴────────┘
# Basic info
print(f"Rows: {nrow(expr)}, Cols: {ncol(expr)}")
print(f"Columns: {colnames(expr)}")
For tab-separated files, use tsv():
let tsv_data = tsv("data/results.tsv")
Step 2 — Selecting Columns
Use select() to pick specific columns. This returns a new table.
# Select specific columns
let subset = expr |> select("gene", "sample1", "group")
# Select all columns except one
let no_group = expr |> drop_cols("group")
# Rename columns
let renamed = expr
|> rename("sample1", "control_1")
|> rename("sample2", "control_2")
|> rename("sample3", "treated_1")
# Add a computed column
let with_mean = expr |> mutate("mean_expr", |row| (row.sample1 + row.sample2 + row.sample3) / 3.0)
print(with_mean)
Step 3 — Filtering Rows
The filter() function keeps rows that match a condition.
# Filter by group
let controls = expr |> filter(|row| row.group == "control")
print(f"Control genes: {nrow(controls)}")
# Filter by expression level
let high_expr = expr |> filter(|row| row.sample1 > 100.0)
# Combine conditions — use |> into for left-to-right binding
expr |> filter(|row|
row.group == "case" and row.sample1 > 200.0
) |> into interesting
# Filter using a list of gene names
let target_genes = ["TP53", "BRCA1", "EGFR", "KRAS"]
expr |> filter(|row| row.gene in target_genes) |> into targets
print(f"Found {nrow(targets)} target genes")
Step 4 — Sorting
# Sort by sample1 expression (ascending)
let sorted_asc = expr |> arrange("sample1")
# Sort descending
let sorted_desc = expr |> arrange("sample1", "desc")
# Get the top 5 highest expressed genes
let top5 = expr
|> arrange("sample1", "desc")
|> head(5)
print("Top 5 genes by expression:")
print(top5)
Step 5 — Group By and Summarize
Group rows by a column and compute aggregate statistics per group.
# Group by the "group" column and compute stats
let groups = expr |> group_by("group")
# summarize takes a Map (from group_by) and a callback |key, subtable| => Record
let summary = summarize(groups, |key, g| {
group: key,
count: nrow(g),
mean_s1: mean(col(g, "sample1")),
max_s1: max(col(g, "sample1")),
std_s1: stdev(col(g, "sample1")),
})
print(summary)
# ┌─────────┬───────┬────────┬────────┬────────┐
# │ group │ count │ mean_s1│ max_s1 │ std_s1 │
# ├─────────┼───────┼────────┼────────┼────────┤
# │ control │ 50 │ 125.4 │ 342.1 │ 67.8 │
# │ case │ 50 │ 198.7 │ 534.1 │ 89.2 │
# └─────────┴───────┴────────┴────────┴────────┘
# Add a flag column, then group and summarize
let flagged = expr |> mutate("high_expr", |row| row.sample1 > 200.0)
let flag_groups = flagged |> group_by("group")
let flag_summary = summarize(flag_groups, |key, g| {
group: key,
n: nrow(g),
})
Step 6 — Joining Tables
Combine tables using SQL-style joins.
# requires: data/gene_annotations.csv in working directory
# Load a gene annotation table
let annotations = csv("data/gene_annotations.csv")
# gene, chromosome, start, end, description
# Inner join: only genes present in both tables
let annotated = inner_join(expr, annotations, "gene")
print(colnames(annotated))
# [gene, sample1, sample2, sample3, group, chromosome, start, end, description]
# Left join: keep all expression rows, add annotations where available
let full = left_join(expr, annotations, "gene")
# Other join types are also available:
# right_join(t1, t2, "key"), outer_join(t1, t2, "key"),
# anti_join(t1, t2, "key"), semi_join(t1, t2, "key")
Step 7 — Pivoting and Reshaping
# Pivot from wide to long format
let long = pivot_longer(expr, ["sample1", "sample2", "sample3"], "sample", "expression")
print(long)
# ┌──────┬─────────┬─────────┬────────────┐
# │ gene │ group │ sample │ expression │
# ├──────┼─────────┼─────────┼────────────┤
# │ TP53 │ control │ sample1 │ 142.5 │
# │ TP53 │ control │ sample2 │ 138.2 │
# │ TP53 │ control │ sample3 │ 155.8 │
# │ ... │ ... │ ... │ ... │
# └──────┴─────────┴─────────┴────────────┘
# Pivot back from long to wide
let wide = pivot_wider(long, "sample", "expression")
Step 8 — Column Operations
# Access a column as a list
let s1 = col(expr, "sample1")
print(f"Mean: {mean(s1)}")
print(f"Median: {median(s1)}")
print(f"Std: {round(stdev(s1), 2)}")
# Apply a function to each value in a column
let log_expr = expr |> mutate("log_sample1", |row| log2(row.sample1 + 1.0))
# Create a new column from multiple columns
let with_ratio = expr |> mutate("ratio", |row| row.sample1 / (row.sample2 + 0.001))
# Conditional column
let classified = expr |> mutate("expression_level", |row|
if row.sample1 > 500.0 then "high"
else if row.sample1 > 100.0 then "medium"
else "low"
)
Step 9 — Creating Tables from Scratch
# Build a table from a list of records
let data = from_records([
{ gene: "TP53", expr: 142.5, pvalue: 0.001 },
{ gene: "BRCA1", expr: 87.3, pvalue: 0.023 },
{ gene: "MYC", expr: 512.4, pvalue: 0.0001 },
{ gene: "EGFR", expr: 245.1, pvalue: 0.045 },
])
print(data)
# ┌───────┬───────┬────────┐
# │ gene │ expr │ pvalue │
# ├───────┼───────┼────────┤
# │ TP53 │ 142.5 │ 0.001 │
# │ BRCA1 │ 87.3 │ 0.023 │
# │ MYC │ 512.4 │ 0.0001 │
# │ EGFR │ 245.1 │ 0.045 │
# └───────┴───────┴────────┘
Step 10 — Writing Output
# Write to CSV
write_csv(annotated, "results/annotated_genes.csv")
# Write to TSV
write_tsv(annotated, "results/annotated_genes.tsv")
# Write top 20 genes sorted by expression
let top20 = annotated
|> arrange("sample1", "desc")
|> head(20)
write_csv(top20, "results/top20_genes.csv")
# Complete pipeline: read, process, write
let result = csv("data/expression.csv")
|> filter(|row| row.group == "case")
|> mutate("mean", |row| (row.sample1 + row.sample2 + row.sample3) / 3.0)
|> arrange("mean", "desc")
|> select("gene", "mean", "group")
write_csv(result, "results/top_case_genes.csv")
print("Pipeline complete.")
Next Steps
Now that you can work with both sequences and tables, try the Variant Analysis tutorial to combine both skills in a VCF analysis workflow.