Beginner ~20 minutes

Working with Tables

BioLang's Table type is a built-in dataframe for structured data analysis. Think of it as a spreadsheet with columns and rows, optimized for bioinformatics workflows. This tutorial covers everything from reading CSVs to complex joins and aggregations.

What you will learn

Reading CSV and TSV files into tables
Selecting, filtering, and sorting columns
Group-by aggregations and summaries
Joining tables together
Pivoting and reshaping data
Writing tables to files

Run this tutorial: Download tables.bl and run it with bl run examples/tutorials/tables.bl

Step 1 — Reading a CSV File

The csv() function loads a CSV file into a Table. Column types are automatically inferred.

# requires: data/expression.csv in working directory
# tables.bio — working with tabular data

# Read a gene expression file
let expr = read_read_csv("data/expression.csv")

# Preview the table
print(expr)
# ┌──────────┬────────┬────────┬────────┬────────┐
# │ gene     │ sample1│ sample2│ sample3│ group  │
# ├──────────┼────────┼────────┼────────┼────────┤
# │ TP53     │  142.5 │  138.2 │  155.8 │ control│
# │ BRCA1    │   87.3 │   92.1 │   84.6 │ control│
# │ MYC      │  512.4 │  498.7 │  534.1 │ case   │
# │ ...      │   ...  │   ...  │   ...  │ ...    │
# └──────────┴────────┴────────┴────────┴────────┘

# Basic info
print(f"Rows: {nrow(expr)}, Cols: {ncol(expr)}")
print(f"Columns: {colnames(expr)}")

For tab-separated files, use tsv():

let tsv_data = tsv("data/results.tsv")

Step 2 — Selecting Columns

Use select() to pick specific columns. This returns a new table.

# Select specific columns
let subset = expr |> select("gene", "sample1", "group")

# Select all columns except one
let no_group = expr |> drop_cols("group")

# Rename columns
let renamed = expr
  |> rename("sample1", "control_1")
  |> rename("sample2", "control_2")
  |> rename("sample3", "treated_1")

# Add a computed column
let with_mean = expr |> mutate("mean_expr", |row| (row.sample1 + row.sample2 + row.sample3) / 3.0)
print(with_mean)

Step 3 — Filtering Rows

The filter() function keeps rows that match a condition.

# Filter by group
let controls = expr |> filter(|row| row.group == "control")
print(f"Control genes: {nrow(controls)}")

# Filter by expression level
let high_expr = expr |> filter(|row| row.sample1 > 100.0)

# Combine conditions — use |> into for left-to-right binding
expr |> filter(|row|
  row.group == "case" and row.sample1 > 200.0
) |> into interesting

# Filter using a list of gene names
let target_genes = ["TP53", "BRCA1", "EGFR", "KRAS"]
expr |> filter(|row| row.gene in target_genes) |> into targets
print(f"Found {nrow(targets)} target genes")

Step 4 — Sorting

# Sort by sample1 expression (ascending)
let sorted_asc = expr |> arrange("sample1")

# Sort descending
let sorted_desc = expr |> arrange("sample1", "desc")

# Get the top 5 highest expressed genes
let top5 = expr
  |> arrange("sample1", "desc")
  |> head(5)

print("Top 5 genes by expression:")
print(top5)

Step 5 — Group By and Summarize

Group rows by a column and compute aggregate statistics per group.

# Group by the "group" column and compute stats
let groups = expr |> group_by("group")

# summarize takes a Map (from group_by) and a callback |key, subtable| => Record
let summary = summarize(groups, |key, g| {
  group:   key,
  count:   nrow(g),
  mean_s1: mean(col(g, "sample1")),
  max_s1:  max(col(g, "sample1")),
  std_s1:  stdev(col(g, "sample1")),
})

print(summary)
# ┌─────────┬───────┬────────┬────────┬────────┐
# │ group   │ count │ mean_s1│ max_s1 │ std_s1 │
# ├─────────┼───────┼────────┼────────┼────────┤
# │ control │    50 │  125.4 │  342.1 │   67.8 │
# │ case    │    50 │  198.7 │  534.1 │   89.2 │
# └─────────┴───────┴────────┴────────┴────────┘

# Add a flag column, then group and summarize
let flagged = expr |> mutate("high_expr", |row| row.sample1 > 200.0)
let flag_groups = flagged |> group_by("group")
let flag_summary = summarize(flag_groups, |key, g| {
  group: key,
  n:     nrow(g),
})

Step 6 — Joining Tables

Combine tables using SQL-style joins.

# requires: data/gene_annotations.csv in working directory
# Load a gene annotation table
let annotations = read_read_csv("data/expression.csv")
# gene, chromosome, start, end, description

# Inner join: only genes present in both tables
let annotated = inner_join(expr, annotations, "gene")

print(colnames(annotated))
# [gene, sample1, sample2, sample3, group, chromosome, start, end, description]

# Left join: keep all expression rows, add annotations where available
let full = left_join(expr, annotations, "gene")

# Other join types are also available:
# right_join(t1, t2, "key"), outer_join(t1, t2, "key"),
# anti_join(t1, t2, "key"), semi_join(t1, t2, "key")

Step 7 — Pivoting and Reshaping

# Pivot from wide to long format
let long = pivot_longer(expr, ["sample1", "sample2", "sample3"], "sample", "expression")

print(long)
# ┌──────┬─────────┬─────────┬────────────┐
# │ gene │ group   │ sample  │ expression │
# ├──────┼─────────┼─────────┼────────────┤
# │ TP53 │ control │ sample1 │      142.5 │
# │ TP53 │ control │ sample2 │      138.2 │
# │ TP53 │ control │ sample3 │      155.8 │
# │ ...  │ ...     │ ...     │        ... │
# └──────┴─────────┴─────────┴────────────┘

# Pivot back from long to wide
let wide = pivot_wider(long, "sample", "expression")

Step 8 — Column Operations

# Access a column as a list
let s1 = col(expr, "sample1")
print(f"Mean: {mean(s1)}")
print(f"Median: {median(s1)}")
print(f"Std: {round(stdev(s1), 2)}")

# Apply a function to each value in a column
let log_expr = expr |> mutate("log_sample1", |row| log2(row.sample1 + 1.0))

# Create a new column from multiple columns
let with_ratio = expr |> mutate("ratio", |row| row.sample1 / (row.sample2 + 0.001))

# Conditional column
let classified = expr |> mutate("expression_level", |row|
  if row.sample1 > 500.0 then "high"
  else if row.sample1 > 100.0 then "medium"
  else "low"
)

Step 9 — Creating Tables from Scratch

# Build a table from a list of records
let data = from_records([
  { gene: "TP53",  expr: 142.5, pvalue: 0.001 },
  { gene: "BRCA1", expr:  87.3, pvalue: 0.023 },
  { gene: "MYC",   expr: 512.4, pvalue: 0.0001 },
  { gene: "EGFR",  expr: 245.1, pvalue: 0.045 },
])

print(data)
# ┌───────┬───────┬────────┐
# │ gene  │ expr  │ pvalue │
# ├───────┼───────┼────────┤
# │ TP53  │ 142.5 │  0.001 │
# │ BRCA1 │  87.3 │  0.023 │
# │ MYC   │ 512.4 │ 0.0001 │
# │ EGFR  │ 245.1 │  0.045 │
# └───────┴───────┴────────┘

Step 10 — Writing Output

# Write to CSV
write_csv(annotated, "results/annotated_genes.csv")

# Write to TSV
write_tsv(annotated, "results/annotated_genes.tsv")

# Write top 20 genes sorted by expression
let top20 = annotated
  |> arrange("sample1", "desc")
  |> head(20)
write_csv(top20, "results/top20_genes.csv")

# Complete pipeline: read, process, write
let result = read_csv("data/expression.csv")
  |> filter(|row| row.group == "case")
  |> mutate("mean", |row| (row.sample1 + row.sample2 + row.sample3) / 3.0)
  |> arrange("mean", "desc")
  |> select("gene", "mean", "group")
write_csv(result, "results/top_case_genes.csv")

print("Pipeline complete.")

Next Steps

Now that you can work with both sequences and tables, try the Variant Analysis tutorial to combine both skills in a VCF analysis workflow.