Beginner ~20 minutes

Working with Tables

BioLang's Table type is a built-in dataframe for structured data analysis. Think of it as a spreadsheet with columns and rows, optimized for bioinformatics workflows. This tutorial covers everything from reading CSVs to complex joins and aggregations.

What you will learn

  • Reading CSV and TSV files into tables
  • Selecting, filtering, and sorting columns
  • Group-by aggregations and summaries
  • Joining tables together
  • Pivoting and reshaping data
  • Writing tables to files
Run this tutorial: Download tables.bl and run it with bl run examples/tutorials/tables.bl

Step 1 — Reading a CSV File

The csv() function loads a CSV file into a Table. Column types are automatically inferred.

# requires: data/expression.csv in working directory
# tables.bio — working with tabular data

# Read a gene expression file
let expr = csv("data/expression.csv")

# Preview the table
print(expr)
# ┌──────────┬────────┬────────┬────────┬────────┐
# │ gene     │ sample1│ sample2│ sample3│ group  │
# ├──────────┼────────┼────────┼────────┼────────┤
# │ TP53     │  142.5 │  138.2 │  155.8 │ control│
# │ BRCA1    │   87.3 │   92.1 │   84.6 │ control│
# │ MYC      │  512.4 │  498.7 │  534.1 │ case   │
# │ ...      │   ...  │   ...  │   ...  │ ...    │
# └──────────┴────────┴────────┴────────┴────────┘

# Basic info
print(f"Rows: {nrow(expr)}, Cols: {ncol(expr)}")
print(f"Columns: {colnames(expr)}")

For tab-separated files, use tsv():

let tsv_data = tsv("data/results.tsv")

Step 2 — Selecting Columns

Use select() to pick specific columns. This returns a new table.

# Select specific columns
let subset = expr |> select("gene", "sample1", "group")

# Select all columns except one
let no_group = expr |> drop_cols("group")

# Rename columns
let renamed = expr
  |> rename("sample1", "control_1")
  |> rename("sample2", "control_2")
  |> rename("sample3", "treated_1")

# Add a computed column
let with_mean = expr |> mutate("mean_expr", |row| (row.sample1 + row.sample2 + row.sample3) / 3.0)
print(with_mean)

Step 3 — Filtering Rows

The filter() function keeps rows that match a condition.

# Filter by group
let controls = expr |> filter(|row| row.group == "control")
print(f"Control genes: {nrow(controls)}")

# Filter by expression level
let high_expr = expr |> filter(|row| row.sample1 > 100.0)

# Combine conditions — use |> into for left-to-right binding
expr |> filter(|row|
  row.group == "case" and row.sample1 > 200.0
) |> into interesting

# Filter using a list of gene names
let target_genes = ["TP53", "BRCA1", "EGFR", "KRAS"]
expr |> filter(|row| row.gene in target_genes) |> into targets
print(f"Found {nrow(targets)} target genes")

Step 4 — Sorting

# Sort by sample1 expression (ascending)
let sorted_asc = expr |> arrange("sample1")

# Sort descending
let sorted_desc = expr |> arrange("sample1", "desc")

# Get the top 5 highest expressed genes
let top5 = expr
  |> arrange("sample1", "desc")
  |> head(5)

print("Top 5 genes by expression:")
print(top5)

Step 5 — Group By and Summarize

Group rows by a column and compute aggregate statistics per group.

# Group by the "group" column and compute stats
let groups = expr |> group_by("group")

# summarize takes a Map (from group_by) and a callback |key, subtable| => Record
let summary = summarize(groups, |key, g| {
  group:   key,
  count:   nrow(g),
  mean_s1: mean(col(g, "sample1")),
  max_s1:  max(col(g, "sample1")),
  std_s1:  stdev(col(g, "sample1")),
})

print(summary)
# ┌─────────┬───────┬────────┬────────┬────────┐
# │ group   │ count │ mean_s1│ max_s1 │ std_s1 │
# ├─────────┼───────┼────────┼────────┼────────┤
# │ control │    50 │  125.4 │  342.1 │   67.8 │
# │ case    │    50 │  198.7 │  534.1 │   89.2 │
# └─────────┴───────┴────────┴────────┴────────┘

# Add a flag column, then group and summarize
let flagged = expr |> mutate("high_expr", |row| row.sample1 > 200.0)
let flag_groups = flagged |> group_by("group")
let flag_summary = summarize(flag_groups, |key, g| {
  group: key,
  n:     nrow(g),
})

Step 6 — Joining Tables

Combine tables using SQL-style joins.

# requires: data/gene_annotations.csv in working directory
# Load a gene annotation table
let annotations = csv("data/gene_annotations.csv")
# gene, chromosome, start, end, description

# Inner join: only genes present in both tables
let annotated = inner_join(expr, annotations, "gene")

print(colnames(annotated))
# [gene, sample1, sample2, sample3, group, chromosome, start, end, description]

# Left join: keep all expression rows, add annotations where available
let full = left_join(expr, annotations, "gene")

# Other join types are also available:
# right_join(t1, t2, "key"), outer_join(t1, t2, "key"),
# anti_join(t1, t2, "key"), semi_join(t1, t2, "key")

Step 7 — Pivoting and Reshaping

# Pivot from wide to long format
let long = pivot_longer(expr, ["sample1", "sample2", "sample3"], "sample", "expression")

print(long)
# ┌──────┬─────────┬─────────┬────────────┐
# │ gene │ group   │ sample  │ expression │
# ├──────┼─────────┼─────────┼────────────┤
# │ TP53 │ control │ sample1 │      142.5 │
# │ TP53 │ control │ sample2 │      138.2 │
# │ TP53 │ control │ sample3 │      155.8 │
# │ ...  │ ...     │ ...     │        ... │
# └──────┴─────────┴─────────┴────────────┘

# Pivot back from long to wide
let wide = pivot_wider(long, "sample", "expression")

Step 8 — Column Operations

# Access a column as a list
let s1 = col(expr, "sample1")
print(f"Mean: {mean(s1)}")
print(f"Median: {median(s1)}")
print(f"Std: {round(stdev(s1), 2)}")

# Apply a function to each value in a column
let log_expr = expr |> mutate("log_sample1", |row| log2(row.sample1 + 1.0))

# Create a new column from multiple columns
let with_ratio = expr |> mutate("ratio", |row| row.sample1 / (row.sample2 + 0.001))

# Conditional column
let classified = expr |> mutate("expression_level", |row|
  if row.sample1 > 500.0 then "high"
  else if row.sample1 > 100.0 then "medium"
  else "low"
)

Step 9 — Creating Tables from Scratch

# Build a table from a list of records
let data = from_records([
  { gene: "TP53",  expr: 142.5, pvalue: 0.001 },
  { gene: "BRCA1", expr:  87.3, pvalue: 0.023 },
  { gene: "MYC",   expr: 512.4, pvalue: 0.0001 },
  { gene: "EGFR",  expr: 245.1, pvalue: 0.045 },
])

print(data)
# ┌───────┬───────┬────────┐
# │ gene  │ expr  │ pvalue │
# ├───────┼───────┼────────┤
# │ TP53  │ 142.5 │  0.001 │
# │ BRCA1 │  87.3 │  0.023 │
# │ MYC   │ 512.4 │ 0.0001 │
# │ EGFR  │ 245.1 │  0.045 │
# └───────┴───────┴────────┘

Step 10 — Writing Output

# Write to CSV
write_csv(annotated, "results/annotated_genes.csv")

# Write to TSV
write_tsv(annotated, "results/annotated_genes.tsv")

# Write top 20 genes sorted by expression
let top20 = annotated
  |> arrange("sample1", "desc")
  |> head(20)
write_csv(top20, "results/top20_genes.csv")

# Complete pipeline: read, process, write
let result = csv("data/expression.csv")
  |> filter(|row| row.group == "case")
  |> mutate("mean", |row| (row.sample1 + row.sample2 + row.sample3) / 3.0)
  |> arrange("mean", "desc")
  |> select("gene", "mean", "group")
write_csv(result, "results/top_case_genes.csv")

print("Pipeline complete.")

Next Steps

Now that you can work with both sequences and tables, try the Variant Analysis tutorial to combine both skills in a VCF analysis workflow.