Data Wrangling

Bioinformatics involves heavy data manipulation. BioLang provides first-class support for CSV, TSV, JSON, and tabular data with pipe-friendly transforms that feel natural for data processing workflows.

Sample data: BioLang includes CSV, TSV, and other sample files in examples/sample-data/. Use examples/sample-data/samples.csv, examples/sample-data/gene_counts.csv, and examples/sample-data/counts.tsv to follow along.

Loading Data

CSV files

# Load a CSV with automatic header detection
let samples = read_csv("examples/sample-data/samples.csv")
# Pipe the table through head(5), then print what comes out
samples |> head(5) |> print

# TSV is tab-delimited
# NOTE: "data.txt" is not under examples/sample-data/ -- presumably a
# placeholder for your own tab-delimited file
let data = tsv("data.txt")
# Pipe the table through describe, then print what comes out
data |> describe |> print

TSV files

# Read a TSV file (e.g., gene expression matrix)
# NOTE: "expression_matrix.tsv" is not under examples/sample-data/ --
# presumably a placeholder for your own file
let expr = tsv("expression_matrix.tsv")
# Interpolate len(expr) into the message
print(f"Rows: {len(expr)}")
# Pipe the table through describe, then print what comes out
expr |> describe |> print

Reading text files

# Load lines from a text file
# NOTE: "sample_ids.txt" is not under examples/sample-data/ -- presumably a
# placeholder for your own file
let lines = read_lines("sample_ids.txt")
# Pipe through take(5) and print the result
lines |> take(5) |> print

# Count entries
# Call style and pipe style are mixed here: len(lines) is called directly
# and its result is piped to print
len(lines) |> print

Column Operations

Selecting columns

# Load the gene_counts.csv sample file
let data = read_csv("examples/sample-data/gene_counts.csv")

# Select specific columns
# Column names are passed as separate string arguments; the result is then
# piped through head(10) and printed
data |> select("gene_id", "sample_1", "sample_2") |> head(10) |> print

# Drop columns
# The result goes straight to write_csv("clean.csv") instead of being printed
# NOTE(review): "internal_id" and "notes" are presumably columns of
# gene_counts.csv -- confirm against the sample file
data |> drop_cols("internal_id", "notes") |> write_csv("clean.csv")

Renaming columns

# Two rename calls are chained, one per column, and the result is written
# with write_tsv
# NOTE(review): each call takes two names, presumably (old, new) -- confirm
# the argument order
let data = tsv("results.tsv")
data
  |> rename("chr", "chrom")
  |> rename("pos", "position")
  |> write_tsv("renamed.tsv")

Adding computed columns

# Two mutate calls are chained; each takes a column name and a closure
# whose parameter is indexed as row["..."]
#   "length": row["end"] minus row["start"]
#   "label":  a match on row["type"] with three literal arms ("SNP", "INS",
#             "DEL") and a `_` arm yielding "Other"
# The result is written with write_tsv
let variants = tsv("variants.tsv")
variants
  |> mutate("length", |row| { row["end"] - row["start"] })
  |> mutate("label", |row| {
    match row["type"] {
      "SNP" => "Single Nucleotide",
      "INS" => "Insertion",
      "DEL" => "Deletion",
      _ => "Other"
    }
  })
  |> write_tsv("annotated.tsv")

Filtering and Sorting

Row filtering

let data = read_csv("data/expression.csv")

# Filter significant genes
# Two filters are chained:
#   1. row["padj"] piped through float, compared with < 0.05
#   2. row["log2FoldChange"] piped through float and abs, compared with > 1.0
# Both cells are passed through float before the comparison
let sig = data
  |> filter(|row| { row["padj"] |> float < 0.05 })
  |> filter(|row| { (row["log2FoldChange"] |> float) |> abs > 1.0 })

# A pipe expression can be interpolated inside an f-string placeholder
print(f"Significant DE genes: {sig |> len}")
sig |> write_csv("significant_genes.csv")

Sorting

let data = read_csv("data/expression.csv")

# Sort by a single column
# The two-argument closure returns a.p_value - b.p_value; fields are read
# with dot access here (a.p_value) rather than row["..."] indexing
data |> sort_by(|a, b| a.p_value - b.p_value) |> head(20) |> print

# Sort descending
# Same closure shape with the operands swapped (b - a)
data |> sort_by(|a, b| b.fold_change - a.fold_change) |> head(10) |> print

# Multi-column sort
# NOTE(review): unlike the two closures above, this one never uses b and
# returns a list instead of a difference -- confirm that sort_by accepts
# this form before relying on it
data |> sort_by(|a, b| [a.chromosome, a.position]) |> write_csv("sorted.csv")

Grouping and Aggregation

Group by and summarize

let counts = tsv("gene_counts.tsv")

# Count genes per chromosome
# group_by("chrom") feeds summarize, whose closure takes two arguments
# (named chrom and rows here) and returns a { key: value } literal with:
#   chrom      - the first closure argument, passed through
#   n_genes    - len(rows)
#   avg_length - mean over r.length for each r in rows
#   max_expr   - max over r.expression for each r in rows
# The summary is then ordered with a b.n_genes - a.n_genes comparator and
# printed
counts
  |> group_by("chrom")
  |> summarize(|chrom, rows| {
    chrom: chrom,
    n_genes: len(rows),
    avg_length: rows |> map(|r| r.length) |> mean,
    max_expr: rows |> map(|r| r.expression) |> max
  })
  |> sort_by(|a, b| b.n_genes - a.n_genes)
  |> print

Pivot / reshape

# Wide to long format
# pivot_longer is given the list of sample columns plus the two output
# column names ("sample" and "expression")
let wide = read_csv("data/expression.csv")
wide
  |> pivot_longer(["sample_a", "sample_b", "sample_c"], "sample", "expression")
  # Write to a separate file so the wide input is not overwritten
  |> write_csv("expression_long.csv")

# Long to wide format
# Read back the long-format file written above and pivot it on the same
# "sample" / "expression" pair
let long_data = read_csv("expression_long.csv")
long_data
  |> pivot_wider("sample", "expression")
  |> write_csv("expression_wide_again.csv")

Merging and Joining

Join tables

# Load the two tables to be joined; trailing comments list their columns
# NOTE(review): the column list given for counts.tsv has no gene_id, yet both
# joins below use "gene_id" as the key -- confirm the file's actual schema
let genes = tsv("genes.tsv")        # gene_id, gene_name, chrom
let counts = tsv("examples/sample-data/counts.tsv")  # chrom, start, end, count

# Inner join on gene_id
# The left table is piped in; the right table and the key column are arguments
let merged = genes |> inner_join(counts, "gene_id")
merged |> head(5) |> print

# Left join to keep all genes (unmatched get nil)
let full = genes |> left_join(counts, "gene_id")
# Keep only the rows whose "count" cell is nil, then report how many there are
let missing = full |> filter(|row| { row["count"] == nil })
print(f"Genes without counts: {missing |> len}")

Concatenating tables

# Stack tables vertically
let batch1 = read_csv("data/expression.csv")
let batch2 = read_csv("data/counts.csv")
let combined = concat([batch1, batch2])
combined |> write_csv("all_results.csv")

# Add a batch label
let labeled = concat([
  batch1 |> mutate("batch", |_| "batch1"),
  batch2 |> mutate("batch", |_| "batch2")
])
labeled |> write_csv("labeled_results.csv")

String Operations on Data

let data = tsv("annotations.tsv")

# Extract gene symbol from description
# "symbol" is built by splitting row["description"] on ";", taking the first
# piece and passing it through trim; the following filter then reads that
# new column and tests it with starts_with("BRC")
data
  |> mutate("symbol", |row| {
    row["description"] |> split(";") |> first |> trim
  })
  |> filter(|row| { row["symbol"] |> starts_with("BRC") })
  |> print

Writing Output

# Build the result table in one binding: read, filter on row["quality"]
# (passed through float, compared with > 30.0), then order with a
# b.score - a.score comparator
let results = read_csv("data/expression.csv")
  |> filter(|row| { row["quality"] |> float > 30.0 })
  |> sort_by(|a, b| b.score - a.score)

# Write in multiple formats
# The same table is piped to both writers, then its len is reported
results |> write_csv("output.csv")
results |> write_tsv("output.tsv")
print(f"Wrote {results |> len} rows")