Data Wrangling
Bioinformatics involves heavy data manipulation. BioLang provides first-class support for CSV, TSV, JSON, and tabular data with pipe-friendly transforms that feel natural for data processing workflows.
Sample data: BioLang includes CSV, TSV, and other
sample files in examples/sample-data/. Use
examples/sample-data/samples.csv,
examples/sample-data/gene_counts.csv, and
examples/sample-data/counts.tsv to follow along.
Loading Data
CSV files
# Load a CSV with automatic header detection
let samples = read_csv("examples/sample-data/samples.csv")
# Preview the table: pass it through head(5) and print the result
samples |> head(5) |> print
# TSV is tab-delimited
# tsv() is called here on a file with a .txt extension
let data = tsv("data.txt")
# Print the output of describe for the loaded table
data |> describe |> print
TSV files
# Load a tab-delimited file (here, a gene expression matrix)
let expr = tsv("expression_matrix.tsv")
# Report the size of the table
f"Rows: {len(expr)}" |> print
# Build the describe output first, then print it
let summary = expr |> describe
summary |> print
Reading text files
# Read the text file as a collection of lines
let lines = read_lines("sample_ids.txt")
# Show the result of take(5) on those lines
let preview = lines |> take(5)
print(preview)
# Print how many entries were read
lines |> len |> print
Column Operations
Selecting columns
# Load the sample gene count table
let data = read_csv("examples/sample-data/gene_counts.csv")
# Select specific columns
# Only the three named columns are passed on; head(10) limits what is printed
data |> select("gene_id", "sample_1", "sample_2") |> head(10) |> print
# Drop columns
# The table without the two named columns is written to clean.csv
data |> drop_cols("internal_id", "notes") |> write_csv("clean.csv")
Renaming columns
# Load the table, apply two rename steps in sequence, and write the result as TSV.
# Each rename call takes two column names; presumably (old, new), so "chr"
# becomes "chrom" and "pos" becomes "position" — confirm the argument order.
let data = tsv("results.tsv")
data
|> rename("chr", "chrom")
|> rename("pos", "position")
|> write_tsv("renamed.tsv")
Adding computed columns
# Add two computed columns to the variants table, then write it out as TSV:
#   "length" - row["end"] minus row["start"]
#   "label"  - a readable name chosen from row["type"]; any value other than
#              "SNP", "INS" or "DEL" falls through to "Other"
let variants = tsv("variants.tsv")
variants
|> mutate("length", |row| { row["end"] - row["start"] })
|> mutate("label", |row| {
match row["type"] {
"SNP" => "Single Nucleotide",
"INS" => "Insertion",
"DEL" => "Deletion",
_ => "Other"
}
})
|> write_tsv("annotated.tsv")
Filtering and Sorting
Row filtering
let data = read_csv("data/expression.csv")
# Filter significant genes
# Two filters are chained: keep rows whose padj is below 0.05, then rows whose
# absolute log2FoldChange is above 1.0. Both values go through float before
# being compared.
let sig = data
|> filter(|row| { row["padj"] |> float < 0.05 })
|> filter(|row| { (row["log2FoldChange"] |> float) |> abs > 1.0 })
# Report how many rows passed both filters, then save them
print(f"Significant DE genes: {sig |> len}")
sig |> write_csv("significant_genes.csv")
Sorting
let data = read_csv("data/expression.csv")
# Sort by a single column
# The closure receives two rows and returns the difference of their p_value
# fields (presumably ascending order, given the reversed form below is
# labelled descending); the first 20 rows are printed
data |> sort_by(|a, b| a.p_value - b.p_value) |> head(20) |> print
# Sort descending
# Same pattern with the operands swapped (b - a) on fold_change
data |> sort_by(|a, b| b.fold_change - a.fold_change) |> head(10) |> print
# Multi-column sort
# NOTE(review): unlike the two comparators above, this closure never reads b
# and returns a list rather than a difference — confirm that sort_by accepts
# this key-list form before relying on it.
data |> sort_by(|a, b| [a.chromosome, a.position]) |> write_csv("sorted.csv")
Grouping and Aggregation
Group by and summarize
let counts = tsv("gene_counts.tsv")
# Count genes per chromosome
# Rows are grouped on "chrom". For each group the summarize closure receives
# the group key and its rows and builds a record holding the key, the number
# of rows, the mean of the length field and the max of the expression field.
# The summary is then ordered with the b.n_genes - a.n_genes comparator
# (the descending form) and printed.
counts
|> group_by("chrom")
|> summarize(|chrom, rows| {
chrom: chrom,
n_genes: len(rows),
avg_length: rows |> map(|r| r.length) |> mean,
max_expr: rows |> map(|r| r.expression) |> max
})
|> sort_by(|a, b| b.n_genes - a.n_genes)
|> print
Pivot / reshape
# Wide to long format
let wide = read_csv("data/expression.csv")
# Reshape the three listed sample columns with pivot_longer, passing "sample"
# and "expression" as the names for the reshaped data, and save the long table
# to its own file so the wide input is left untouched
wide
|> pivot_longer(["sample_a", "sample_b", "sample_c"], "sample", "expression")
|> write_csv("expression_long.csv")
# Long to wide format
# Read back the long table written above and reverse the reshape
let long_data = read_csv("expression_long.csv")
long_data
|> pivot_wider("sample", "expression")
|> write_csv("expression_wide_again.csv")
Merging and Joining
Join tables
# NOTE(review): both joins below key on "gene_id", but the column list noted
# for counts.tsv (chrom, start, end, count) does not include gene_id — confirm
# the actual columns of that file or the intended join key.
let genes = tsv("genes.tsv") # gene_id, gene_name, chrom
let counts = tsv("examples/sample-data/counts.tsv") # chrom, start, end, count
# Inner join on gene_id
let merged = genes |> inner_join(counts, "gene_id")
merged |> head(5) |> print
# Left join to keep all genes (unmatched get nil)
let full = genes |> left_join(counts, "gene_id")
# Rows whose count is nil are the genes that had no match in counts
let missing = full |> filter(|row| { row["count"] == nil })
print(f"Genes without counts: {missing |> len}")
Concatenating tables
# Stack tables vertically
# NOTE(review): the two "batches" are loaded from differently named files
# (expression.csv and counts.csv) — confirm they share the same columns, which
# stacking presumably requires.
let batch1 = read_csv("data/expression.csv")
let batch2 = read_csv("data/counts.csv")
let combined = concat([batch1, batch2])
combined |> write_csv("all_results.csv")
# Add a batch label
# Each table gets a constant "batch" column (the closure ignores its row
# argument) before the two are concatenated
let labeled = concat([
batch1 |> mutate("batch", |_| "batch1"),
batch2 |> mutate("batch", |_| "batch2")
])
labeled |> write_csv("labeled_results.csv")
String Operations on Data
let data = tsv("annotations.tsv")
# Extract gene symbol from description
# The "symbol" column is built by splitting description on ";", taking the
# first piece and passing it through trim. Rows are then kept only when that
# symbol starts with "BRC", and the result is printed.
data
|> mutate("symbol", |row| {
row["description"] |> split(";") |> first |> trim
})
|> filter(|row| { row["symbol"] |> starts_with("BRC") })
|> print
Writing Output
# Build the result table: keep rows whose quality (passed through float) is
# above 30.0, then order them with the b.score - a.score comparator
let results = read_csv("data/expression.csv")
|> filter(|row| { row["quality"] |> float > 30.0 })
|> sort_by(|a, b| b.score - a.score)
# Write in multiple formats
results |> write_csv("output.csv")
results |> write_tsv("output.tsv")
# Report how many rows were in the written table
print(f"Wrote {results |> len} rows")