Intermediate
~30 minutes
Statistical Analysis
BioLang includes a comprehensive statistics library designed for biological data. This tutorial covers hypothesis testing, correlation, dimensionality reduction, and clustering -- all the tools you need to analyze experimental results.
What you will learn
- Descriptive statistics and distributions
- t-tests, Wilcoxon, and paired comparisons
- ANOVA
- Correlation and regression
- PCA visualization
- Clustering with heatmaps
Run this tutorial: Download
statistics.bl
and run it with
bl run examples/tutorials/statistics.bl
Step 1 — Descriptive Statistics
# requires: data/experiment.csv in working directory
# stats.bio — statistical analysis
let data = csv("data/experiment.csv")
# Extract a column as a list (table field access returns column)
let values = col(data, "expression")
print("=== Descriptive Statistics ===")
print(f"N: {len(values)}")
print(f"Mean: {round(mean(values), 3)}")
print(f"Median: {round(median(values), 3)}")
print(f"Std Dev: {round(stdev(values), 3)}")
print(f"Variance: {round(variance(values), 3)}")
print(f"Min: {min(values)}")
print(f"Max: {max(values)}")
print(f"Range: {max(values) - min(values)}")
# Quantiles
let q25 = quantile(values, 0.25)
let q50 = quantile(values, 0.50)
let q75 = quantile(values, 0.75)
print(f"\n25th percentile: {round(q25, 3)}")
print(f"50th percentile: {round(q50, 3)}")
print(f"75th percentile: {round(q75, 3)}")
print(f"IQR: {round(q75 - q25, 3)}")
# Summary for all numeric columns at once
let summary = describe(data)
print(summary)
Step 2 — Comparing Distributions
# Extract groups for comparison
data |> filter(|r| r.group == "control") |> into ctrl_rows
data |> filter(|r| r.group == "treated") |> into treat_rows
let control = col(ctrl_rows, "expression")
let treated = col(treat_rows, "expression")
# Kolmogorov-Smirnov test: do two samples come from the same distribution?
let ks = ks_test(control, treated)
print("=== KS Test (control vs treated) ===")
print(f"Statistic: {round(ks.statistic, 4)}")
print(f"p-value: {round(ks.pvalue, 4)}")
if ks.pvalue < 0.05 {
print("Distributions differ significantly — consider non-parametric tests")
} else {
print("No significant difference in distributions — parametric tests are appropriate")
}
# Compare basic distribution properties
print(f"\nControl — mean: {round(mean(control), 3)}, stdev: {round(stdev(control), 3)}")
print(f"Treated — mean: {round(mean(treated), 3)}, stdev: {round(stdev(treated), 3)}")
Step 3 — t-Tests and Non-Parametric Alternatives
# Independent two-sample t-test (Welch's by default)
let t_result = ttest(control, treated)
print("=== Independent t-Test (Welch's) ===")
print(f"t-statistic: {round(t_result.t_statistic, 4)}")
print(f"p-value: {round(t_result.p_value, 6)}")
print(f"df: {round(t_result.df, 1)}")
# Paired t-test (for before/after measurements)
let before = col(data, "before_treatment")
let after = col(data, "after_treatment")
let paired = ttest_paired(before, after)
print(f"\nPaired t-test p-value: {round(paired.p_value, 6)}")
# Non-parametric: Wilcoxon rank-sum test (two independent samples)
let w = wilcoxon(control, treated)
print("\n=== Wilcoxon Rank-Sum Test ===")
print(f"U-statistic: {round(w.u_statistic, 1)}")
print(f"p-value: {round(w.p_value, 6)}")
Step 4 — ANOVA
# One-way ANOVA for multiple groups
# group_by returns a Map of group_name -> Table
let groups = group_by(data, "treatment")
# Extract expression columns from each group into a list of lists
let group_names = colnames(data) |> filter(|c| c == "treatment")
let treatment_names = col(data, "treatment") |> unique()
let group_lists = treatment_names |> map(|name| {
let subset = data |> filter(|r| r.treatment == name)
col(subset, "expression")
})
let anova_result = anova(group_lists)
print("=== One-Way ANOVA ===")
print(f"F-statistic: {round(anova_result.f_statistic, 3)}")
print(f"p-value: {round(anova_result.p_value, 6)}")
print(f"df between: {anova_result.df_between}")
print(f"df within: {anova_result.df_within}")
# If significant, do pairwise t-tests with p-value adjustment
if anova_result.p_value < 0.05 {
print("\n=== Pairwise Comparisons ===")
let pvals = []
let labels = []
for i in 0..len(treatment_names) {
for j in (i+1)..len(treatment_names) {
let a = group_lists[i]
let b = group_lists[j]
let t = ttest(a, b)
pvals = pvals + [t.p_value]
labels = labels + [f"{treatment_names[i]} vs {treatment_names[j]}"]
}
}
let adjusted = p_adjust(pvals, "bh")
for k in 0..len(labels) {
let sig = if adjusted[k] < 0.05 { "*" } else { "ns" }
print(f" {labels[k]}: p={round(adjusted[k], 4)} {sig}")
}
}
Step 5 — Correlation
# Pearson correlation (returns a Float)
let x = col(data, "gene_a_expression")
let y = col(data, "gene_b_expression")
let r = cor(x, y)
print("=== Pearson Correlation ===")
print(f"r: {round(r, 4)}")
# Spearman rank correlation (returns a Record)
let rho = spearman(x, y)
print(f"\nSpearman rho: {round(rho.coefficient, 4)}")
print(f"Spearman p: {round(rho.pvalue, 4)}")
# Kendall rank correlation (returns a Record)
let tau = kendall(x, y)
print(f"Kendall tau: {round(tau.coefficient, 4)}")
print(f"Kendall p: {round(tau.pvalue, 4)}")
# Compute correlations for multiple gene pairs
let gene_cols = ["gene_a", "gene_b", "gene_c", "gene_d", "gene_e"]
print("\n=== Pairwise Correlations ===")
for i in 0..len(gene_cols) {
for j in (i+1)..len(gene_cols) {
let a = col(data, gene_cols[i])
let b = col(data, gene_cols[j])
let r = cor(a, b)
print(f" {gene_cols[i]} ~ {gene_cols[j]}: r={round(r, 3)}")
}
}
Step 6 — Linear Regression
# Simple linear regression: lm(x_list, y_list)
let dosage = col(data, "dosage")
let expression = col(data, "expression")
let model = lm(dosage, expression)
print("=== Linear Regression ===")
print(f"Intercept: {round(model.intercept, 3)}")
print(f"Slope: {round(model.slope, 3)}")
print(f"R-squared: {round(model.r_squared, 4)}")
print(f"p-value: {round(model.p_value, 6)}")
# Visualize the fit
plot(data, {
x: "dosage",
y: "expression",
kind: "scatter",
title: "Expression vs Dosage",
output: "results/regression.svg",
})
Step 7 — PCA Visualization
# requires: data/expression_matrix.csv in working directory
# Load expression matrix (samples as rows, genes as columns)
let expr = csv("data/expression_matrix.csv")
# pca_plot performs PCA and renders a scatter plot of PC1 vs PC2
# It accepts a table and a config record
pca_plot(expr, {
color_by: "group",
label: "sample",
title: "PCA — RNA-seq Samples",
output: "results/pca.svg",
})
print("PCA plot saved to results/pca.svg")
Step 8 — Clustering and Heatmaps
# Clustered heatmap with hierarchical clustering built in
let all_cols = colnames(expr)
let gene_cols = all_cols |> filter(|c| c != "sample" and c != "group")
let matrix = select(expr, gene_cols)
clustered_heatmap(matrix, {
row_labels: col(expr, "sample"),
title: "Sample Clustering — Expression",
output: "results/clustered_heatmap.svg",
})
print("Clustered heatmap saved to results/clustered_heatmap.svg")
Step 9 — Multiple Testing and FDR
# When running many tests, we need to correct for multiple comparisons
# Run a t-test per gene
let all_cols = colnames(expr)
let gene_cols = all_cols |> filter(|c| c != "sample" and c != "group")
let results = gene_cols |> map(|gene| {
let ctrl_rows = expr |> filter(|r| r.group == "control")
let treat_rows = expr |> filter(|r| r.group == "treated")
let ctrl = col(ctrl_rows, gene)
let treat = col(treat_rows, gene)
let result = ttest(ctrl, treat)
{ gene: gene, pvalue: result.p_value, t_stat: result.t_statistic }
}) |> to_table()
# Collect raw p-values and apply correction methods
let raw_pvals = col(results, "pvalue")
let bh_adjusted = p_adjust(raw_pvals, "bh")
let bonf_adjusted = p_adjust(raw_pvals, "bonferroni")
let holm_adjusted = p_adjust(raw_pvals, "holm")
# Add adjusted columns using mutate
let corrected = results
|> mutate("bh_fdr", |row| bh_adjusted[row._index])
|> mutate("bonferroni", |row| bonf_adjusted[row._index])
|> mutate("holm", |row| holm_adjusted[row._index])
# Compare methods
for method in ["bh_fdr", "bonferroni", "holm"] {
let n_sig = corrected |> filter(|r| r[method] < 0.05) |> nrow()
print(f"{method}: {n_sig} significant genes")
}
# Write results
corrected
|> arrange("bh_fdr")
|> write_csv("results/gene_tests.csv")
Step 10 — Complete Statistical Workflow
# stats_pipeline.bio — full statistical analysis
fn statistical_analysis(data_file, output_dir) {
let data = csv(data_file)
# 1. Descriptive statistics
let desc = describe(data)
desc |> write_csv(f"{output_dir}/descriptive_stats.csv")
# 2. Summary per group
let treatment_names = col(data, "treatment") |> unique()
for name in treatment_names {
let subset = data |> filter(|r| r.treatment == name)
let vals = col(subset, "expression")
print(f"{name}: mean={round(mean(vals), 3)}, stdev={round(stdev(vals), 3)}, n={len(vals)}")
}
# 3. ANOVA across groups
let group_lists = treatment_names |> map(|name| {
let subset = data |> filter(|r| r.treatment == name)
col(subset, "expression")
})
let anova_result = anova(group_lists)
print(f"\nANOVA: F={round(anova_result.f_statistic, 3)}, p={round(anova_result.p_value, 6)}")
# 4. PCA visualization
let all_cols = colnames(data)
let numeric_cols = all_cols |> filter(|c| c != "sample" and c != "treatment" and c != "group")
pca_plot(select(data, numeric_cols + ["treatment"]), {
color_by: "treatment",
output: f"{output_dir}/pca.svg",
})
# 5. Clustered heatmap
clustered_heatmap(select(data, numeric_cols), {
title: "Expression Heatmap",
output: f"{output_dir}/heatmap.svg",
})
print(f"\nAnalysis complete. Results in {output_dir}/")
}
statistical_analysis("data/experiment.csv", "results")
Next Steps
Learn how to create publication-quality figures in the Visualization tutorial.