Intermediate ~30 minutes

Statistical Analysis

BioLang includes a comprehensive statistics library designed for biological data. This tutorial covers hypothesis testing, correlation, dimensionality reduction, and clustering -- all the tools you need to analyze experimental results.

What you will learn

  • Descriptive statistics and distributions
  • t-tests, Wilcoxon, and paired comparisons
  • ANOVA
  • Correlation and regression
  • PCA visualization
  • Clustering with heatmaps
Run this tutorial: Download statistics.bl and run it with bl run examples/tutorials/statistics.bl

Step 1 — Descriptive Statistics

# requires: data/experiment.csv in working directory
# stats.bio — statistical analysis

let data = csv("data/experiment.csv")

# Extract a column as a list (table field access returns column)
let values = col(data, "expression")

print("=== Descriptive Statistics ===")
print(f"N:        {len(values)}")
print(f"Mean:     {round(mean(values), 3)}")
print(f"Median:   {round(median(values), 3)}")
print(f"Std Dev:  {round(stdev(values), 3)}")
print(f"Variance: {round(variance(values), 3)}")
print(f"Min:      {min(values)}")
print(f"Max:      {max(values)}")
print(f"Range:    {max(values) - min(values)}")

# Quantiles
let q25 = quantile(values, 0.25)
let q50 = quantile(values, 0.50)
let q75 = quantile(values, 0.75)
print(f"\n25th percentile: {round(q25, 3)}")
print(f"50th percentile: {round(q50, 3)}")
print(f"75th percentile: {round(q75, 3)}")
print(f"IQR:             {round(q75 - q25, 3)}")

# Summary for all numeric columns at once
let summary = describe(data)
print(summary)

Step 2 — Comparing Distributions

# Extract groups for comparison
data |> filter(|r| r.group == "control") |> into ctrl_rows
data |> filter(|r| r.group == "treated") |> into treat_rows
let control = col(ctrl_rows, "expression")
let treated = col(treat_rows, "expression")

# Kolmogorov-Smirnov test: do two samples come from the same distribution?
let ks = ks_test(control, treated)
print("=== KS Test (control vs treated) ===")
print(f"Statistic: {round(ks.statistic, 4)}")
print(f"p-value:   {round(ks.pvalue, 4)}")

if ks.pvalue < 0.05 {
  print("Distributions differ significantly — consider non-parametric tests")
} else {
  print("No significant difference in distributions — parametric tests are appropriate")
}

# Compare basic distribution properties
print(f"\nControl — mean: {round(mean(control), 3)}, stdev: {round(stdev(control), 3)}")
print(f"Treated — mean: {round(mean(treated), 3)}, stdev: {round(stdev(treated), 3)}")

Step 3 — t-Tests and Non-Parametric Alternatives

# Independent two-sample t-test (Welch's by default)
let t_result = ttest(control, treated)
print("=== Independent t-Test (Welch's) ===")
print(f"t-statistic: {round(t_result.t_statistic, 4)}")
print(f"p-value:     {round(t_result.p_value, 6)}")
print(f"df:          {round(t_result.df, 1)}")

# Paired t-test (for before/after measurements)
let before = col(data, "before_treatment")
let after  = col(data, "after_treatment")
let paired = ttest_paired(before, after)
print(f"\nPaired t-test p-value: {round(paired.p_value, 6)}")

# Non-parametric: Wilcoxon rank-sum test (two independent samples)
let w = wilcoxon(control, treated)
print("\n=== Wilcoxon Rank-Sum Test ===")
print(f"U-statistic: {round(w.u_statistic, 1)}")
print(f"p-value:     {round(w.p_value, 6)}")

Step 4 — ANOVA

# One-way ANOVA for multiple groups
# group_by returns a Map of group_name -> Table
let groups = group_by(data, "treatment")

# Extract expression columns from each group into a list of lists
let group_names = colnames(data) |> filter(|c| c == "treatment")
let treatment_names = col(data, "treatment") |> unique()

let group_lists = treatment_names |> map(|name| {
  let subset = data |> filter(|r| r.treatment == name)
  col(subset, "expression")
})

let anova_result = anova(group_lists)
print("=== One-Way ANOVA ===")
print(f"F-statistic: {round(anova_result.f_statistic, 3)}")
print(f"p-value:     {round(anova_result.p_value, 6)}")
print(f"df between:  {anova_result.df_between}")
print(f"df within:   {anova_result.df_within}")

# If significant, do pairwise t-tests with p-value adjustment
if anova_result.p_value < 0.05 {
  print("\n=== Pairwise Comparisons ===")
  let pvals = []
  let labels = []
  for i in 0..len(treatment_names) {
    for j in (i+1)..len(treatment_names) {
      let a = group_lists[i]
      let b = group_lists[j]
      let t = ttest(a, b)
      pvals = pvals + [t.p_value]
      labels = labels + [f"{treatment_names[i]} vs {treatment_names[j]}"]
    }
  }
  let adjusted = p_adjust(pvals, "bh")
  for k in 0..len(labels) {
    let sig = if adjusted[k] < 0.05 { "*" } else { "ns" }
    print(f"  {labels[k]}: p={round(adjusted[k], 4)} {sig}")
  }
}

Step 5 — Correlation

# Pearson correlation (returns a Float)
let x = col(data, "gene_a_expression")
let y = col(data, "gene_b_expression")

let r = cor(x, y)
print("=== Pearson Correlation ===")
print(f"r: {round(r, 4)}")

# Spearman rank correlation (returns a Record)
let rho = spearman(x, y)
print(f"\nSpearman rho: {round(rho.coefficient, 4)}")
print(f"Spearman p:   {round(rho.pvalue, 4)}")

# Kendall rank correlation (returns a Record)
let tau = kendall(x, y)
print(f"Kendall tau:  {round(tau.coefficient, 4)}")
print(f"Kendall p:    {round(tau.pvalue, 4)}")

# Compute correlations for multiple gene pairs
let gene_cols = ["gene_a", "gene_b", "gene_c", "gene_d", "gene_e"]
print("\n=== Pairwise Correlations ===")
for i in 0..len(gene_cols) {
  for j in (i+1)..len(gene_cols) {
    let a = col(data, gene_cols[i])
    let b = col(data, gene_cols[j])
    let r = cor(a, b)
    print(f"  {gene_cols[i]} ~ {gene_cols[j]}: r={round(r, 3)}")
  }
}

Step 6 — Linear Regression

# Simple linear regression: lm(x_list, y_list)
let dosage = col(data, "dosage")
let expression = col(data, "expression")
let model = lm(dosage, expression)

print("=== Linear Regression ===")
print(f"Intercept:  {round(model.intercept, 3)}")
print(f"Slope:      {round(model.slope, 3)}")
print(f"R-squared:  {round(model.r_squared, 4)}")
print(f"p-value:    {round(model.p_value, 6)}")

# Visualize the fit
plot(data, {
  x:      "dosage",
  y:      "expression",
  kind:   "scatter",
  title:  "Expression vs Dosage",
  output: "results/regression.svg",
})

Step 7 — PCA Visualization

# requires: data/expression_matrix.csv in working directory
# Load expression matrix (samples as rows, genes as columns)
let expr = csv("data/expression_matrix.csv")

# pca_plot performs PCA and renders a scatter plot of PC1 vs PC2
# It accepts a table and a config record
pca_plot(expr, {
  color_by: "group",
  label:    "sample",
  title:    "PCA — RNA-seq Samples",
  output:   "results/pca.svg",
})

print("PCA plot saved to results/pca.svg")

Step 8 — Clustering and Heatmaps

# Clustered heatmap with hierarchical clustering built in
let all_cols = colnames(expr)
let gene_cols = all_cols |> filter(|c| c != "sample" and c != "group")
let matrix = select(expr, gene_cols)

clustered_heatmap(matrix, {
  row_labels: col(expr, "sample"),
  title:      "Sample Clustering — Expression",
  output:     "results/clustered_heatmap.svg",
})

print("Clustered heatmap saved to results/clustered_heatmap.svg")

Step 9 — Multiple Testing and FDR

# When running many tests, we need to correct for multiple comparisons

# Run a t-test per gene
let all_cols = colnames(expr)
let gene_cols = all_cols |> filter(|c| c != "sample" and c != "group")

let results = gene_cols |> map(|gene| {
  let ctrl_rows = expr |> filter(|r| r.group == "control")
  let treat_rows = expr |> filter(|r| r.group == "treated")
  let ctrl = col(ctrl_rows, gene)
  let treat = col(treat_rows, gene)
  let result = ttest(ctrl, treat)
  { gene: gene, pvalue: result.p_value, t_stat: result.t_statistic }
}) |> to_table()

# Collect raw p-values and apply correction methods
let raw_pvals = col(results, "pvalue")

let bh_adjusted   = p_adjust(raw_pvals, "bh")
let bonf_adjusted  = p_adjust(raw_pvals, "bonferroni")
let holm_adjusted  = p_adjust(raw_pvals, "holm")

# Add adjusted columns using mutate
let corrected = results
  |> mutate("bh_fdr", |row| bh_adjusted[row._index])
  |> mutate("bonferroni", |row| bonf_adjusted[row._index])
  |> mutate("holm", |row| holm_adjusted[row._index])

# Compare methods
for method in ["bh_fdr", "bonferroni", "holm"] {
  let n_sig = corrected |> filter(|r| r[method] < 0.05) |> nrow()
  print(f"{method}: {n_sig} significant genes")
}

# Write results
corrected
  |> arrange("bh_fdr")
  |> write_csv("results/gene_tests.csv")

Step 10 — Complete Statistical Workflow

# stats_pipeline.bio — full statistical analysis

fn statistical_analysis(data_file, output_dir) {
  let data = csv(data_file)

  # 1. Descriptive statistics
  let desc = describe(data)
  desc |> write_csv(f"{output_dir}/descriptive_stats.csv")

  # 2. Summary per group
  let treatment_names = col(data, "treatment") |> unique()
  for name in treatment_names {
    let subset = data |> filter(|r| r.treatment == name)
    let vals = col(subset, "expression")
    print(f"{name}: mean={round(mean(vals), 3)}, stdev={round(stdev(vals), 3)}, n={len(vals)}")
  }

  # 3. ANOVA across groups
  let group_lists = treatment_names |> map(|name| {
    let subset = data |> filter(|r| r.treatment == name)
    col(subset, "expression")
  })
  let anova_result = anova(group_lists)
  print(f"\nANOVA: F={round(anova_result.f_statistic, 3)}, p={round(anova_result.p_value, 6)}")

  # 4. PCA visualization
  let all_cols = colnames(data)
  let numeric_cols = all_cols |> filter(|c| c != "sample" and c != "treatment" and c != "group")
  pca_plot(select(data, numeric_cols + ["treatment"]), {
    color_by: "treatment",
    output:   f"{output_dir}/pca.svg",
  })

  # 5. Clustered heatmap
  clustered_heatmap(select(data, numeric_cols), {
    title:  "Expression Heatmap",
    output: f"{output_dir}/heatmap.svg",
  })

  print(f"\nAnalysis complete. Results in {output_dir}/")
}

statistical_analysis("data/experiment.csv", "results")

Next Steps

Learn how to create publication-quality figures in the Visualization tutorial.