Single-Cell Analysis

Single-cell RNA sequencing analysis in BioLang covers the full workflow from raw count matrices through quality control, normalization, dimensionality reduction, clustering, and marker gene identification.

Loading Count Matrices

Reading 10x Genomics output

# Load 10x MEX format: barcodes, genes, and sparse matrix
let barcodes = read_lines("filtered_feature_bc_matrix/barcodes.tsv")
let genes_file = read_lines("filtered_feature_bc_matrix/features.tsv")
let gene_names = genes_file |> map(|line| { line |> split("\t") |> last })

# Read Matrix Market file and build sparse matrix
let mtx_lines = read_lines("filtered_feature_bc_matrix/matrix.mtx")
  |> filter(|line| { !(line |> starts_with("%")) })
let header = mtx_lines |> first |> split(" ")
let n_genes = header[0] |> int
let n_cells = header[1] |> int
let entries = mtx_lines |> tail(1)

# Build sparse matrix from (row, col, value) triplets
let rows = entries |> map(|line| { let parts = line |> split(" "); (parts[0] |> int) - 1 })
let cols = entries |> map(|line| { let parts = line |> split(" "); (parts[1] |> int) - 1 })
let vals = entries |> map(|line| { let parts = line |> split(" "); parts[2] |> float })
let counts = sparse_matrix(rows, cols, vals)

print(f"Loaded: {n_cells} cells x {n_genes} genes")
print(f"Non-zero entries: {counts |> nnz}")
print(f"Sparsity: {(1.0 - (counts |> nnz |> float) / (n_cells * n_genes |> float)) * 100.0 |> round(1)}%")

Reading from CSV

# Load a dense count matrix (genes as columns, cells as rows)
let counts_table = read_csv("data/counts.csv")
let metadata = read_csv("data/sample_sheet.csv")

print(f"Count matrix: {counts_table |> len} cells")
print(f"Metadata fields: {metadata |> head(1) |> keys |> join(", ")}")

Quality Control

Cell and gene filtering

# Assume counts is a sparse matrix (cells x genes) loaded above
# Compute per-cell QC: total UMIs and genes detected
let cell_totals = counts |> sparse_row_sums
let genes_per_cell = counts |> to_dense |> mat_map(|x| { if x > 0.0 { 1.0 } else { 0.0 } }) |> row_sums

# Identify mitochondrial genes (names starting with "MT-")
let mito_mask = gene_names |> map(|g| { if g |> starts_with("MT-") { 1.0 } else { 0.0 } })

# Compute mito fraction per cell from dense matrix
let dense = counts |> to_dense
let mito_sums = range(0, n_cells) |> map(|i| {
  let cell_row = row(dense, i)
  zip(cell_row, mito_mask) |> map(|pair| { pair[0] * pair[1] }) |> sum
})
let pct_mito = zip(mito_sums, cell_totals) |> map(|pair| {
  if pair[1] > 0.0 { pair[0] / pair[1] * 100.0 } else { 0.0 }
})

print("QC Summary:")
print(f"  Median genes/cell: {genes_per_cell |> median |> round(0)}")
print(f"  Median UMIs/cell:  {cell_totals |> median |> round(0)}")
print(f"  Median %mito:      {pct_mito |> median |> round(1)}%")

# Filter cells by QC thresholds
let keep_cells = range(0, n_cells) |> filter(|i| {
  genes_per_cell[i] > 200.0 &&
  genes_per_cell[i] < 5000.0 &&
  cell_totals[i] > 500.0 &&
  pct_mito[i] < 20.0
})
print(f"Cells passing QC: {keep_cells |> len} / {n_cells}")

# Filter genes (expressed in at least 3 cells)
let gene_cell_counts = counts |> to_dense |> mat_map(|x| { if x > 0.0 { 1.0 } else { 0.0 } }) |> col_sums
let keep_genes = range(0, n_genes) |> filter(|j| { gene_cell_counts[j] >= 3.0 })
print(f"Genes retained: {keep_genes |> len} / {n_genes}")

Normalization

Standard normalization pipeline

# Library size normalization: scale each cell to target_sum, then log1p
let target_sum = 10000.0
let cell_totals = counts |> sparse_row_sums

# Normalize: divide each cell's counts by its total, multiply by target
let norm = counts |> normalize_sparse("total")
let dense_norm = norm |> to_dense

# Scale to target sum and log-transform
let scaled = dense_norm |> mat_map(|x| { x * target_sum })
let log_counts = scaled |> mat_map(|x| { log(x + 1.0) })

print(f"Normalized matrix: {log_counts |> dim}")

# Find highly variable genes (by variance across cells)
let gene_means = log_counts |> col_means
let gene_vars = range(0, n_genes) |> map(|j| {
  let col_vals = mat_col(log_counts, j)
  let mu = gene_means[j]
  col_vals |> map(|x| { pow(x - mu, 2) }) |> mean
})

# Select top 2000 most variable genes
let gene_indices = range(0, n_genes) |> sort_by(|a, b| { gene_vars[b] - gene_vars[a] })
let hvg_indices = gene_indices |> take(2000)
let hvg_names = hvg_indices |> map(|i| { gene_names[i] })
print(f"Highly variable genes: {hvg_indices |> len}")

Dimensionality Reduction

PCA and UMAP

# PCA on the log-normalized matrix (50 components)
let pca_result = pca(log_counts, 50)
let pc_coords = pca_result.coords       # cells x 50 matrix
let pc_variance = pca_result.variance    # variance explained per PC

# Check variance explained (elbow plot data)
let total_var = pc_variance |> sum
print("Variance explained by top PCs:")
for i in 0..10 {
  let pct = pc_variance[i] / total_var * 100.0
  print(f"  PC{i + 1}: {pct |> round(2)}%")
}

# UMAP embedding from top 30 PCs
let pc_30 = range(0, n_cells) |> map(|i| {
  row(pc_coords, i) |> take(30)
})
let pc_matrix = matrix(pc_30)
let umap_coords = umap(pc_matrix, 2)  # 2D embedding

# Export UMAP coordinates
let umap_table = range(0, n_cells) |> map(|i| {
  let coords = row(umap_coords, i)
  { cell: barcodes[i], umap1: coords[0], umap2: coords[1] }
})
umap_table |> write_tsv("umap_coordinates.tsv")
print(f"UMAP coordinates saved for {n_cells} cells")

Clustering

Leiden clustering

# Leiden clustering at multiple resolutions on PCA coordinates
let resolutions = [0.3, 0.5, 0.8, 1.0, 1.5]

for res in resolutions {
  let clusters = leiden(pc_matrix, res)
  let n_clusters = clusters |> unique |> len
  print(f"Resolution {res}: {n_clusters} clusters")
}

# Use resolution 0.8 as default
let clusters = leiden(pc_matrix, 0.8)

# Cluster sizes
let cluster_ids = clusters |> unique |> sort
for cl in cluster_ids {
  let size = clusters |> filter(|c| { c == cl }) |> len
  let pct = size |> float / n_cells |> float * 100.0
  print(f"  Cluster {cl}: {size} cells ({pct |> round(1)}%)")
}

Marker Gene Detection

Finding cluster markers with diff_expr

# Differential expression between each cluster and the rest
let cluster_ids = clusters |> unique |> sort

let all_markers = cluster_ids |> flat_map(|cl| {
  # Split cells into this cluster vs rest
  let in_cluster = range(0, n_cells) |> filter(|i| { clusters[i] == cl })
  let out_cluster = range(0, n_cells) |> filter(|i| { clusters[i] != cl })

  # Extract expression submatrices as lists of row vectors
  let in_expr = in_cluster |> map(|i| { row(log_counts, i) })
  let out_expr = out_cluster |> map(|i| { row(log_counts, i) })

  # diff_expr compares two groups and returns per-gene stats
  let de = diff_expr(matrix(in_expr), matrix(out_expr))

  # de is a list of {gene_index, log2fc, p_value}
  de |> sort_by(|a, b| a.p_value - b.p_value)
     |> take(20)
     |> map(|m| { {
       cluster: cl,
       gene: gene_names[m.gene_index],
       log2fc: m.log2fc |> round(3),
       p_value: m.p_value
     }})
})

# Print top 5 markers per cluster
for cl in cluster_ids {
  let top = all_markers |> filter(|m| { m.cluster == cl }) |> take(5)
  let genes = top |> map(|m| { m.gene }) |> join(", ")
  print(f"Cluster {cl}: {genes}")
}

all_markers |> write_tsv("cluster_markers.tsv")

Cell type annotation

# Annotate clusters based on known marker genes
let known_markers = {
  "T cells": ["CD3D", "CD3E", "CD4", "CD8A"],
  "B cells": ["CD19", "MS4A1", "CD79A"],
  "NK cells": ["NKG7", "GNLY", "KLRD1"],
  "Monocytes": ["CD14", "LYZ", "S100A8"],
  "Dendritic": ["FCER1A", "CST3", "CLEC10A"],
  "Platelets": ["PPBP", "PF4"]
}

# Build gene name to index lookup
let gene_index = range(0, n_genes) |> reduce({}, |acc, i| {
  acc[gene_names[i]] = i
  acc
})

let cluster_ids = clusters |> unique |> sort
let cell_types_list = ["T cells", "B cells", "NK cells", "Monocytes", "Dendritic", "Platelets"]

let annotations = cluster_ids |> map(|cl| {
  let cell_indices = range(0, n_cells) |> filter(|i| { clusters[i] == cl })

  let scores = cell_types_list |> map(|cell_type| {
    let marker_genes = known_markers[cell_type]
    let score = marker_genes |> map(|gene| {
      if has_key(gene_index, gene) {
        let gi = gene_index[gene]
        cell_indices |> map(|ci| { row(log_counts, ci)[gi] }) |> mean
      } else { 0.0 }
    }) |> mean
    { cell_type: cell_type, score: score }
  }) |> sort_by(|a, b| b.score - a.score)

  let best = scores |> first
  print(f"Cluster {cl} => {best.cell_type} (score: {best.score |> round(3)})")
  { cluster: cl, cell_type: best.cell_type }
})

# Map cluster labels to cell types
let annotation_map = annotations |> reduce({}, |acc, a| {
  acc[a.cluster] = a.cell_type
  acc
})

let cell_types = clusters |> map(|cl| { annotation_map[cl] })
let type_counts = cell_types |> frequencies |> sort_by(|a, b| b.value - a.value)
type_counts |> each(|ct| { print(f"{ct.key}: {ct.value} cells") })

Saving Results

# Export embeddings with cluster and cell type annotations
let results = range(0, n_cells) |> map(|i| {
  let coords = row(umap_coords, i)
  {
    cell: barcodes[i],
    umap1: coords[0] |> round(4),
    umap2: coords[1] |> round(4),
    cluster: clusters[i],
    cell_type: cell_types[i]
  }
})

results |> write_tsv("cell_embeddings.tsv")
print(f"Saved {results |> len} cells with UMAP coordinates, clusters, and cell types")

# Also export the normalized expression matrix
log_counts |> matrix_to_table |> write_csv("normalized_expression.csv")
print("Normalized expression matrix saved")