Research Workflows
End-to-end research scenarios that mirror real-world bioinformatics studies. Each workflow combines data processing, statistical analysis, database queries, and publication-quality visualizations into a single self-contained script.
Jump to: Cancer Genomics · RNA-seq DE · GWAS · Gut Microbiome · Pharmacogenomics · Single-Cell · CRISPR Screen · Epigenomics · Phylogenomics · Clinical WES
Cancer Genomics
Somatic mutation analysis for a molecular tumor board: filter variants by quality and allele frequency, classify as SNV/indel, flag Tier-1 cancer driver genes, query COSMIC and ClinVar, look up drug targets in KEGG, generate a volcano plot and oncoprint, and compile a clinical report.
# requires: internet connection (NCBI, COSMIC, UniProt, KEGG APIs)
# cancer_genomics.bl — Identifying Actionable Mutations in a Tumor Sample
#
# A clinical bioinformatics workflow: starting from somatic VCF calls,
# filter for high-confidence variants, cross-reference cancer databases,
# and produce a report suitable for molecular tumor board review.
# ── 1. Load Somatic Mutation Calls ──────────────────────────────────
# Simulated VCF output from Mutect2 somatic calling (tumor vs matched normal).
# These represent real hotspot loci frequently mutated in solid tumors.
print("=== Loading somatic variants ===")
let somatic = table([
{chrom: "chr17", pos: 7578406, ref: "C", alt: "T", qual: 312.0, af: 0.42, dp: 186, gene: "TP53", aa: "R248W"},
{chrom: "chr17", pos: 7577121, ref: "G", alt: "A", qual: 287.0, af: 0.38, dp: 210, gene: "TP53", aa: "R175H"},
{chrom: "chr7", pos: 55259515, ref: "G", alt: "T", qual: 198.0, af: 0.31, dp: 142, gene: "EGFR", aa: "L858R"},
{chrom: "chr7", pos: 55249071, ref: "GAATTAAGAGAAGC", alt: "G", qual: 245.0, af: 0.27, dp: 164, gene: "EGFR", aa: "del746-750"},
{chrom: "chr12", pos: 25398284, ref: "C", alt: "A", qual: 156.0, af: 0.18, dp: 98, gene: "KRAS", aa: "G12V"},
{chrom: "chr3", pos: 178936091,ref: "G", alt: "A", qual: 89.0, af: 0.09, dp: 56, gene: "PIK3CA", aa: "H1047R"},
{chrom: "chr7", pos: 140453136,ref: "A", alt: "T", qual: 342.0, af: 0.55, dp: 224, gene: "BRAF", aa: "V600E"},
{chrom: "chr17", pos: 43094692, ref: "A", alt: "G", qual: 42.0, af: 0.04, dp: 32, gene: "BRCA1", aa: "C61G"},
{chrom: "chr13", pos: 32914438, ref: "GT", alt: "G", qual: 67.0, af: 0.06, dp: 44, gene: "BRCA2", aa: "fs1023"},
{chrom: "chr10", pos: 89692905, ref: "C", alt: "T", qual: 23.0, af: 0.02, dp: 18, gene: "PTEN", aa: "R130Q"},
{chrom: "chr5", pos: 112175770,ref: "G", alt: "A", qual: 31.0, af: 0.03, dp: 22, gene: "APC", aa: "R1450*"},
{chrom: "chr2", pos: 209113112,ref: "C", alt: "T", qual: 275.0, af: 0.35, dp: 190, gene: "IDH1", aa: "R132H"}
])
print("Loaded", len(somatic), "somatic variant calls")
print("\n=== Filtering variants ===")
# ── 2. Quality and Allele Frequency Filtering ──────────────────────
let filtered = somatic
|> filter(|v| v.qual >= 50.0)
|> filter(|v| v.af >= 0.05)
print("Variants passing QC (QUAL>=50, AF>=5%):", len(filtered))
# ── 3. Classify Variant Type ───────────────────────────────────────
print("\n=== Classifying variant types ===")
let classified = filtered |> mutate("var_type", |row| {
if len(row.ref) == 1 && len(row.alt) == 1 then "SNV"
else if len(row.alt) > len(row.ref) then "INS"
else "DEL"
})
let snvs = classified |> filter(|v| v.var_type == "SNV")
let indels = classified |> filter(|v| v.var_type != "SNV")
print(" SNVs:", len(snvs), " Indels:", len(indels))
# ── 4. Flag Known Cancer Driver Genes ──────────────────────────────
print("\n=== Flagging cancer driver genes ===")
let tier1_drivers = ["TP53", "KRAS", "EGFR", "BRAF", "PIK3CA", "BRCA1", "BRCA2", "PTEN", "IDH1"]
let drivers = classified |> filter(|v| tier1_drivers |> contains(v.gene))
print("Tier-1 driver mutations:", len(drivers))
drivers |> each(|v| {
print(" " + v.gene + " " + v.aa + " (AF=" + str(v.af) + ", " + v.var_type + ")")
})
# ── 5. COSMIC Mutation Frequency ───────────────────────────────────
print("\n=== Querying COSMIC ===")
let tp53_cosmic = cosmic_gene("TP53")
print("TP53 COSMIC entries:", len(tp53_cosmic))
let top_tp53 = sort(tp53_cosmic, |a, b| b.count - a.count) |> head(3)
for m in top_tp53 {
print(" " + m.aa + " (" + m.primary_site + ") —", m.count, "tumors")
}
# ── 6. ClinVar Pathogenicity Check ─────────────────────────────────
print("\n=== Querying ClinVar ===")
let clinvar_tp53 = ncbi_search("clinvar", "TP53[gene] AND pathogenic[clinsig]")
print("TP53 pathogenic ClinVar records:", len(clinvar_tp53))
let clinvar_brca1 = ncbi_search("clinvar", "BRCA1[gene] AND pathogenic[clinsig]")
print("BRCA1 pathogenic ClinVar records:", len(clinvar_brca1))
# ── 7. UniProt Protein Domain Annotation ───────────────────────────
print("\n=== Querying UniProt ===")
let p53_entry = uniprot_entry("P04637")
print("Protein:", p53_entry.name, "(" + str(p53_entry.sequence_length) + " aa)")
let p53_features = uniprot_features("P04637")
let domains = filter(p53_features, |f| f.type == "Domain")
print("Found", len(domains), "domains")
for d in domains {
print(" Domain: " + d.description + " [" + str(d.start) + "-" + str(d.end) + "]")
}
# ── 8. Drug Target Lookup via KEGG ─────────────────────────────────
print("\n=== Querying KEGG drug targets ===")
let egfr_drugs = kegg_find("drug", "EGFR inhibitor")
print("EGFR-targeted drugs in KEGG:", len(egfr_drugs))
for d in head(egfr_drugs, 5) {
print(" " + d.id + ": " + d.name)
}
let braf_drugs = kegg_find("drug", "BRAF inhibitor")
print("BRAF-targeted drugs:", len(braf_drugs))
# ── 9. Volcano Plot: Tumor vs Normal Expression ────────────────────
let de_results = table([
{gene: "TP53", log2fc: -2.1, pvalue: 0.00003, significant: true},
{gene: "EGFR", log2fc: 3.4, pvalue: 0.00001, significant: true},
{gene: "KRAS", log2fc: 1.8, pvalue: 0.001, significant: true},
{gene: "BRAF", log2fc: 2.9, pvalue: 0.00005, significant: true},
{gene: "IDH1", log2fc: 1.2, pvalue: 0.01, significant: true},
{gene: "PIK3CA", log2fc: 1.5, pvalue: 0.008, significant: true},
{gene: "GAPDH", log2fc: 0.1, pvalue: 0.82, significant: false},
{gene: "ACTB", log2fc: -0.2, pvalue: 0.65, significant: false},
{gene: "MYC", log2fc: 2.5, pvalue: 0.0002, significant: true},
{gene: "RB1", log2fc: -1.9, pvalue: 0.0008, significant: true},
{gene: "CDKN2A", log2fc: -3.1, pvalue: 0.00001, significant: true},
{gene: "BRCA1", log2fc: -0.8, pvalue: 0.04, significant: false}
])
print("\n=== Generating visualizations ===")
let volcano_svg = volcano(de_results, {title: "Tumor vs Normal: Differential Expression"})
save_svg(volcano_svg, "output/volcano_tumor_normal.svg")
# ── 10. Oncoprint: Mutation Landscape ──────────────────────────────
let onco_data = table([
{gene: "TP53", sample: "PT-001", type: "missense"},
{gene: "TP53", sample: "PT-002", type: "missense"},
{gene: "TP53", sample: "PT-003", type: "nonsense"},
{gene: "EGFR", sample: "PT-001", type: "missense"},
{gene: "EGFR", sample: "PT-004", type: "missense"},
{gene: "KRAS", sample: "PT-002", type: "missense"},
{gene: "KRAS", sample: "PT-005", type: "missense"},
{gene: "BRAF", sample: "PT-003", type: "missense"},
{gene: "BRAF", sample: "PT-006", type: "missense"},
{gene: "PIK3CA", sample: "PT-004", type: "missense"},
{gene: "IDH1", sample: "PT-005", type: "missense"},
{gene: "BRCA1", sample: "PT-006", type: "missense"},
{gene: "PTEN", sample: "PT-001", type: "mutation"},
{gene: "CDKN2A", sample: "PT-002", type: "mutation"},
{gene: "CDKN2A", sample: "PT-003", type: "mutation"}
])
let onco_svg = oncoprint(onco_data, {format: "svg"})
save_svg(onco_svg, "output/oncoprint_cohort.svg")
# ── 11. Summary Report ─────────────────────────────────────────────
print("\n=== Generating summary report ===")
let actionable = drivers |> filter(|v| v.af >= 0.10)
let actionable_summary = actionable |> map(|v| {
v.gene + " " + v.aa + " (VAF " + str(v.af * 100.0) + "%)"
})
let report = "# Molecular Tumor Board Report — Patient PT-001\n\n" +
"## Sample Overview\n\n" +
"Whole-exome sequencing of FFPE tumor biopsy (lung adenocarcinoma) with matched normal blood. " +
"Mean target coverage: 180x. Somatic calling via Mutect2.\n\n" +
"## Variant Summary\n\n" +
"Total somatic calls: " + str(somatic |> len) + "\n" +
"After QC filtering: " + str(filtered |> len) + "\n" +
"Tier-1 driver mutations: " + str(drivers |> len) + "\n" +
"SNVs: " + str(snvs |> len) + " | Indels: " + str(indels |> len) + "\n\n" +
"## Actionable Mutations (AF >= 10%)\n\n" +
(actionable_summary |> join("\n")) + "\n\n" +
"## Therapeutic Implications\n\n" +
"EGFR L858R — Sensitive to osimertinib (3rd-gen TKI). First-line NSCLC.\n" +
"BRAF V600E — Dabrafenib + trametinib combination approved.\n" +
"KRAS G12V — Consider MEK inhibitors.\n" +
"IDH1 R132H — Ivosidenib approved for cholangiocarcinoma and AML.\n"
write_text("output/tumor_board_report.md", report)
print("Report saved to output/tumor_board_report.md")
print("\n=== Analysis complete ===")
RNA-seq Differential Expression
Nutlin-3a treatment in HCT116 colorectal cancer cells: CPM normalization, differential expression testing, Benjamini-Hochberg correction, ORA and GSEA pathway enrichment, volcano plot, clustered heatmap, PCA, and Venn diagram.
# requires: internet connection (Reactome, KEGG APIs)
# RNA-seq Differential Expression Analysis
# Scenario: Drug-treated vs control HCT116 colorectal cancer cell lines
# Treatment: 48h exposure to 10uM Nutlin-3a (MDM2 inhibitor, activates p53)
# Design: 3 biological replicates per condition (n=6 total)
# ── 1. Sample Metadata ────────────────────────────────────────────
print("=== 1. Sample Metadata ===")
let samples = table(
sample_id: ["CTL_1", "CTL_2", "CTL_3", "TRT_1", "TRT_2", "TRT_3"],
condition: ["control", "control", "control", "treated", "treated", "treated"],
replicate: [1, 2, 3, 1, 2, 3],
cell_line: ["HCT116", "HCT116", "HCT116", "HCT116", "HCT116", "HCT116"],
batch: ["A", "A", "B", "A", "B", "B"]
)
print(f"Samples loaded: {len(samples)} total")
# ── 2. Raw Count Matrix ───────────────────────────────────────────
print("=== 2. Raw Count Matrix ===")
let genes = ["TP53", "MYC", "BRCA1", "GAPDH", "ACTB",
"CDK2", "BCL2", "BAX", "EGFR", "VEGFA", "HIF1A", "MDM2"]
let raw_counts = matrix(
rows: genes,
cols: samples.sample_id,
data: [
[ 820, 780, 850, 3200, 3450, 3100], # TP53
[ 5400, 5100, 5350, 1200, 1050, 1300], # MYC
[ 1100, 1050, 1200, 1800, 1750, 1900], # BRCA1
[12000, 11800, 12200, 11900, 12100, 11700], # GAPDH
[ 9500, 9700, 9400, 9600, 9300, 9800], # ACTB
[ 3800, 3900, 3700, 900, 850, 950], # CDK2
[ 2600, 2500, 2700, 650, 700, 600], # BCL2
[ 450, 500, 420, 2800, 2650, 2900], # BAX
[ 3100, 3200, 2900, 1500, 1600, 1400], # EGFR
[ 2200, 2100, 2300, 550, 600, 500], # VEGFA
[ 1800, 1750, 1850, 1500, 1400, 1600], # HIF1A
[ 600, 650, 580, 4500, 4200, 4700] # MDM2
]
)
print(f"Count matrix: {len(genes)} genes x {len(samples)} samples")
# ── 3. CPM Normalization ──────────────────────────────────────────
print("=== 3. CPM Normalization ===")
let lib_sizes = raw_counts
|> map(|col| col |> sum)
let cpm_matrix = raw_counts
|> map(|col, i| col |> map(|count| count / lib_sizes[i] * 1_000_000))
print(f"Library sizes (millions): {lib_sizes |> map(|s| s / 1_000_000)}")
# ── 4. Differential Expression Testing ────────────────────────────
print("=== 4. Differential Expression Testing ===")
let control_idx = [0, 1, 2]
let treated_idx = [3, 4, 5]
let control_matrix = raw_counts |> map(|col, i| if contains(control_idx, i) { col } else { nil }) |> filter(|col| col != nil)
let treated_matrix = raw_counts |> map(|col, i| if contains(treated_idx, i) { col } else { nil }) |> filter(|col| col != nil)
let de_results = diff_expr(control_matrix, treated_matrix)
print(f"DE test complete: {len(de_results)} genes tested")
# ── 5. Log2 Fold Change Calculation ───────────────────────────────
print("=== 5. Log2 Fold Change Calculation ===")
de_results = de_results
|> mutate("log2fc", |row| log2(row.mean_b / row.mean_a))
print("Result: log2FC computed for", len(de_results), "genes")
# ── 6. Multiple Testing Correction (Benjamini-Hochberg) ──────────
print("=== 6. Multiple Testing Correction (BH) ===")
let pvalues = de_results |> map(|row| row.pvalue)
let adjusted = p_adjust(pvalues, "bh")
de_results = de_results
|> mutate("padj", |row, i| adjusted[i])
de_results = de_results
|> sort(|a, b| a.padj - b.padj)
print("Top hits by adjusted p-value:")
de_results |> filter(|row| row.padj < 0.05)
|> map(|row| print(f" {row.gene}\tlog2FC={row.log2fc:.2}\tpadj={row.padj:.2e}"))
# ── 7. Filter Significant Genes ───────────────────────────────────
print("=== 7. Filter Significant Genes ===")
let sig_genes = de_results
|> filter(|row| row.padj < 0.05 && abs(row.log2fc) > 1.0)
let up_genes = sig_genes |> filter(|row| row.log2fc > 0) |> map(|row| row.gene)
let down_genes = sig_genes |> filter(|row| row.log2fc < 0) |> map(|row| row.gene)
print(f"Significant DE genes: {len(sig_genes)} total")
print(f" Upregulated: {len(up_genes)} {up_genes}")
print(f" Downregulated: {len(down_genes)} {down_genes}")
# ── 8. Pathway Enrichment Analysis ────────────────────────────────
print("=== 8. Pathway Enrichment Analysis ===")
let gene_sets = {
APOPTOSIS: ["TP53", "BAX", "BCL2", "MDM2", "CASP3", "CASP9", "BID", "BAK1"],
CELL_CYCLE: ["CDK2", "CDK4", "CCND1", "CCNE1", "RB1", "MYC", "TP53", "CDKN1A"],
PI3K_SIGNALING: ["EGFR", "VEGFA", "HIF1A", "AKT1", "MTOR", "PIK3CA", "PTEN"],
DNA_REPAIR: ["BRCA1", "BRCA2", "ATM", "ATR", "RAD51", "TP53", "CHEK2"]
}
let sig_gene_list = sig_genes |> map(|row| row.gene)
let ora_results = ora(sig_gene_list, gene_sets, genes)
print("\nORA Results:")
ora_results |> map(|row| print(f" {row.term}\tp={row.pvalue:.4f}\toverlap={row.overlap}"))
let ranked_genes = de_results
|> mutate("rank_score", |row| -log2(row.pvalue) * (row.log2fc |> sign()))
|> sort(|a, b| b.rank_score - a.rank_score)
let gsea_results = gsea(ranked_genes |> select("gene", "rank_score"), gene_sets)
print("\nGSEA Results:")
gsea_results |> map(|row| print(f" {row.term}\tNES={row.nes:.3f}\tpadj={row.padj:.4f}"))
# ── 9. Visualizations ─────────────────────────────────────────────
print("=== 9. Visualizations ===")
let vol = volcano(de_results, {
x: "log2fc", y: "padj", label: "gene",
title: "Nutlin-3a Treatment: Volcano Plot",
fc_cutoff: 1.0, pval_cutoff: 0.05
})
vol |> save_svg("output/volcano_nutlin3a.svg")
let sig_indices = range(0, len(genes)) |> filter(|i| contains(sig_gene_list, genes[i]))
let sig_cpm = matrix(sig_indices |> map(|i| cpm_matrix[i]))
let hm = clustered_heatmap({
data: sig_cpm, title: "Significant DE Genes (padj<0.05, |log2FC|>1)",
cluster_rows: true, cluster_cols: true, scale: "row"
})
hm |> save_svg("output/heatmap_sig_genes.svg")
let pca = pca_plot({
data: cpm_matrix, title: "PCA: Control vs Nutlin-3a Treated",
color_by: samples.condition, labels: samples.sample_id, n_components: 2
})
pca |> save_svg("output/pca_samples.svg")
let venn_plot = venn({
data: { A: up_genes, B: gene_sets.APOPTOSIS },
labels: ["Upregulated", "Apoptosis Pathway"],
title: "Overlap: Upregulated Genes and Apoptosis"
})
venn_plot |> save_svg("output/venn_up_apoptosis.svg")
# ── 10. Pathway Database Queries ──────────────────────────────────
print("=== 10. Pathway Database Queries ===")
let tp53_pathways = reactome_pathways("TP53")
print(f"\nTP53 Reactome pathways: {len(tp53_pathways)}")
let apoptosis_kegg = kegg_find("pathway", "apoptosis")
print("\nKEGG apoptosis pathways:")
apoptosis_kegg |> map(|entry| print(f" {entry.id}: {entry.description}"))
# ── 11. Export Results ─────────────────────────────────────────────
print("=== 11. Export Results ===")
write_csv(de_results, "output/de_results_all.csv")
write_csv(sig_genes, "output/de_significant.csv")
write_csv(ora_results, "output/ora_enrichment.csv")
write_csv(gsea_results, "output/gsea_enrichment.csv")
let report = to_markdown(de_results)
write_text("output/report.md", report)
print("\n=== Analysis complete ===")
GWAS Analysis
Type 2 Diabetes GWAS: summary statistics QC, Manhattan and QQ plots, genome-wide significance filtering, LD clumping, gene annotation via interval trees, forest plot of effect sizes, GO enrichment, and clinical report generation.
# requires: internet connection (NCBI, STRING, GO APIs)
# GWAS Analysis: Type 2 Diabetes Risk Loci
# Genome-wide association study identifying SNPs associated with T2D
# Dataset: simulated summary statistics from a meta-analysis of ~50,000 cases
# ── 1. Load GWAS Summary Statistics ──
print("=== Load GWAS Summary Statistics ===")
let gwas_data = table([
{rsid: "rs7903146", chrom: "10", pos: 114758349, effect_allele: "T", other_allele: "C", eaf: 0.30, info: 0.97, beta: 0.294, se: 0.018, pvalue: 5.2e-56, or: 1.342},
{rsid: "rs1801282", chrom: "3", pos: 12393125, effect_allele: "G", other_allele: "C", eaf: 0.12, info: 0.95, beta: 0.142, se: 0.025, pvalue: 1.3e-8, or: 1.153},
{rsid: "rs5219", chrom: "11", pos: 17409572, effect_allele: "T", other_allele: "C", eaf: 0.35, info: 0.98, beta: 0.168, se: 0.021, pvalue: 1.1e-15, or: 1.183},
{rsid: "rs13266634", chrom: "8", pos: 118184783, effect_allele: "C", other_allele: "T", eaf: 0.27, info: 0.96, beta: 0.132, se: 0.022, pvalue: 2.0e-9, or: 1.141},
{rsid: "rs7754840", chrom: "6", pos: 20661019, effect_allele: "C", other_allele: "G", eaf: 0.31, info: 0.94, beta: 0.125, se: 0.020, pvalue: 4.1e-10, or: 1.133},
{rsid: "rs10811661", chrom: "9", pos: 22134095, effect_allele: "T", other_allele: "C", eaf: 0.82, info: 0.99, beta: 0.198, se: 0.026, pvalue: 2.7e-14, or: 1.219},
{rsid: "rs4402960", chrom: "3", pos: 185511687, effect_allele: "T", other_allele: "G", eaf: 0.29, info: 0.97, beta: 0.113, se: 0.020, pvalue: 1.5e-8, or: 1.120},
{rsid: "rs12779790", chrom: "10", pos: 12307894, effect_allele: "G", other_allele: "A", eaf: 0.18, info: 0.91, beta: 0.098, se: 0.024, pvalue: 4.3e-5, or: 1.103},
{rsid: "rs1111875", chrom: "10", pos: 94462882, effect_allele: "C", other_allele: "T", eaf: 0.40, info: 0.96, beta: 0.119, se: 0.019, pvalue: 3.8e-10, or: 1.126},
{rsid: "rs7961581", chrom: "12", pos: 71433293, effect_allele: "C", other_allele: "T", eaf: 0.27, info: 0.93, beta: 0.107, se: 0.021, pvalue: 3.2e-7, or: 1.113},
{rsid: "rs2237892", chrom: "11", pos: 2839751, effect_allele: "C", other_allele: "T", eaf: 0.08, info: 0.88, beta: 0.211, se: 0.035, pvalue: 1.7e-9, or: 1.235},
{rsid: "rs231362", chrom: "11", pos: 2691471, effect_allele: "G", other_allele: "A", eaf: 0.48, info: 0.97, beta: 0.088, se: 0.018, pvalue: 1.1e-6, or: 1.092},
{rsid: "rs243021", chrom: "2", pos: 60584819, effect_allele: "A", other_allele: "G", eaf: 0.47, info: 0.85, beta: 0.076, se: 0.019, pvalue: 6.2e-5, or: 1.079},
{rsid: "rs5015480", chrom: "10", pos: 94455539, effect_allele: "C", other_allele: "T", eaf: 0.42, info: 0.72, beta: 0.065, se: 0.020, pvalue: 1.1e-3, or: 1.067},
{rsid: "rs340874", chrom: "1", pos: 214159256, effect_allele: "C", other_allele: "T", eaf: 0.44, info: 0.96, beta: 0.068, se: 0.018, pvalue: 1.6e-4, or: 1.070},
{rsid: "rs9939609", chrom: "16", pos: 53820527, effect_allele: "A", other_allele: "T", eaf: 0.41, info: 0.99, beta: 0.155, se: 0.019, pvalue: 3.1e-16, or: 1.168},
{rsid: "rs896854", chrom: "8", pos: 95937502, effect_allele: "T", other_allele: "C", eaf: 0.43, info: 0.95, beta: 0.061, se: 0.019, pvalue: 1.3e-3, or: 1.063},
{rsid: "rs7578597", chrom: "2", pos: 43732823, effect_allele: "T", other_allele: "C", eaf: 0.11, info: 0.92, beta: 0.139, se: 0.029, pvalue: 1.6e-6, or: 1.149},
{rsid: "rs8050136", chrom: "16", pos: 53816275, effect_allele: "A", other_allele: "C", eaf: 0.39, info: 0.98, beta: 0.148, se: 0.019, pvalue: 6.5e-15, or: 1.160}
])
print("Loaded " + str(len(gwas_data)) + " SNPs from T2D GWAS summary statistics")
# ── 2. Quality Control Filters ──
print("=== Quality Control Filters ===")
let ambiguous_pairs = [["A","T"], ["T","A"], ["C","G"], ["G","C"]]
let qc_no_ambig = gwas_data
|> filter(|row| !contains(ambiguous_pairs, [row.effect_allele, row.other_allele]))
print("After removing ambiguous SNPs: " + str(len(qc_no_ambig)) + " remain")
let qc_maf = qc_no_ambig
|> filter(|row| min(row.eaf, 1.0 - row.eaf) > 0.01)
let qc_pass = qc_maf
|> filter(|row| row.info > 0.8)
print("After QC (MAF>0.01, INFO>0.8): " + str(len(qc_pass)) + " SNPs pass")
# ── 3. Manhattan and QQ Plots ──
print("=== Manhattan and QQ Plots ===")
manhattan(qc_pass, {threshold: 5e-8, format: "svg"})
|> save_svg("output/t2d_manhattan.svg")
let pvalues = qc_pass |> select("pvalue") |> map(|row| row.pvalue)
qq_plot(pvalues, {format: "svg"})
|> save_svg("output/t2d_qqplot.svg")
let neg_log_p = pvalues |> map(|p| -log10(p))
let lambda_gc = median(neg_log_p) / 0.301
print("Genomic inflation factor (lambda_GC, approx): " + str(round(lambda_gc, 3)))
# ── 4. Identify Significant Hits ──
print("=== Identify Significant Hits ===")
let gw_significant = qc_pass
|> filter(|row| row.pvalue < 5e-8)
|> sort(|a, b| a.pvalue - b.pvalue)
let suggestive = qc_pass
|> filter(|row| row.pvalue >= 5e-8 && row.pvalue < 1e-5)
|> sort(|a, b| a.pvalue - b.pvalue)
print("Genome-wide significant (p < 5e-8): " + str(len(gw_significant)) + " SNPs")
print("Suggestive (5e-8 <= p < 1e-5): " + str(len(suggestive)) + " SNPs")
# ── 5. LD Clumping (proximity-based) ──
print("=== LD Clumping ===")
let clump_window = 500000
let sorted_hits = gw_significant |> to_records()
let lead_snps = reduce(sorted_hits, |leads, snp|
if all(leads, |l| l.chrom != snp.chrom || abs(l.pos - snp.pos) > clump_window)
then leads + [snp]
else leads, [])
print("Independent lead SNPs after clumping: " + str(len(lead_snps)))
# ── 6. Gene Annotation via Genomic Intervals ──
print("=== Gene Annotation via Genomic Intervals ===")
let gene_regions = table([
{name: "TCF7L2", chrom: "10", start: 114710009, end: 114927437},
{name: "PPARG", chrom: "3", start: 12328867, end: 12475855},
{name: "KCNJ11", chrom: "11", start: 17406795, end: 17410878},
{name: "SLC30A8", chrom: "8", start: 118170394, end: 118185088},
{name: "CDKAL1", chrom: "6", start: 20534687, end: 20784666},
{name: "CDKN2A/B", chrom: "9", start: 21967752, end: 22009312},
{name: "IGF2BP2", chrom: "3", start: 185345628, end: 185544082},
{name: "HHEX/IDE", chrom: "10", start: 94427522, end: 94476459},
{name: "FTO", chrom: "16", start: 53737875, end: 54155853},
{name: "KCNQ1", chrom: "11", start: 2466221, end: 2870340},
{name: "THADA", chrom: "2", start: 43505464, end: 43806925}
])
let gene_tree = interval_tree(gene_regions)
let annotated = lead_snps
|> map(|snp| {
let hits = query_overlaps(gene_tree, snp.chrom, snp.pos, snp.pos + 1)
let nearest_gene = if len(hits) > 0 then hits[0].name else "intergenic"
{rsid: snp.rsid, chrom: snp.chrom, pos: snp.pos, pvalue: snp.pvalue,
or: snp.or, beta: snp.beta, se: snp.se, nearest_gene: nearest_gene}
})
print("\nLead SNPs with gene annotations:")
annotated |> each(|snp|
print(snp.rsid + "\t" + snp.chrom + "\t" + str(snp.pos) + "\t" +
str(snp.pvalue) + "\t" + str(snp.or) + "\t" + snp.nearest_gene))
# ── 7-11. Database queries, forest plot, enrichment, report ──
let top_hit = annotated[0]
let gene_info = ncbi_gene("TCF7L2")
let ppi_network = string_network(["TCF7L2"], 9606)
let forest_data = annotated
|> map(|row| {
let ci_lower = exp(log(row.or) - 1.96 * row.se)
let ci_upper = exp(log(row.or) + 1.96 * row.se)
{label: row.nearest_gene, estimate: row.or, lower: ci_lower, upper: ci_upper}
})
|> table()
forest_plot(forest_data, {format: "svg"})
|> save_svg("output/t2d_forest.svg")
let gwas_genes = annotated |> map(|row| row.nearest_gene) |> unique
let tcf7l2_go = go_annotations("TCF7L2")
let annotated_table = table(annotated)
let report = "# T2D GWAS Analysis Report\n\n" +
"## Overview\n\nGWAS meta-analysis for Type 2 Diabetes risk loci.\n\n" +
"## Lead SNPs\n\n" + to_markdown(annotated_table) + "\n"
write_text("output/t2d_gwas_report.md", report)
print("\n=== Analysis complete ===")
Gut Microbiome (16S)
IBD patients vs healthy controls: Shannon/Simpson alpha diversity with t-tests, Bray-Curtis beta diversity matrix, differential abundance analysis, taxonomy and KEGG pathway lookups, violin plots and heatmaps.
# requires: internet connection (NCBI, KEGG APIs)
# Gut Microbiome Analysis: IBD Patients vs Healthy Controls
# 16S rRNA amplicon sequencing — V3-V4 region, paired-end 2x300bp
# ── 1. Sample Metadata ──────────────────────────────────────────
print("=== Sample Metadata ===")
let metadata = table(
sample_id: ["IBD01", "IBD02", "IBD03", "IBD04", "HC01", "HC02", "HC03", "HC04"],
condition: ["IBD", "IBD", "IBD", "IBD", "healthy", "healthy", "healthy", "healthy"],
age: [34, 52, 28, 45, 31, 48, 33, 41],
sex: ["F", "M", "F", "M", "M", "F", "M", "F"],
bmi: [22.1, 27.4, 19.8, 25.6, 24.3, 23.0, 26.1, 22.8]
)
# ── 2. ASV Abundance Table (raw counts) ────────────────────────
print("=== ASV Abundance Table ===")
let taxa = ["Bacteroides", "Faecalibacterium", "Prevotella", "Escherichia",
"Akkermansia", "Ruminococcus", "Bifidobacterium", "Clostridium",
"Lactobacillus", "Roseburia", "Enterococcus", "Fusobacterium"]
let counts = matrix(
[1842, 1523, 2104, 1677, 3210, 2987, 3455, 3102],
[ 312, 198, 245, 287, 2845, 3120, 2678, 2901],
[ 178, 203, 145, 167, 1456, 1234, 1567, 1389],
[2456, 2834, 2190, 2567, 234, 189, 276, 201],
[ 456, 378, 512, 423, 1023, 987, 1145, 1067],
[ 234, 189, 267, 213, 1678, 1543, 1789, 1623],
[ 567, 489, 623, 534, 1234, 1456, 1189, 1345],
[1234, 1456, 1089, 1345, 345, 278, 312, 298],
[ 145, 123, 178, 156, 567, 623, 534, 589],
[ 89, 67, 102, 78, 1567, 1423, 1689, 1534],
[ 890, 1023, 756, 912, 98, 67, 112, 89],
[ 678, 812, 589, 723, 34, 21, 45, 28]
)
let n_samples = 8
let n_taxa = len(taxa)
# ── 3. Relative Abundance Normalization ────────────────────────
print("=== Relative Abundance Normalization ===")
let col_totals = range(0, n_samples) |> map(|j|
range(0, n_taxa) |> map(|i| counts[i][j]) |> reduce(0, |a, b| a + b)
)
let rel_abundance = range(0, n_taxa) |> map(|i|
range(0, n_samples) |> map(|j| counts[i][j] / col_totals[j])
)
# ── 4. Alpha Diversity ─────────────────────────────────────────
print("=== Alpha Diversity ===")
let shannon = range(0, n_samples) |> map(|j|
let props = range(0, n_taxa) |> map(|i| rel_abundance[i][j])
let nonzero = props |> filter(|p| p > 0.0)
nonzero |> map(|p| -1.0 * p * log(p)) |> reduce(0.0, |a, b| a + b)
)
let simpson = range(0, n_samples) |> map(|j|
let sum_sq = range(0, n_taxa)
|> map(|i| pow(rel_abundance[i][j], 2))
|> reduce(0.0, |a, b| a + b)
1.0 - sum_sq
)
let diversity = table(
sample_id: metadata.sample_id, condition: metadata.condition,
shannon: shannon, simpson: simpson
)
print(diversity)
# ── 5. Alpha Diversity — Group Comparison ──────────────────────
print("=== Alpha Diversity Group Comparison ===")
let shannon_ibd = diversity |> filter(|r| r.condition == "IBD") |> select("shannon")
let shannon_healthy = diversity |> filter(|r| r.condition == "healthy") |> select("shannon")
let shannon_test = ttest(shannon_ibd, shannon_healthy)
print("Shannon IBD vs Healthy — t=" + shannon_test.t_statistic + " p=" + shannon_test.p_value)
# ── 6. Beta Diversity — Bray-Curtis ────────────────────────────
print("=== Beta Diversity ===")
let bray_curtis = range(0, n_samples) |> map(|j|
range(0, n_samples) |> map(|k|
let shared = range(0, n_taxa)
|> map(|i| if counts[i][j] < counts[i][k] then counts[i][j] else counts[i][k])
|> reduce(0, |a, b| a + b)
1.0 - (2.0 * shared) / (col_totals[j] + col_totals[k])
)
)
# ── 7. Differential Abundance ─────────────────────────────────
print("=== Differential Abundance ===")
let ibd_cols = [0, 1, 2, 3]
let healthy_cols = [4, 5, 6, 7]
let diff_results = range(0, n_taxa) |> map(|i|
let ibd_vals = ibd_cols |> map(|j| rel_abundance[i][j])
let healthy_vals = healthy_cols |> map(|j| rel_abundance[i][j])
let mean_ibd = mean(ibd_vals)
let mean_healthy = mean(healthy_vals)
let log2fc = log(mean_ibd / mean_healthy) / log(2.0)
let test = ttest(ibd_vals, healthy_vals)
{ taxon: taxa[i], log2fc: log2fc, p_value: test.p_value }
)
let diff_table = diff_results
|> sort(|a, b| a.p_value - b.p_value)
|> filter(|r| r.p_value < 0.05)
print("Differentially abundant taxa (p < 0.05):")
print(diff_table)
# ── 8-10. Database lookups, visualizations, report ─────────────
let faecal_info = ncbi_search("taxonomy", "Faecalibacterium prausnitzii")
let butyrate_pathway = kegg_find("pathway", "butanoate metabolism")
violin({
data: diversity, x: "condition", y: "shannon",
title: "Shannon Diversity — IBD vs Healthy Controls"
}) |> save_svg("shannon_violin.svg")
let report = "# 16S Metagenomics — IBD Gut Microbiome Study\n\n" +
"## Cohort\n\nN=8 (4 IBD, 4 healthy), V3-V4 16S rRNA\n\n" +
"## Alpha Diversity\n\nShannon: IBD significantly lower (p=" + shannon_test.p_value + ")\n\n" +
"## Differential Taxa\n\nIBD enriched: Escherichia, Clostridium, Fusobacterium\n" +
"IBD depleted: Faecalibacterium, Roseburia, Ruminococcus\n"
write_text("metagenomics_report.md", report)
print("\n=== Analysis complete ===")
Pharmacogenomics
Warfarin dosing from CYP2C9/VKORC1 genotypes: star-allele activity scores, metabolizer phenotype assignment, IWPC-like dose prediction, drug interaction lookups, population allele frequencies, and clinical decision support.
# requires: internet connection (NCBI, KEGG, Reactome APIs)
# Predicting drug response from patient genotypes — Warfarin dosing
#
# Warfarin: anticoagulant with narrow therapeutic window.
# CYP2C9 star alleles (*2, *3) reduce metabolism; VKORC1 -1639G>A
# lowers expression and increases warfarin sensitivity.
# ── 1. Patient cohort ────────────────────────────────────────────
print("=== Patient Cohort ===")
let patients = table(
id: ["PT-001","PT-002","PT-003","PT-004","PT-005","PT-006","PT-007","PT-008"],
age: [62, 45, 71, 58, 39, 67, 53, 74],
weight_kg: [78, 92, 64, 85, 70, 60, 105, 68],
cyp2c9: ["*1/*1","*1/*2","*1/*3","*2/*2","*2/*3","*3/*3","*1/*1","*1/*2"],
vkorc1: ["GG", "GA", "GA", "GG", "AA", "GA", "AA", "AA" ]
)
# ── 2. Pharmacogene lookups ──────────────────────────────────────
print("=== Pharmacogene Lookups ===")
let cyp2c9_gene = ncbi_gene("CYP2C9")
let vkorc1_gene = ncbi_gene("VKORC1")
print("CYP2C9 locus: chr" + cyp2c9_gene.chromosome)
print("VKORC1 locus: chr" + vkorc1_gene.chromosome)
# ── 3. Genotype to metabolizer phenotype ─────────────────────────
print("=== Genotype to Metabolizer Phenotype ===")
let star_score = { "*1": 1.0, "*2": 0.5, "*3": 0.0 }
let assign_phenotype = |score| {
if score >= 2.0 then "Normal"
else if score >= 1.0 then "Intermediate"
else "Poor"
}
patients = patients |> mutate("activity_score", |row| {
let alleles = split(row.cyp2c9, "/")
let a1 = replace(alleles[0], "*", "")
let a2 = replace(alleles[1], "*", "")
star_score["*" + a1] + star_score["*" + a2]
})
patients = patients |> mutate("metabolizer", |row| assign_phenotype(row.activity_score))
# ── 4. Predicted warfarin dose ───────────────────────────────────
print("=== Predicted Warfarin Dose ===")
let vkorc1_factor = { "GG": 1.0, "GA": 0.75, "AA": 0.5 }
patients = patients |> mutate("predicted_dose_mg", |row| {
let base = 5.0
let cyp_adj = row.activity_score / 2.0
let vk_adj = vkorc1_factor[row.vkorc1]
round(base * cyp_adj * vk_adj, 1)
})
patients |> select("id","cyp2c9","vkorc1","metabolizer","predicted_dose_mg") |> print()
# ── 5. Drug interaction lookups ──────────────────────────────────
print("=== Drug Interaction Lookups ===")
let warfarin_kegg = kegg_get("D00564")
let cyp_inhibitors = kegg_find("drug", "CYP2C9 inhibitor")
let cyp_pathways = reactome_pathways("CYP2C9")
print("Known CYP2C9 inhibitors found: " + len(cyp_inhibitors))
# ── 6-8. Variant effects, population frequencies, visualizations ─
let pop_freq = table(
population: ["European","African","East Asian","South Asian","Latino"],
cyp2c9_s2: [0.12, 0.03, 0.00, 0.04, 0.07],
cyp2c9_s3: [0.07, 0.01, 0.04, 0.10, 0.05],
vkorc1_a: [0.39, 0.11, 0.91, 0.52, 0.44]
)
pop_freq = pop_freq |> mutate("risk_index", |row|
round(row.vkorc1_a * (row.cyp2c9_s2 + row.cyp2c9_s3) * 100, 1))
pop_freq |> arrange("-risk_index") |> print()
violin({ data: patients, x: "metabolizer", y: "predicted_dose_mg",
title: "Warfarin predicted dose by CYP2C9 metabolizer status" })
|> save_svg("warfarin_dose_violin.svg")
lollipop({ data: table(
position: [144, 359, 360, 430],
label: ["*2 R144C","*3 I359L","D360E (rare)","heme-binding"],
impact: [0.5, 1.0, 0.3, 0.0]
), title: "CYP2C9 pharmacovariants", length: 490 })
|> save_svg("cyp2c9_lollipop.svg")
# ── 9. Clinical decision support summary ────────────────────────
let high_risk = patients |> filter(|row| row.predicted_dose_mg <= 1.9)
let normal = patients |> filter(|row| row.predicted_dose_mg >= 4.0)
print("High-risk patients (dose <= 1.9 mg/day): " + len(high_risk))
let report = "# Warfarin Pharmacogenomic Dosing Report\n\n"
+ "## Cohort Summary\n\n"
+ len(patients) + " patients genotyped for CYP2C9 and VKORC1.\n\n"
+ "## Dose Predictions\n\n"
+ "Range: " + min(patients.predicted_dose_mg) + " - "
+ max(patients.predicted_dose_mg) + " mg/day.\n\n"
+ "## High-Risk Patients\n\n"
+ len(high_risk) + " patients require sub-therapeutic starting dose.\n"
write_text("warfarin_pgx_report.md", report)
print("\n=== Analysis complete ===")
Single-Cell RNA-seq
Tumor microenvironment cell type identification: QC, log-CPM normalization, highly variable gene selection, UMAP and t-SNE embedding, Leiden clustering, marker-based cell type annotation, sparse matrix support.
# requires: internet connection (NCBI API)
# Tumor Microenvironment Cell Type Identification from scRNA-seq
# Simulated 10x Chromium dataset from a solid tumor biopsy.
# ── 1. Simulated Count Matrix (20 cells x 10 marker genes) ──
let counts = matrix([
# CD4 T cells (cells 0-2): CD3E+, CD4+
[82, 45, 1, 0, 0, 0, 3, 0, 0, 0],
[91, 52, 2, 1, 0, 0, 1, 0, 0, 0],
[76, 39, 0, 0, 1, 0, 2, 0, 1, 0],
# CD8 T cells (cells 3-5): CD3E+, CD8A+
[88, 2, 61, 0, 0, 0, 12, 0, 0, 0],
[79, 1, 55, 0, 0, 0, 9, 0, 0, 0],
[93, 0, 68, 1, 0, 0, 15, 0, 0, 0],
# B cells (cells 6-8): MS4A1+
[1, 0, 0, 74, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 88, 1, 0, 0, 0, 0, 0],
[2, 1, 0, 69, 0, 0, 0, 0, 1, 0],
# Monocytes (cells 9-11): CD14+, FCGR3A+
[0, 0, 0, 0, 65, 34, 0, 0, 0, 0],
[1, 0, 0, 0, 78, 41, 0, 0, 0, 0],
[0, 0, 0, 1, 71, 28, 1, 0, 0, 0],
# NK cells (cells 12-13): NKG7+, FCGR3A+
[3, 0, 0, 0, 0, 22, 95, 0, 0, 0],
[1, 0, 0, 0, 0, 18, 87, 0, 0, 0],
# Fibroblasts (cells 14-16): COL1A1+
[0, 0, 0, 0, 0, 0, 0, 112, 2, 1],
[0, 0, 0, 0, 1, 0, 0, 98, 1, 0],
[0, 0, 0, 0, 0, 0, 0, 105, 3, 2],
# Epithelial (cells 17-18): EPCAM+
[0, 0, 0, 0, 0, 0, 0, 1, 84, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 91, 1],
# Endothelial (cell 19): PECAM1+
[0, 0, 0, 0, 0, 0, 0, 2, 1, 76]
])
let genes = ["CD3E","CD4","CD8A","MS4A1","CD14","FCGR3A","NKG7","COL1A1","EPCAM","PECAM1"]
let n_cells = 20
let n_genes = 10
# ── 2. QC ──
let row_data = range(0, n_cells) |> map(|i| counts.row(i))
let total_umi = row_data |> map(|row| sum(row))
let genes_detected = row_data |> map(|row| row |> filter(|x| x > 0.0) |> len)
print("=== QC Summary ===")
print("Cells passing QC:", len(range(0, n_cells) |> filter(|i| genes_detected[i] >= 2)), "/", n_cells)
# ── 3. Normalization (log-CPM) ──
let norm_rows = range(0, n_cells) |> map(|i|
let row = counts.row(i)
let total = total_umi[i]
row |> map(|x| log(1.0 + x / total * 10000.0))
)
let norm = matrix(norm_rows)
# ── 4. HVG Selection ──
let gene_means = range(0, n_genes) |> map(|j|
range(0, n_cells) |> map(|i| norm_rows[i][j]) |> mean
)
let gene_vars = range(0, n_genes) |> map(|j|
let vals = range(0, n_cells) |> map(|i| norm_rows[i][j])
let mu = mean(vals)
vals |> map(|x| (x - mu) * (x - mu)) |> mean
)
let dispersion = range(0, n_genes) |> map(|j| gene_vars[j] / (gene_means[j] + 0.001))
# ── 5. UMAP + Clustering ──
let umap_coords = umap(norm, 2)
let adj_data = range(0, n_cells) |> map(|i|
range(0, n_cells) |> map(|j|
if i == j then 0.0
else {
let ri = norm_rows[i]
let rj = norm_rows[j]
let dist = range(0, n_genes) |> map(|k| (ri[k] - rj[k]) * (ri[k] - rj[k])) |> sum |> sqrt()
if dist < 5.0 then 1.0 else 0.0
}
)
)
let adj = matrix(adj_data)
let clusters = leiden(adj)
# ── 6. Cell Type Annotation ──
let cell_types = range(0, n_cells) |> map(|i|
let row = norm_rows[i]
if row[0] > 3.0 && row[1] > 3.0 then "CD4_T"
else if row[0] > 3.0 && row[2] > 3.0 then "CD8_T"
else if row[3] > 3.0 then "B_cell"
else if row[4] > 3.0 then "Monocyte"
else if row[6] > 3.0 then "NK_cell"
else if row[7] > 3.0 then "Fibroblast"
else if row[8] > 3.0 then "Epithelial"
else if row[9] > 3.0 then "Endothelial"
else "Unknown"
)
print("\n=== Cell Type Annotations ===")
range(0, n_cells) |> each(|i|
print(" Cell", i, "-> cluster", clusters[i], "-> type:", cell_types[i])
)
# ── 7. Database Validation ──
let cd3e_info = ncbi_gene("CD3E")
print("\n=== CD3E (T cell marker) ===")
print("Description:", cd3e_info.description)
# ── 8. Sparse Matrix Demo ──
let entries = []
range(0, n_cells) |> each(|i|
range(0, n_genes) |> each(|j|
let v = counts.row(i)[j]
if v > 0.0 then push(entries, {row: i, col: j, val: v})
)
)
let sp = sparse_matrix(n_cells, n_genes, entries)
print("\n=== Sparse Matrix ===")
print("Shape:", n_cells, "x", n_genes, " nnz:", nnz(sp))
print("\n=== Analysis Complete ===")
print("Identified", len(clusters |> unique), "clusters and", len(cell_types |> unique), "cell types")
CRISPR Knockout Screen
Vemurafenib resistance screen in melanoma: sgRNA library, RPM normalization, log2 fold changes, gene-level aggregation, statistical testing with BH correction, hit identification, database annotation, and waterfall rank plot.
# requires: internet connection (NCBI, Reactome APIs)
# CRISPR Knockout Screen for Drug Resistance Genes in Melanoma
# Vemurafenib (BRAF inhibitor) resistance screen in A375 cells.
# ── 1. sgRNA Library Definition ──
print("=== sgRNA Library Definition ===")
let sgrna_library = table(
sgRNA: ["BRAF_g1","BRAF_g2","NRAS_g1","MAP2K1_g1","MAP2K1_g2",
"PTEN_g1","PTEN_g2","RB1_g1","TP53_g1","TP53_g2",
"CDKN2A_g1","NF1_g1","NF1_g2","APC_g1","STK11_g1",
"NT_ctrl1","NT_ctrl2","NT_ctrl3"],
gene: ["BRAF","BRAF","NRAS","MAP2K1","MAP2K1",
"PTEN","PTEN","RB1","TP53","TP53",
"CDKN2A","NF1","NF1","APC","STK11",
"NT_ctrl","NT_ctrl","NT_ctrl"]
)
# ── 2. Simulated Read Counts ──
let counts = table(
sgRNA: sgrna_library.sgRNA,
gene: sgrna_library.gene,
plasmid_count: [5200, 4800, 4950, 5100, 5300, 4700, 5050, 4900, 5150, 4850,
5000, 4750, 5100, 4600, 4950, 5000, 5100, 4900],
treated_count: [320, 280, 48200, 1200, 1350, 52000, 47500, 890, 41000, 38500,
150, 62000, 58000, 4200, 3800, 4800, 5200, 4950],
untreated_count: [4100, 3900, 5200, 4800, 5000, 4500, 4700, 3200, 5300, 5100,
3000, 4600, 4900, 4400, 4200, 5100, 4800, 5000]
)
# ── 3-4. QC and Normalize to RPM ──
let total_plasmid = sum(counts.plasmid_count)
let total_treated = sum(counts.treated_count)
let total_untreated = sum(counts.untreated_count)
let norm = counts
|> mutate("plasmid_rpm", |r| r.plasmid_count / total_plasmid * 1e6)
|> mutate("treated_rpm", |r| r.treated_count / total_treated * 1e6)
|> mutate("untreated_rpm", |r| r.untreated_count / total_untreated * 1e6)
# ── 5. Log2 Fold Changes ──
let lfc = norm
|> mutate("lfc_treat_vs_plasmid", |r| log2((r.treated_rpm + 0.5) / (r.plasmid_rpm + 0.5)))
|> mutate("lfc_untreat_vs_plasmid", |r| log2((r.untreated_rpm + 0.5) / (r.plasmid_rpm + 0.5)))
# ── 6. Gene-Level Aggregation ──
print("=== Gene-Level Aggregation ===")
let gene_names = lfc.gene |> unique
let gene_stats = table(gene_names |> map(|g| {
let guides = lfc |> filter(|r| r.gene == g)
{gene: g, n_guides: len(guides),
mean_lfc: mean(guides.lfc_treat_vs_plasmid),
median_lfc: median(guides.lfc_treat_vs_plasmid),
fitness_lfc: mean(guides.lfc_untreat_vs_plasmid)}
}))
gene_stats = gene_stats |> sort(|a, b| b.median_lfc - a.median_lfc)
# ── 7. Statistical Testing ──
let ctrl_lfcs = lfc |> filter(|r| r.gene == "NT_ctrl") |> select("lfc_treat_vs_plasmid")
let gene_pvals = gene_stats
|> filter(|r| r.gene != "NT_ctrl")
|> mutate("p_value", |r| {
let guide_lfcs = lfc |> filter(|g| g.gene == r.gene) |> select("lfc_treat_vs_plasmid")
ttest(guide_lfcs, ctrl_lfcs)
})
let adj_pvals = p_adjust(gene_pvals.p_value, "bh")
let gene_results = gene_pvals |> mutate("p_adj", |r, i| adj_pvals[i])
# ── 8. Hit Identification ──
print("=== Hit Identification ===")
let resistance_hits = gene_results
|> filter(|r| r.median_lfc > 1.0 && r.p_adj < 0.05)
|> sort(|a, b| b.median_lfc - a.median_lfc)
let sensitizer_hits = gene_results
|> filter(|r| r.median_lfc < -1.0 && r.p_adj < 0.05)
print("Resistance genes:")
resistance_hits |> select("gene", "median_lfc", "p_adj") |> print()
# ── 9-11. Database annotation, visualizations, report ──
let braf_info = ncbi_gene("BRAF")
let braf_paths = reactome_pathways("BRAF")
volcano({
data: gene_results |> mutate("neg_log10p", |r| -log10(r.p_adj)),
x: "median_lfc", y: "neg_log10p", label: "gene",
title: "CRISPR Screen: Vemurafenib Resistance"
}) |> save_svg("volcano_crispr_screen.svg")
let report_text = [
"# CRISPR Screen Report: Vemurafenib Resistance in A375 Melanoma",
"",
"## Resistance Hits (LFC > 1, padj < 0.05):",
resistance_hits |> select("gene", "median_lfc", "p_adj") |> to_markdown(),
"",
"## Top Annotation: BRAF — " + braf_info.description
] |> join("\n")
write_text("crispr_screen_report.md", report_text)
print("\n=== Analysis complete ===")
Epigenomics / ChIP-seq
H3K27ac super-enhancer analysis in AML: interval tree construction, overlap queries, nearest-gene assignment, genome coverage, super-enhancer identification via rank-signal inflection, differential binding, motif scanning, and genome tracks.
# requires: internet connection (NCBI, STRING APIs)
# H3K27ac ChIP-seq Identifies Super-Enhancers in Leukemia
# AML patient blasts — hg38 coordinates near known leukemia loci.
# ── 1. Define H3K27ac ChIP-seq Peaks ──────────────────────────
print("=== Define H3K27ac ChIP-seq Peaks ===")
let peaks = table([
{id: "pk01", iv: interval("chr8", 127735430, 127742600), signal: 48.2, gene_near: "MYC"},
{id: "pk02", iv: interval("chr8", 127745100, 127749800), signal: 35.7, gene_near: "MYC"},
{id: "pk03", iv: interval("chr8", 127753000, 127758200), signal: 29.4, gene_near: "MYC"},
{id: "pk04", iv: interval("chr18", 63123340, 63130500), signal: 22.1, gene_near: "BCL2"},
{id: "pk05", iv: interval("chr18", 63134200, 63139000), signal: 18.6, gene_near: "BCL2"},
{id: "pk06", iv: interval("chr21", 34787800, 34795600), signal: 41.3, gene_near: "RUNX1"},
{id: "pk07", iv: interval("chr21", 34799100, 34806400), signal: 37.9, gene_near: "RUNX1"},
{id: "pk08", iv: interval("chr12", 11900150, 11907300), signal: 52.6, gene_near: "ETV6"},
{id: "pk09", iv: interval("chr13", 28577400, 28584900), signal: 44.8, gene_near: "FLT3"},
{id: "pk10", iv: interval("chr1", 23690800, 23697200), signal: 15.3, gene_near: "HMGN2"},
{id: "pk11", iv: interval("chr11", 32410500, 32418200), signal: 31.0, gene_near: "WT1"},
{id: "pk12", iv: interval("chr17", 29160700, 29167900), signal: 12.8, gene_near: "NF1"},
{id: "pk13", iv: interval("chr2", 25455800, 25462100), signal: 9.4, gene_near: "DNMT3A"},
{id: "pk14", iv: interval("chr1", 205098300, 205105400), signal: 26.5, gene_near: "NRAS"},
{id: "pk15", iv: interval("chr17", 76734100, 76741800), signal: 33.1, gene_near: "CEBPA"}
])
print("Total peaks: " + str(peaks |> len))
# ── 2. Gene Annotations (TSS +/- 2kb) ─────────────────────────
let promoters = table([
{gene: "MYC", iv: interval("chr8", 127736587, 127740587)},
{gene: "BCL2", iv: interval("chr18", 63121346, 63125346)},
{gene: "RUNX1", iv: interval("chr21", 34787800, 34791800)},
{gene: "ETV6", iv: interval("chr12", 11900496, 11904496)},
{gene: "FLT3", iv: interval("chr13", 28577411, 28581411)}
])
# ── 3. Interval Operations ────────────────────────────────────
print("=== Interval Operations ===")
let peak_ivs = table(peaks |> map(|p| {
{chrom: p.iv.chrom, start: p.iv.start, end: p.iv.end, id: p.id, signal: p.signal}
}))
let promo_ivs = table(promoters |> map(|p| {
{chrom: p.iv.chrom, start: p.iv.start, end: p.iv.end, gene: p.gene}
}))
let peak_tree = peak_ivs |> interval_tree()
let promo_tree = promo_ivs |> interval_tree()
let promoter_peaks = promoters |> map(|p| {
query_overlaps(peak_tree, p.iv.chrom, p.iv.start, p.iv.end)
}) |> flatten()
print("Peaks at promoters: " + str(promoter_peaks |> len))
let nearest_genes = peaks |> map(|p| {
let ng_table = query_nearest(promo_tree, p.iv.chrom, p.iv.start)
let ng = ng_table[0]
{peak: p.id, gene: ng.gene, distance: abs(p.iv.start - ng.start)}
})
let total_cov = coverage(peak_tree)
print("Total H3K27ac coverage: " + str(total_cov) + " bp")
# ── 4. Super-Enhancer Identification ──────────────────────────
print("=== Super-Enhancer Identification ===")
let signals_sorted = peaks |> sort(|a, b| b.signal - a.signal) |> map(|p| p.signal)
let threshold = signals_sorted |> median()
let super_enhancers = peaks |> filter(|p| p.signal >= threshold)
let typical_enhancers = peaks |> filter(|p| p.signal < threshold)
print("Super-enhancers: " + str(super_enhancers |> len))
super_enhancers |> each(|se| {
print(" SE: " + se.gene_near + " — signal " + str(se.signal))
})
# ── 5. Differential Binding ──────────────────────────────────
let diff = peaks |> mutate("log2fc", |row| log2(row.signal / 15.0))
let gained = diff |> filter(|d| d.log2fc > 1.0)
# ── 6. Motif Analysis ────────────────────────────────────────
print("=== Motif Analysis ===")
let se_seq = dna"ATCGGTGTGGTCAAGGAATCCTGTGGTAAGGAATAGCCTAACTGATCGTGTGGTCCGGAACCTA"
let runx1_hits = find_motif(se_seq, "TGTGGT")
let ets_hits = find_motif(se_seq, "GGAA")
let myb_hits = find_motif(se_seq, "TAACTG")
print("RUNX1 (TGTGGT): " + str(len(runx1_hits)) + " sites")
print("ETS (GGAA): " + str(len(ets_hits)) + " sites")
print("GC content: " + str(gc_content(se_seq)) + "%")
# ── 7-9. Database queries, visualizations, report ────────────
let myc_info = ncbi_gene("MYC")
let myc_network = string_network(["MYC"], 9606)
let track = genome_track({data: peaks |> filter(|p| p.gene_near == "MYC"),
title: "H3K27ac at MYC locus (chr8)"})
save_svg(track, "output/myc_h3k27ac_track.svg")
let report = "# Super-Enhancer Analysis — H3K27ac ChIP-seq in AML\n\n" +
"## Overview\n\n" + str(peaks |> len) + " peaks, " +
str(super_enhancers |> len) + " super-enhancers identified.\n\n" +
"## Key SE targets: MYC, ETV6, FLT3, RUNX1\n"
write_text("output/se_report.md", report)
print("\n=== Analysis complete ===")
Phylogenomics
SARS-CoV-2 spike protein evolution: Hamming distance matrices, neighbor-joining trees, MinHash sketching, k-mer composition, mutation mapping, simplified dN/dS estimation, sequence logos, and lollipop mutation plots.
# requires: internet connection (UniProt, PDB APIs)
# Molecular Evolution of SARS-CoV-2 Spike Protein Across Variants
# Phylogenomic analysis of spike protein RBD region across variants of concern.
# ── 1. Define Spike Protein Sequences ──
print("=== Define Spike Protein Sequences ===")
let wuhan = protein"MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTR"
let alpha = protein"MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTK"
let beta = protein"MFVFLVLLPLVSSQCVNLKTRTQLPPAYTNSFTR"
let delta = protein"MFVFLVLLPLVSSQCVNLRTRTQLPPAYTNSFTG"
let omi_ba1 = protein"MFVFLVLLPLVSSQCVNLKTREQLPPAYTNSFEK"
let omi_ba5 = protein"MFVFLVLLPLVSSQCVNLKTREQLPPAYENSFEK"
let xbb15 = protein"MFVFLVLLPLVSSQCVNLKTREQLPPAYEISFEK"
let variants = table({
name: ["Wuhan", "Alpha", "Beta", "Delta", "Omicron_BA1", "Omicron_BA5", "XBB15"],
seq: [wuhan, alpha, beta, delta, omi_ba1, omi_ba5, xbb15],
lineage: ["WT", "B.1.1.7", "B.1.351", "B.1.617.2", "BA.1", "BA.5", "XBB.1.5"]
})
let names = variants.name
let seqs = variants.seq
# ── 2. Pairwise Distance Matrix ──
print("=== Pairwise Alignment and Distance Matrix ===")
let dist = names |> map(|a|
names |> map(|b|
hamming_distance(seqs[index_of(names, a)], seqs[index_of(names, b)])
)
)
let dist_matrix = matrix(dist)
print("Distance matrix:")
print(dist_matrix)
# ── 3. Phylogenetic Tree ──
print("=== Phylogenetic Tree Construction ===")
let nj_tree = neighbor_joining(dist_matrix)
print("Newick tree: " + nj_tree)
phylo_tree({data: nj_tree, title: "SARS-CoV-2 Spike NJ Tree (Hamming)"})
|> save_svg("spike_nj_tree.svg")
# ── 4. Mutation Analysis ──
print("=== Sequence Analysis ===")
let mutations = seqs |> map(|s|
range(0, len(wuhan)) |> filter(|i| s[i] != wuhan[i])
)
let mutation_table = table({
variant: names,
num_mutations: mutations |> map(|m| len(m)),
positions: mutations |> map(|m| m |> map(str) |> join(","))
})
print(mutation_table)
# ── 5. MinHash Sketching ──
print("=== Sequence Similarity via MinHash ===")
let sketches = seqs |> map(|s| sketch(s))
let sketch_dists = names |> map(|a|
names |> map(|b|
sketch_dist(sketches[index_of(names, a)], sketches[index_of(names, b)])
)
)
let sketch_tree = neighbor_joining(matrix(sketch_dists))
# ── 6. Functional Domain + Database Lookups ──
let spike_uniprot = uniprot_entry("P0DTC2")
print("UniProt: " + spike_uniprot.name + " (" + spike_uniprot.sequence_length |> str + " aa)")
let spike_pdb = pdb_entry("6VXX")
print("PDB 6VXX: " + spike_pdb.title)
# ── 7. Evolutionary Rate (dN/dS) ──
print("=== Evolutionary Rate Analysis ===")
let ref_len = len(wuhan) |> float
let dn_values = mutations |> map(|m| len(m) |> float / ref_len)
let ds_estimate = 0.02
let dnds = dn_values |> map(|dn| if dn == 0.0 then 0.0 else dn / ds_estimate)
let rate_table = table({
variant: names,
dN: dn_values |> map(|v| round(v, 4)),
dN_dS: dnds |> map(|v| round(v, 2))
})
print(rate_table)
# ── 8. Visualizations ──
heatmap({data: dist_matrix, title: "Spike Pairwise Hamming Distances"})
|> save_svg("spike_distance_heatmap.svg")
lollipop({data: table({
position: [501, 484, 452, 478, 614, 681, 950, 969],
label: ["N501Y", "E484K", "L452R", "T478K", "D614G", "P681H", "D950N", "N969K"],
count: [5, 3, 2, 2, 7, 5, 2, 2]
}), title: "Key Spike Mutations Across Variants", length: 1273})
|> save_svg("spike_mutation_lollipop.svg")
let report = "# SARS-CoV-2 Spike Phylogenomics\n\n"
+ "## Variants Analyzed\n" + len(names) |> str + " lineages from Wuhan to XBB.1.5\n\n"
+ "## Key Finding\nProgressive antigenic drift concentrated in receptor-binding domain\n"
write_text("spike_phylogenomics_report.md", report)
print("\n=== Analysis complete ===")
Clinical WES (Rare Disease Trio)
Rare disease diagnosis from trio whole-exome sequencing: de novo, autosomal recessive, compound heterozygous, and X-linked inheritance filtering, ACMG evidence classification, gene-disease lookup via ClinVar/UniProt/STRING, phenotype matching, circos and lollipop visualizations, and a clinical diagnostic report.
# requires: internet connection (NCBI, UniProt, STRING APIs)
# Rare Disease Diagnosis from Trio Whole-Exome Sequencing
# 3-year-old male with developmental delay, seizures, microcephaly.
# Trio WES: proband + unaffected parents.
# ── 1. Trio Variant Calls ──────────────────────────────────────
print("=== Trio Variant Calls ===")
let trio = table([
{chrom: "chrX", pos: 153296777, ref: "C", alt: "T", gene: "MECP2", consequence: "nonsense", gt_child: "0/1", gt_mother: "0/0", gt_father: "0/0", quality: 892.0, allele_freq: 0.0},
{chrom: "chr2", pos: 166845670, ref: "G", alt: "A", gene: "SCN1A", consequence: "missense", gt_child: "0/1", gt_mother: "0/0", gt_father: "0/0", quality: 743.0, allele_freq: 0.00002},
{chrom: "chr7", pos: 114086327, ref: "C", alt: "T", gene: "FOXP2", consequence: "missense", gt_child: "0/1", gt_mother: "0/1", gt_father: "0/0", quality: 512.0, allele_freq: 0.003},
{chrom: "chr21",pos: 37766822, ref: "G", alt: "A", gene: "DYRK1A", consequence: "frameshift", gt_child: "0/1", gt_mother: "0/0", gt_father: "0/0", quality: 678.0, allele_freq: 0.0},
{chrom: "chr9", pos: 140101204, ref: "T", alt: "C", gene: "EHMT1", consequence: "missense", gt_child: "0/1", gt_mother: "0/0", gt_father: "0/0", quality: 415.0, allele_freq: 0.0001},
{chrom: "chr12",pos: 49420834, ref: "A", alt: "G", gene: "KMT2D", consequence: "synonymous", gt_child: "0/1", gt_mother: "0/1", gt_father: "0/0", quality: 310.0, allele_freq: 0.08},
{chrom: "chr5", pos: 177558478, ref: "G", alt: "T", gene: "NSD1", consequence: "missense", gt_child: "0/1", gt_mother: "0/1", gt_father: "0/0", quality: 289.0, allele_freq: 0.02},
{chrom: "chr16",pos: 89805520, ref: "C", alt: "A", gene: "ANKRD11",consequence: "missense", gt_child: "1/1", gt_mother: "0/1", gt_father: "0/1", quality: 624.0, allele_freq: 0.0003},
{chrom: "chr17",pos: 29527482, ref: "GA",alt: "G", gene: "NF1", consequence: "frameshift", gt_child: "0/1", gt_mother: "0/1", gt_father: "0/0", quality: 567.0, allele_freq: 0.0},
{chrom: "chr17",pos: 29556789, ref: "T", alt: "C", gene: "NF1", consequence: "missense", gt_child: "0/1", gt_mother: "0/0", gt_father: "0/1", quality: 498.0, allele_freq: 0.00005},
{chrom: "chr1", pos: 155205634, ref: "G", alt: "A", gene: "ASH1L", consequence: "intronic", gt_child: "0/1", gt_mother: "0/1", gt_father: "0/0", quality: 145.0, allele_freq: 0.04},
{chrom: "chr2", pos: 166930050, ref: "T", alt: "G", gene: "SCN1A", consequence: "intronic", gt_child: "0/1", gt_mother: "0/0", gt_father: "0/1", quality: 201.0, allele_freq: 0.12},
{chrom: "chr11",pos: 66082000, ref: "A", alt: "C", gene: "ALG9", consequence: "synonymous", gt_child: "0/1", gt_mother: "0/0", gt_father: "0/1", quality: 380.0, allele_freq: 0.05},
{chrom: "chr15",pos: 75012899, ref: "C", alt: "T", gene: "UBE3A", consequence: "missense", gt_child: "0/1", gt_mother: "0/1", gt_father: "0/0", quality: 355.0, allele_freq: 0.001},
{chrom: "chrX", pos: 153297200, ref: "G", alt: "C", gene: "MECP2", consequence: "missense", gt_child: "0/1", gt_mother: "0/1", gt_father: "0/0", quality: 410.0, allele_freq: 0.0}
])
print("=== Clinical Trio WES Analysis ===")
print("Proband: 3-year-old male | Phenotype: developmental delay, seizures, microcephaly")
print("Total exonic/splice variants in trio: " + str(trio |> len))
# ── 2. Inheritance Pattern Filtering ───────────────────────────
print("=== Inheritance Pattern Filtering ===")
let de_novo = trio |> filter(|v| v.gt_child == "0/1" && v.gt_mother == "0/0" && v.gt_father == "0/0")
print("\nDe novo variants: " + str(de_novo |> len))
de_novo |> each(|v| print(" " + v.gene + " " + v.chrom + ":" + str(v.pos) + " " + v.consequence))
let ar_hom = trio |> filter(|v| v.gt_child == "1/1" && v.gt_mother == "0/1" && v.gt_father == "0/1")
print("\nAutosomal recessive (homozygous): " + str(ar_hom |> len))
let compound_het = trio |> filter(|v| v.gene == "NF1" && v.gt_child == "0/1")
print("\nCompound heterozygous candidates (NF1):")
compound_het |> each(|v| {
let parent = if v.gt_mother == "0/1" then "maternal" else "paternal"
print(" " + v.consequence + " at " + str(v.pos) + " (" + parent + ")")
})
let x_linked = trio |> filter(|v| v.chrom == "chrX" && v.gt_child == "0/1")
# ── 3. Variant Prioritization ─────────────────────────────────
print("=== Variant Prioritization ===")
let rare = trio |> filter(|v| v.allele_freq <= 0.01)
let protein_altering = ["missense", "nonsense", "frameshift", "splice"]
let functional = rare |> filter(|v| protein_altering |> contains(v.consequence))
let scored = functional |> mutate("cadd_score", |v| {
if v.consequence == "nonsense" then 38.0
else if v.consequence == "frameshift" then 35.0
else if v.consequence == "splice" then 28.0
else if v.allele_freq == 0.0 then 26.0
else 22.0
})
let ranked = scored |> sort(|a, b| b.cadd_score - a.cadd_score)
# ── 4. Gene-Disease Association Lookup ─────────────────────────
print("=== Gene-Disease Association Lookup ===")
let mecp2_gene = ncbi_gene("MECP2")
print("MECP2: " + mecp2_gene.description)
let mecp2_clinvar = ncbi_search("clinvar", "MECP2[gene] AND pathogenic[clinsig]")
print("MECP2 pathogenic ClinVar records: " + str(len(mecp2_clinvar)))
let scn1a_gene = ncbi_gene("SCN1A")
print("SCN1A: " + scn1a_gene.description + " — Dravet syndrome gene")
let mecp2_protein = uniprot_entry("P51608")
print("MECP2 protein: " + mecp2_protein.name + " (" + str(mecp2_protein.sequence_length) + " aa)")
# ── 5. ACMG Evidence Classification ───────────────────────────
print("=== ACMG Evidence Classification ===")
let acmg_classified = ranked |> mutate("acmg_codes", |v| {
let codes = []
if (v.consequence == "nonsense" || v.consequence == "frameshift") && v.gene == "MECP2" then
codes = codes + ["PVS1"]
if v.gt_mother == "0/0" && v.gt_father == "0/0" then
codes = codes + ["PS2"]
if v.allele_freq <= 0.0001 then
codes = codes + ["PM2"]
if v.cadd_score >= 25.0 then
codes = codes + ["PP3"]
codes
})
let acmg_final = acmg_classified |> mutate("classification", |v| {
let codes = v.acmg_codes
let has_pvs1 = codes |> contains("PVS1")
let has_ps2 = codes |> contains("PS2")
let has_pm2 = codes |> contains("PM2")
let n = len(codes)
if has_pvs1 && has_ps2 then "Pathogenic"
else if has_pvs1 && has_pm2 then "Likely Pathogenic"
else if has_ps2 && has_pm2 && n >= 3 then "Likely Pathogenic"
else if n >= 2 then "VUS"
else "Benign"
})
acmg_final |> each(|v| {
let codes_str = v.acmg_codes |> join(", ")
print(" " + v.gene + " " + v.consequence + " → " + v.classification + " [" + codes_str + "]")
})
# ── 6. Protein Impact + Phenotype Matching ─────────────────────
let mecp2_network = string_network(["MECP2"], 9606)
print("MECP2 protein interactors: " + str(len(mecp2_network)))
let patient_phenotypes = ["developmental delay", "seizures", "microcephaly"]
let gene_phenotypes = table([
{gene: "MECP2", disease: "Rett syndrome", phenotypes: ["developmental delay", "seizures", "microcephaly", "stereotypies"]},
{gene: "SCN1A", disease: "Dravet syndrome", phenotypes: ["seizures", "developmental delay", "ataxia"]},
{gene: "DYRK1A", disease: "DYRK1A-related ID", phenotypes: ["developmental delay", "microcephaly", "feeding difficulties"]}
])
gene_phenotypes |> each(|g| {
let overlap = g.phenotypes |> filter(|p| patient_phenotypes |> contains(p))
print(" " + g.gene + " (" + g.disease + "): " + str(len(overlap)) + "/3 match")
})
# ── 7. Visualizations ─────────────────────────────────────────
let circos_svg = circos({data: functional, title: "Trio WES — Rare Protein-Altering Variants"})
save_svg(circos_svg, "output/trio_wes_circos.svg")
let lollipop_svg = lollipop({data: table([
{position: 168, label: "R168*", impact: 1.0},
{position: 294, label: "R294X", impact: 0.9},
{position: 380, label: "P380L", impact: 0.6}
]), title: "MECP2 Variant Positions", length: 498})
save_svg(lollipop_svg, "output/mecp2_lollipop.svg")
let venn_svg = venn({
data: {
"De novo": de_novo |> map(|v| v.gene),
"AR homozygous": ar_hom |> map(|v| v.gene),
"X-linked": x_linked |> map(|v| v.gene)
},
title: "Inheritance Pattern Distribution"
})
save_svg(venn_svg, "output/inheritance_venn.svg")
# ── 8. Clinical Report ────────────────────────────────────────
print("=== Clinical Report ===")
let pathogenic = acmg_final |> filter(|v| v.classification == "Pathogenic" || v.classification == "Likely Pathogenic")
let report = "# Clinical WES Report — Trio Analysis for Developmental Delay\n\n" +
"## Patient Information\n\n" +
"Proband: 3-year-old male. Referral: undiagnosed developmental delay, seizures, microcephaly.\n\n" +
"## Primary Finding\n\n" +
"MECP2 c.502C>T p.(Arg168*) — de novo nonsense variant. Classified as PATHOGENIC " +
"(PVS1 + PS2 + PM2 + PP3). Absent from gnomAD. Consistent with Rett syndrome.\n\n" +
"## Recommendations\n\n" +
"1. Confirm MECP2 variant by Sanger sequencing.\n" +
"2. Refer to clinical genetics for Rett syndrome evaluation.\n" +
"3. Consider X-inactivation studies in mother.\n"
write_text("output/clinical_wes_report.md", report)
print("Report written to output/clinical_wes_report.md")
print("\n=== Analysis complete ===")