Variant Analysis
Learn how to read VCF files, filter variants by quality and functional impact, annotate variants with gene information from Ensembl, and generate summary statistics and plots.
What you will learn
- Reading and parsing VCF files
- Filtering variants by FILTER, QUAL, and INFO fields
- Annotating variants with Ensembl gene information
- Counting variants per chromosome and type
- Generating summary tables and a Manhattan plot
bl run examples/tutorials/variant-analysis.bl
Sample data: BioLang includes a VCF file at
examples/sample-data/calls.vcf with 12 variants across 5 chromosomes.
You can use it for this tutorial or substitute your own VCF.
Step 1 — Reading a VCF File
BioLang has a built-in VCF parser that understands the full VCF 4.3 specification, including INFO, FORMAT, and genotype fields.
# requires: examples/sample-data/calls.vcf in working directory
# variants.bio — VCF variant analysis
let variants = vcf("examples/sample-data/calls.vcf") |> collect()
# Total variant count
print(f"Total variants: {len(variants)}")
# Inspect a single variant
let v = head(variants, 1) |> first()
print(f"Chrom: {v.chrom}")
print(f"Pos: {v.pos}")
print(f"ID: {v.id}")
print(f"Ref: {v.ref}")
print(f"Alt: {v.alt}")
print(f"Qual: {v.quality}")
print(f"Filter: {v.filter}")
# Access INFO fields (info is a Record)
print(f"DP: {v.info.DP}")
print(f"AF: {v.info.AF}")
Step 2 — Filtering Variants
Apply standard quality filters to keep only reliable variant calls.
# Keep only PASS variants with QUAL >= 30
let passed = variants
|> filter(|v| v.filter == "PASS" and v.quality >= 30.0)
print(f"After PASS + QUAL>=30: {len(passed)} variants")
# Further filter by depth — use |> into for left-to-right binding
passed
|> filter(|v| v.info.DP >= 10)
|> into depth_filtered
print(f"After DP>=10: {len(depth_filtered)} variants")
# Filter by allele frequency (for population data)
depth_filtered |> filter(|v| v.info.AF >= 0.01) |> into common
depth_filtered |> filter(|v| v.info.AF < 0.01) |> into rare
print(f"Common variants (AF>=1%): {len(common)}")
print(f"Rare variants (AF<1%): {len(rare)}")
Step 3 — Classifying Variant Types
# Use built-in variant classification and Ts/Tv ratio
# Each variant has: variant_type, is_snp, is_indel, is_transition, is_transversion
# Count variant types using value_counts on a table
passed
|> map(|v| {chrom: v.chrom, pos: v.pos, vtype: v.variant_type})
|> to_table()
|> into type_table
value_counts(type_table, "vtype") |> into type_counts
print("\n=== Variant Types ===")
print(type_counts)
# Compute transition/transversion ratio directly
let ti_tv = tstv_ratio(passed)
print(f"\nTi/Tv ratio: {round(ti_tv, 3)}")
Step 4 — Variants Per Chromosome
# Count variants per chromosome
let chrom_table = passed
|> map(|v| {chromosome: v.chrom, pos: v.pos})
|> to_table()
value_counts(chrom_table, "chromosome")
|> rename("value", "chromosome")
|> arrange("count", "desc")
|> into chrom_counts
print("\n=== Variants Per Chromosome ===")
print(chrom_counts)
# Density: variants per megabase
let chrom_lengths = {
"chr1": 248956422, "chr2": 242193529, "chr3": 198295559,
"chr4": 190214555, "chr5": 181538259, "chr6": 170805979,
# ... (abbreviated for clarity)
"chrX": 156040895, "chrY": 57227415,
}
let counts_list = col(chrom_counts, "count")
let chroms_list = col(chrom_counts, "chromosome")
let density = mutate(chrom_counts, "per_mb", |row| round(row.count / 248.0, 2))
print(density)
Step 5 — Annotating with Ensembl VEP
Use ensembl_vep() to predict variant consequences via the Ensembl
Variant Effect Predictor. Pass variants in HGVS notation.
# requires: internet connection
# Annotate variants using Ensembl VEP
# ensembl_vep takes HGVS notation: "chrom:g.posRef>Alt"
let annotated = passed |> map(|v| {
let hgvs = f"{v.chrom}:g.{v.pos}{v.ref}>{v.alt}"
let vep = ensembl_vep(hgvs)
# Get the most severe consequence
let worst = if len(vep) > 0 then vep.most_severe_consequence else "unknown"
# Get impact level
let impact = if len(vep) > 0 then vep.impact else "MODIFIER"
{ variant: v, consequence: worst, impact: impact }
})
# Filter for high-impact variants
let high_impact = annotated |> filter(|a|
a.impact == "HIGH"
)
print(f"High-impact variants: {len(high_impact)}")
# Filter for specific consequence types
let lof_types = ["stop_gained", "frameshift_variant",
"splice_donor_variant", "splice_acceptor_variant"]
let lof = annotated |> filter(|a|
a.consequence in lof_types
)
print(f"Loss-of-function variants: {len(lof)}")
Step 6 — Building a Summary Table
# Create a summary table of high-impact variants
let summary = high_impact
|> map(|a| {
chrom: a.variant.chrom,
pos: a.variant.pos,
ref: a.variant.ref,
alt: a.variant.alt,
quality: a.variant.quality,
consequence: a.consequence,
impact: a.impact,
af: a.variant.info.AF,
})
|> to_table()
|> arrange("quality", "desc")
print("\n=== High-Impact Variants ===")
print(summary)
# Write to CSV for downstream use
write_csv(summary, "results/high_impact_variants.csv")
Step 7 — Genotype Analysis
# Analyze variant zygosity using built-in properties
# Each variant has: is_het, is_hom_ref, is_hom_alt
let het_table = passed
|> map(|v| {
chrom: v.chrom,
pos: v.pos,
is_het: v.is_het,
is_hom: v.is_hom_alt,
})
|> to_table()
# Count heterozygous variants
let het_only = filter(het_table, |row| row.is_het == true)
print(f"Heterozygous variants: {nrow(het_only)}")
# Compute het/hom ratio using built-in
let hh_ratio = het_hom_ratio(passed)
print(f"Het/Hom ratio: {round(hh_ratio, 3)}")
# Overall variant statistics
let stats = variant_summary(passed)
print(f"Variant stats: {stats}")
Step 8 — Visualizing Variants
# Build a table of variant positions and quality for plotting
let plot_data = passed
|> map(|v| {
chrom: v.chrom,
pos: v.pos,
quality: v.quality,
dp: v.info.DP,
})
|> to_table()
# Scatter plot of quality by position
plot(plot_data, {
x: "pos",
y: "quality",
title: "Variant Quality by Position",
})
# Histogram of quality scores
let qual_table = passed
|> map(|v| {quality: v.quality})
|> to_table()
histogram(qual_table)
print("Plots rendered")
Step 9 — Complete Variant Analysis Pipeline
# variant_pipeline.bio — end-to-end variant analysis
fn analyze_vcf(input, output_dir) {
print("=== Variant Analysis Pipeline ===\n")
# 1. Read and filter
let raw = vcf(input) |> collect()
print(f"Raw variants: {len(raw)}")
let filtered = raw
|> filter(|v| v.filter == "PASS")
|> filter(|v| v.quality >= 30.0)
|> filter(|v| v.info.DP >= 10)
print(f"After filtering: {len(filtered)}")
# 2. Classify using built-in variant_type
let type_tbl = filtered
|> map(|v| {vtype: v.variant_type})
|> to_table()
let types = value_counts(type_tbl, "vtype")
print(f"\nVariant types:")
print(types)
# 3. Annotate with VEP (for a subset — VEP queries are rate-limited)
let subset = head(filtered, 50)
let annotated = subset |> map(|v| {
let hgvs = f"{v.chrom}:g.{v.pos}{v.ref}>{v.alt}"
let vep = ensembl_vep(hgvs)
let consequence = if len(vep) > 0 then vep.most_severe_consequence else "unknown"
{ variant: v, consequence: consequence }
})
# 4. Summarize by consequence type
let conseq_tbl = annotated
|> map(|a| {consequence: a.consequence})
|> to_table()
let by_consequence = value_counts(conseq_tbl, "consequence")
print(f"\nConsequence types:")
print(by_consequence)
# 5. Write filtered variants to CSV
let summary = filtered |> map(|v| {
chrom: v.chrom, pos: v.pos, ref: v.ref,
alt: v.alt, quality: v.quality, vtype: v.variant_type,
}) |> to_table()
write_csv(summary, f"{output_dir}/filtered_variants.csv")
print(f"\nDone. Results in {output_dir}/")
}
analyze_vcf("data/sample.vcf.gz", "results")
Next Steps
Continue to RNA-seq Differential Expression to learn how to analyze gene expression data with BioLang.