Intermediate ~30 minutes

Variant Analysis

Learn how to read VCF files, filter variants by quality and functional impact, annotate variants with gene information from Ensembl, and generate summary statistics and plots.

What you will learn

  • Reading and parsing VCF files
  • Filtering variants by FILTER, QUAL, and INFO fields
  • Annotating variants with Ensembl gene information
  • Counting variants per chromosome and type
  • Generating summary tables and a Manhattan plot
Run this tutorial: Download variant-analysis.bl and run it with bl run examples/tutorials/variant-analysis.bl

Sample data: BioLang includes a VCF file at examples/sample-data/calls.vcf with 12 variants across 5 chromosomes. You can use it for this tutorial or substitute your own VCF.

Step 1 — Reading a VCF File

BioLang has a built-in VCF parser that understands the full VCF 4.3 specification, including INFO, FORMAT, and genotype fields.

# requires: examples/sample-data/calls.vcf in working directory
# variants.bio — VCF variant analysis

let variants = vcf("examples/sample-data/calls.vcf") |> collect()

# Total variant count
print(f"Total variants: {len(variants)}")

# Inspect a single variant
let v = head(variants, 1) |> first()
print(f"Chrom:  {v.chrom}")
print(f"Pos:    {v.pos}")
print(f"ID:     {v.id}")
print(f"Ref:    {v.ref}")
print(f"Alt:    {v.alt}")
print(f"Qual:   {v.quality}")
print(f"Filter: {v.filter}")

# Access INFO fields (info is a Record)
print(f"DP: {v.info.DP}")
print(f"AF: {v.info.AF}")

Step 2 — Filtering Variants

Apply standard quality filters to keep only reliable variant calls.

# Keep only PASS variants with QUAL >= 30
let passed = variants
  |> filter(|v| v.filter == "PASS" and v.quality >= 30.0)

print(f"After PASS + QUAL>=30: {len(passed)} variants")

# Further filter by depth — use |> into for left-to-right binding
passed
  |> filter(|v| v.info.DP >= 10)
  |> into depth_filtered

print(f"After DP>=10: {len(depth_filtered)} variants")

# Filter by allele frequency (for population data)
depth_filtered |> filter(|v| v.info.AF >= 0.01) |> into common
depth_filtered |> filter(|v| v.info.AF < 0.01) |> into rare

print(f"Common variants (AF>=1%): {len(common)}")
print(f"Rare variants (AF<1%):    {len(rare)}")

Step 3 — Classifying Variant Types

# Use built-in variant classification and Ts/Tv ratio
# Each variant has: variant_type, is_snp, is_indel, is_transition, is_transversion

# Count variant types using value_counts on a table
passed
  |> map(|v| {chrom: v.chrom, pos: v.pos, vtype: v.variant_type})
  |> to_table()
  |> into type_table

value_counts(type_table, "vtype") |> into type_counts
print("\n=== Variant Types ===")
print(type_counts)

# Compute transition/transversion ratio directly
let ti_tv = tstv_ratio(passed)
print(f"\nTi/Tv ratio: {round(ti_tv, 3)}")

Step 4 — Variants Per Chromosome

# Count variants per chromosome
let chrom_table = passed
  |> map(|v| {chromosome: v.chrom, pos: v.pos})
  |> to_table()

value_counts(chrom_table, "chromosome")
  |> rename("value", "chromosome")
  |> arrange("count", "desc")
  |> into chrom_counts

print("\n=== Variants Per Chromosome ===")
print(chrom_counts)

# Density: variants per megabase
let chrom_lengths = {
  "chr1": 248956422, "chr2": 242193529, "chr3": 198295559,
  "chr4": 190214555, "chr5": 181538259, "chr6": 170805979,
  # ... (abbreviated for clarity)
  "chrX": 156040895, "chrY": 57227415,
}

let counts_list = col(chrom_counts, "count")
let chroms_list = col(chrom_counts, "chromosome")
let density = mutate(chrom_counts, "per_mb", |row| round(row.count / 248.0, 2))
print(density)

Step 5 — Annotating with Ensembl VEP

Use ensembl_vep() to predict variant consequences via the Ensembl Variant Effect Predictor. Pass variants in HGVS notation.

# requires: internet connection
# Annotate variants using Ensembl VEP
# ensembl_vep takes HGVS notation: "chrom:g.posRef>Alt"
let annotated = passed |> map(|v| {
  let hgvs = f"{v.chrom}:g.{v.pos}{v.ref}>{v.alt}"
  let vep = ensembl_vep(hgvs)

  # Get the most severe consequence
  let worst = if len(vep) > 0 then vep.most_severe_consequence else "unknown"

  # Get impact level
  let impact = if len(vep) > 0 then vep.impact else "MODIFIER"

  { variant: v, consequence: worst, impact: impact }
})

# Filter for high-impact variants
let high_impact = annotated |> filter(|a|
  a.impact == "HIGH"
)
print(f"High-impact variants: {len(high_impact)}")

# Filter for specific consequence types
let lof_types = ["stop_gained", "frameshift_variant",
    "splice_donor_variant", "splice_acceptor_variant"]
let lof = annotated |> filter(|a|
  a.consequence in lof_types
)
print(f"Loss-of-function variants: {len(lof)}")

Step 6 — Building a Summary Table

# Create a summary table of high-impact variants
let summary = high_impact
  |> map(|a| {
    chrom:       a.variant.chrom,
    pos:         a.variant.pos,
    ref:         a.variant.ref,
    alt:         a.variant.alt,
    quality:     a.variant.quality,
    consequence: a.consequence,
    impact:      a.impact,
    af:          a.variant.info.AF,
  })
  |> to_table()
  |> arrange("quality", "desc")

print("\n=== High-Impact Variants ===")
print(summary)

# Write to CSV for downstream use
write_csv(summary, "results/high_impact_variants.csv")

Step 7 — Genotype Analysis

# Analyze variant zygosity using built-in properties
# Each variant has: is_het, is_hom_ref, is_hom_alt

let het_table = passed
  |> map(|v| {
    chrom:  v.chrom,
    pos:    v.pos,
    is_het: v.is_het,
    is_hom: v.is_hom_alt,
  })
  |> to_table()

# Count heterozygous variants
let het_only = filter(het_table, |row| row.is_het == true)
print(f"Heterozygous variants: {nrow(het_only)}")

# Compute het/hom ratio using built-in
let hh_ratio = het_hom_ratio(passed)
print(f"Het/Hom ratio: {round(hh_ratio, 3)}")

# Overall variant statistics
let stats = variant_summary(passed)
print(f"Variant stats: {stats}")

Step 8 — Visualizing Variants

# Build a table of variant positions and quality for plotting
let plot_data = passed
  |> map(|v| {
    chrom:   v.chrom,
    pos:     v.pos,
    quality: v.quality,
    dp:      v.info.DP,
  })
  |> to_table()

# Scatter plot of quality by position
plot(plot_data, {
  x:     "pos",
  y:     "quality",
  title: "Variant Quality by Position",
})

# Histogram of quality scores
let qual_table = passed
  |> map(|v| {quality: v.quality})
  |> to_table()
histogram(qual_table)

print("Plots rendered")

Step 9 — Complete Variant Analysis Pipeline

# variant_pipeline.bio — end-to-end variant analysis

fn analyze_vcf(input, output_dir) {
  print("=== Variant Analysis Pipeline ===\n")

  # 1. Read and filter
  let raw = vcf(input) |> collect()
  print(f"Raw variants: {len(raw)}")

  let filtered = raw
    |> filter(|v| v.filter == "PASS")
    |> filter(|v| v.quality >= 30.0)
    |> filter(|v| v.info.DP >= 10)
  print(f"After filtering: {len(filtered)}")

  # 2. Classify using built-in variant_type
  let type_tbl = filtered
    |> map(|v| {vtype: v.variant_type})
    |> to_table()
  let types = value_counts(type_tbl, "vtype")
  print(f"\nVariant types:")
  print(types)

  # 3. Annotate with VEP (for a subset — VEP queries are rate-limited)
  let subset = head(filtered, 50)
  let annotated = subset |> map(|v| {
    let hgvs = f"{v.chrom}:g.{v.pos}{v.ref}>{v.alt}"
    let vep = ensembl_vep(hgvs)
    let consequence = if len(vep) > 0 then vep.most_severe_consequence else "unknown"
    { variant: v, consequence: consequence }
  })

  # 4. Summarize by consequence type
  let conseq_tbl = annotated
    |> map(|a| {consequence: a.consequence})
    |> to_table()
  let by_consequence = value_counts(conseq_tbl, "consequence")
  print(f"\nConsequence types:")
  print(by_consequence)

  # 5. Write filtered variants to CSV
  let summary = filtered |> map(|v| {
    chrom: v.chrom, pos: v.pos, ref: v.ref,
    alt: v.alt, quality: v.quality, vtype: v.variant_type,
  }) |> to_table()

  write_csv(summary, f"{output_dir}/filtered_variants.csv")

  print(f"\nDone. Results in {output_dir}/")
}

analyze_vcf("data/sample.vcf.gz", "results")

Next Steps

Continue to RNA-seq Differential Expression to learn how to analyze gene expression data with BioLang.