Python/R Interop

BioLang interoperates with Python and R through subprocess execution, file-based data exchange, and JSON communication. This lets you leverage existing libraries like scanpy, DESeq2, or ggplot2 from BioLang scripts.

Calling Python

Running a Python script

# Run a Python script and capture output.
# shell() returns a record with exit_code, stdout, and stderr fields
# (stderr handling is shown in the later examples).
let result = shell("python3 analyze.py --input data.csv")
print("Exit code: #{result.exit_code}")
print("Output: #{result.stdout}")

Inline Python with data exchange

# Prepare data in BioLang, process in Python, read results back.
let counts = read_csv("data/counts.csv")
write_tsv(counts, "_temp_counts.tsv")

let metadata = read_csv("data/sample_sheet.csv")
write_tsv(metadata, "_temp_metadata.tsv")

# Run DESeq2-like analysis via Python (pydeseq2):
# write the Python script to a temp file and run it.
let py_script = "
import pandas as pd
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

counts = pd.read_csv('_temp_counts.tsv', sep='\\t', index_col=0)
metadata = pd.read_csv('_temp_metadata.tsv', sep='\\t', index_col=0)

dds = DeseqDataSet(counts=counts, metadata=metadata, design_factors='condition')
dds.deseq2()

stat_res = DeseqStats(dds, contrast=['condition', 'treated', 'control'])
stat_res.summary()
stat_res.results_df.to_csv('_temp_deseq_results.tsv', sep='\\t')
"
write_text(py_script, "_temp_deseq.py")
let result = shell("python3 _temp_deseq.py")

if result.exit_code != 0 {
  print("Python error: #{result.stderr}")
  # Clean up the temp files we created before exiting, so a failed run
  # leaves no litter behind (the results file was never written here).
  remove("_temp_counts.tsv")
  remove("_temp_metadata.tsv")
  remove("_temp_deseq.py")
  exit(1)
}

# Read results back into BioLang
let de_results = tsv("_temp_deseq_results.tsv")
let significant = de_results
  |> filter(|r| float(r["padj"]) < 0.05)
  |> filter(|r| abs(float(r["log2FoldChange"])) > 1.0)

print("Significant DE genes: #{len(significant)}")
write_tsv(significant, "significant_genes.tsv")

# Clean up temp files
remove("_temp_counts.tsv")
remove("_temp_metadata.tsv")
remove("_temp_deseq_results.tsv")
remove("_temp_deseq.py")

Python via JSON exchange

# Pass structured data to Python via JSON.
let params = {
  input_file: "counts.csv",
  min_counts: 10,
  normalization: "TPM",
  output_file: "normalized.csv"
}
write_json(params, "_params.json")

let result = shell("python3 normalize.py --params _params.json")

if result.exit_code == 0 {
  # normalize.py is expected to write its run summary to _params_result.json.
  let output = read_json("_params_result.json")
  print("Normalized #{output["n_genes"]} genes across #{output["n_samples"]} samples")
} else {
  print("Normalization failed: #{result.stderr}")
}

# Clean up the parameter file, matching the cleanup convention used by the
# other examples in this guide.
remove("_params.json")

Calling R

Running an R script

# Run R script for statistical analysis.
# Arguments after the script name (input.csv, output.csv) are passed
# through to Rscript as positional command-line arguments.
let result = shell("Rscript analysis.R input.csv output.csv")

if result.exit_code != 0 {
  print("R error: #{result.stderr}")
  exit(1)
}

# The R script writes its results to output.csv; read them back here.
let output = read_csv("output.csv")
print("R analysis produced #{len(output)} results")

Inline R for DESeq2

# Prepare count matrix and run DESeq2 in R.
let counts = read_csv("data/counts.csv")
write_csv(counts, "_deseq_input.csv")

# NOTE: the design below hard-codes 3 control + 3 treated samples in
# column order; adjust coldata to match your actual sample sheet.
let r_script = "
library(DESeq2)

counts <- read.csv('_deseq_input.csv', row.names=1)
coldata <- data.frame(
  condition = factor(c(rep('control', 3), rep('treated', 3)))
)

dds <- DESeqDataSetFromMatrix(countData=counts, colData=coldata, design=~condition)
dds <- DESeq(dds)
res <- results(dds, contrast=c('condition', 'treated', 'control'))

write.csv(as.data.frame(res), '_deseq_output.csv')
"

write_text(r_script, "_deseq.R")
let result = shell("Rscript _deseq.R")

if result.exit_code == 0 {
  let de = read_csv("_deseq_output.csv")
  let sig = de |> filter(|r| float(r["padj"]) < 0.05)
  print("DESeq2 found #{len(sig)} significant genes")
  write_csv(sig, "deseq2_significant.csv")
} else {
  # Surface R failures instead of silently producing no output
  # (matches the error handling used in the Python examples above).
  print("R error: #{result.stderr}")
}

remove("_deseq.R")
remove("_deseq_input.csv")
remove("_deseq_output.csv")

R for plotting

# Generate publication-quality plots with ggplot2.
let data = tsv("pca_coordinates.tsv")
write_tsv(data, "_plot_data.tsv")

# Single-quoted BioLang string so the embedded R code can use
# double-quoted strings freely.
let r_plot = '
library(ggplot2)

data <- read.delim("_plot_data.tsv")
p <- ggplot(data, aes(x=pc1, y=pc2, color=population)) +
  geom_point(size=2, alpha=0.7) +
  theme_minimal() +
  labs(x="PC1", y="PC2", title="Population Structure") +
  scale_color_brewer(palette="Set2")

ggsave("pca_plot.png", p, width=8, height=6, dpi=300)
cat("Plot saved")
'

write_text(r_plot, "_plot.R")
let result = shell("Rscript _plot.R")
# Only claim success if Rscript actually succeeded; previously the
# "saved" message was printed even when plotting failed.
if result.exit_code == 0 {
  print("PCA plot saved to pca_plot.png")
} else {
  print("Plot generation failed: #{result.stderr}")
}
remove("_plot.R")
remove("_plot_data.tsv")

Working with External Tools

Passing data to external tools via temp files

# Write data then process with an external tool.
# read_fastq/write_fastq round-trip the reads to a temp file so that
# seqkit (which reads files, not BioLang values) can consume them.
write_fastq(read_fastq("data/reads.fastq"), "_temp.fq")
# -T produces tab-separated machine-readable stats.
let result = shell("seqkit stats -T _temp.fq")
print(result.stdout)
remove("_temp.fq")

Using samtools via pipe

# Use samtools for operations BioLang does not natively support.
let flagstat = shell("samtools flagstat aligned.bam")
print(flagstat.stdout)

# Filter BED then pass to samtools: keep only regions longer than 100 bp.
let bed_regions = read_bed("data/regions.bed")
  |> filter(|r| r.end - r.start > 100)

write_bed(bed_regions, "_filtered.bed")
# -L restricts output to reads overlapping the BED regions; -b emits BAM.
let result = shell("samtools view -L _filtered.bed -b aligned.bam -o targeted.bam")
# Check the exit code — previously `result` was captured but never
# inspected, so samtools failures went unnoticed.
if result.exit_code != 0 {
  print("samtools error: #{result.stderr}")
}
remove("_filtered.bed")

Data Format Exchange

AnnData exchange with Python

# Exchange data with scanpy via CSV/JSON.
# Export the per-cell table from scanpy first: adata.obs.to_csv("obs.csv")
let obs = read_csv("obs.csv")

# Count cells per Leiden cluster in BioLang
let cluster_sizes = frequencies(map(obs, |row| row["leiden"]))
print("Cluster sizes: #{to_string(cluster_sizes)}")

# Compute a custom per-cell score (genes detected per total count);
# assumes obs.csv has n_genes and total_counts columns — standard scanpy
# QC fields, but confirm against your AnnData object.
let scores = map(obs, |cell| {
  float(cell["n_genes"]) / float(cell["total_counts"])
})
write_json(scores, "custom_scores.json")
# Import back in scanpy: scores = json.load(open("custom_scores.json"))

Large datasets: TSV exchange (in lieu of Parquet)

# Use CSV/TSV for data exchange with Python/R
# (BioLang has no native Parquet support, so delimited text is the
# portable interchange format here).
let data = read_csv("data/expression.csv")

# Write as TSV for efficient exchange
write_tsv(data, "data.tsv")

# Python can read it:
# pd.read_csv("data.tsv", sep="\t")

# Read TSV back
let loaded = tsv("data.tsv")
print("Loaded #{len(loaded)} rows from TSV")

Best Practices

Temporary file management

# Use a dedicated temp directory so interop files are easy to clean up.
let tmp = "_tmp_interop"
mkdir(tmp)

try {
  let input_path = "#{tmp}/input.tsv"
  let output_path = "#{tmp}/output.tsv"

  write_tsv(data, input_path)
  # Check the shell exit status — previously a failed process.py surfaced
  # later as a confusing missing-file error from tsv(output_path).
  let run = shell("python3 process.py #{input_path} #{output_path}")
  if run.exit_code != 0 {
    print("process.py failed: #{run.stderr}")
  } else {
    let result = tsv(output_path)
    write_tsv(result, "final_output.tsv")
  }
} catch e {
  print("Error: #{to_string(e)}")
}

# Clean up (runs whether or not the try block succeeded, since the
# catch swallows any error)
remove(tmp)