Cloud Data
BioLang supports direct interaction with cloud storage services for downloading reference data, uploading results, and working with remote datasets without manual transfer steps.
AWS S3
Downloading from S3
# Download a file from S3 (uses s3:// URL syntax)
s3_download("s3://my-genomics-bucket/references/GRCh38.fa", "refs/GRCh38.fa")
# If the destination is omitted, the file is saved under ~/.biolang/data/downloads/
s3_download("s3://my-genomics-bucket/data/large_file.bam")
print("Download complete")
Uploading to S3
# Upload results to S3
s3_upload("results/analysis.tsv", "s3://my-results-bucket/project_001/analysis.tsv")
# Upload multiple files
let files = glob("results/*.tsv")
for f in files {
s3_upload("results/#{f}", "s3://my-results-bucket/project_001/#{f}")
print("Uploaded #{f}")
}
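To mirror a whole directory in one step, you can shell out to the AWS CLI's sync subcommand instead of looping over files; this sketch assumes the aws CLI is installed and configured:
# aws s3 sync uploads only new or changed files
let result = shell("aws s3 sync results/ s3://my-results-bucket/project_001/")
if result.exit_code == 0 {
    print("Sync complete")
}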
Listing and filtering S3 objects
# List objects in a bucket with prefix
let objects = s3_list("s3://genomics-data/samples/batch_001/")
for obj in objects {
print("#{obj.name} (#{obj.size} bytes)")
}
# Find all FASTQ files in a bucket
let fastq_files = s3_list("s3://genomics-data/fastq/")
    |> filter(|obj| ends_with(obj.name, ".fq.gz"))
print("Found #{len(fastq_files)} FASTQ files")
# Download, then process locally
s3_download("s3://my-bucket/data/counts.csv", "counts.csv")
let data = read_csv("data/counts.csv")
|> filter(|row| int(row["count"]) > 10)
print("Filtered rows: #{len(data)}")
Working with public S3 buckets
# Public datasets can be fetched over HTTPS with download()
# 1000 Genomes Project data
download(
    "https://s3.amazonaws.com/1000genomes/release/20130502/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz",
    "1000g_chr22.vcf.gz"
)
# Or use s3_download with the s3:// URL
s3_download(
    "s3://1000genomes/release/20130502/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz",
    "1000g_chr22.vcf.gz"
)
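Public buckets can also be browsed without credentials by shelling out to the AWS CLI; the --no-sign-request flag skips credential lookup (assumes the aws CLI is on PATH):
# List a public bucket anonymously
let listing = shell("aws s3 ls s3://1000genomes/release/20130502/ --no-sign-request")
print(listing.stdout)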
Google Cloud Storage
GCS operations
# Download from GCS (uses gs:// URL syntax)
gcs_download("gs://my-gcs-bucket/data/sample.bam", "sample.bam")
# Upload to GCS
gcs_upload("results/variants.vcf.gz", "gs://my-gcs-bucket/results/run_001/variants.vcf.gz")
# Process GCS data locally after download
gcs_download("gs://my-gcs-bucket/data/counts.tsv", "counts.tsv")
let data = read_tsv("counts.tsv")
print("Loaded #{len(data)} rows")
Accessing public GCS datasets
# Access public genomics datasets on GCS
# gnomAD data
gcs_download(
    "gs://gcp-public-data--gnomad/release/4.0/vcf/genomes/gnomad.genomes.v4.0.sites.chr22.vcf.bgz",
    "gnomad_chr22.vcf.bgz"
)
# ENCODE via GCS
gcs_download(
    "gs://encode-public/2023/ENCFF001XYZ.bed.gz",
    "peaks.bed.gz"
)
Azure Blob Storage
Azure operations
# Azure Blob Storage via shell + az CLI
let account = env("AZURE_STORAGE_ACCOUNT")
# Download from Azure Blob Storage
shell("az storage blob download --account-name #{account} --container-name genomics-data --name references/GRCh38.fa --file refs/GRCh38.fa")
# Upload to Azure
shell("az storage blob upload --account-name #{account} --container-name results --name project_001/report.html --file results/report.html")
# List blobs
let result = shell("az storage blob list --account-name #{account} --container-name genomics-data --prefix samples/ --output json")
print(result.stdout)
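The listing comes back as JSON on stdout. A sketch of iterating over it, assuming a hypothetical parse_json() builtin not documented above (otherwise just print result.stdout as shown):
# parse_json() is an assumed helper, not documented above
let blobs = parse_json(result.stdout)
for b in blobs {
    print("#{b.name}")
}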
HTTP/FTP Downloads
Downloading reference data
# Download reference genomes and databases
download(
    "https://ftp.ensembl.org/pub/release-112/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
    "refs/GRCh38.fa.gz"
)
# Download from NCBI
download(
    "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz",
    "refs/clinvar.vcf.gz"
)
# FTP download
ftp_download(
    "ftp://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.gtf.gz",
    "refs/GRCh38.gtf.gz"
)
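Large reference downloads are worth verifying. A sketch that shells out to md5sum (assumed to be available) and checks its output against a published checksum; the expected value below is a placeholder, not a real hash:
# Placeholder: substitute the checksum published alongside the file
let expected = "expected_md5_here"
let result = shell("md5sum refs/clinvar.vcf.gz")
if contains(result.stdout, expected) {
    print("Checksum OK")
} else {
    print("Checksum mismatch")
}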
Batch downloads
# Download multiple files in parallel
let urls = [
{ url: "https://example.com/chr1.fa.gz", dest: "refs/chr1.fa.gz" },
{ url: "https://example.com/chr2.fa.gz", dest: "refs/chr2.fa.gz" },
{ url: "https://example.com/chr3.fa.gz", dest: "refs/chr3.fa.gz" }
]
par_map(urls, |item| {
download(item.url, item.dest)
print("Downloaded #{item.dest}")
})
print("All downloads complete")
Remote File Access
Streaming from URLs
# Read directly from a URL without downloading the full file
let vcf_count = read_vcf("https://example.com/data/variants.vcf.gz")
    |> filter(|v| v.chrom == "chr22" && v.filter == "PASS")
    |> count
print("PASS variants on chr22: #{vcf_count}")
# Stream FASTA from a URL
let genes = read_fasta("https://ftp.ensembl.org/pub/current/fasta/homo_sapiens/cds/Homo_sapiens.GRCh38.cds.all.fa.gz")
    |> filter(|r| contains(r.id, "BRCA1"))
    |> collect
print("BRCA1 transcripts: #{len(genes)}")
Working with signed URLs
# Generate a pre-signed URL using the AWS CLI
let result = shell("aws s3 presign s3://my-results/analysis/report.html --expires-in 3600")
let signed_url = trim(result.stdout)
print("Share this link: #{signed_url}")
# Download from a pre-signed URL
download(signed_url, "downloaded_report.html")
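GCS has an equivalent in gsutil signurl, which requires a service-account key file (the key path below is a placeholder):
# signurl prints a table whose last column is the signed URL
let result = shell("gsutil signurl -d 1h service_account.json gs://my-gcs-bucket/results/run_001/variants.vcf.gz")
print(result.stdout)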
Cloud Configuration
Credential management
# BioLang's cloud builtins use standard CLI tools under the hood:
#   AWS:   the aws CLI reads ~/.aws/credentials or AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
#   GCS:   gsutil reads GOOGLE_APPLICATION_CREDENTIALS or the gcloud auth login state
#   Azure: the az CLI reads AZURE_STORAGE_ACCOUNT / AZURE_STORAGE_KEY
# Verify credentials are configured
let aws_check = shell("aws sts get-caller-identity")
if aws_check.exit_code == 0 {
print("AWS credentials configured")
} else {
print("AWS credentials not found, set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY")
}
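The same exit-code pattern extends to the other providers; both commands below are standard gcloud/az CLI:
# gcloud: list authenticated accounts
let gcp_check = shell("gcloud auth list")
if gcp_check.exit_code == 0 {
    print("gcloud is available")
}
# az: show the active subscription
let az_check = shell("az account show")
if az_check.exit_code == 0 {
    print("Azure credentials configured")
} else {
    print("Azure credentials not found, run az login")
}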