Introduction
staRburst scales your parallel R code from your laptop to 100+ AWS workers by changing a single plan() call. This vignette walks through setup and common usage patterns.
One-Time Setup
Before using staRburst, you need to configure AWS resources. This only needs to be done once.
library(starburst)
# Interactive setup wizard (takes ~2 minutes)
starburst_setup()
This will:
- Validate your AWS credentials
- Create an S3 bucket for data transfer
- Create an ECR repository for Docker images
- Set up ECS cluster and VPC resources
- Check Fargate quotas and offer to request increases
Basic Usage
The simplest way to use staRburst is with the furrr package:
library(furrr)
library(starburst)
# Define your work
expensive_simulation <- function(i) {
  # Some computation that takes a few minutes
  results <- replicate(1000, {
    x <- rnorm(10000)
    mean(x^2)
  })
  mean(results)
}
# Local execution (single core)
plan(sequential)
set.seed(42)
system.time({
  results_local <- future_map(1:100, expensive_simulation, .options = furrr_options(seed = TRUE))
})
#> ~16 minutes on typical laptop
# Cloud execution (50 workers)
plan(future_starburst, workers = 50)
set.seed(42)
system.time({
  results_cloud <- future_map(1:100, expensive_simulation, .options = furrr_options(seed = TRUE))
})
#> ~2 minutes (including 45s startup)
#> Cost: ~$0.85
# Results are identical
identical(results_local, results_cloud)
#> [1] TRUE
Example 1: Monte Carlo Simulation
library(starburst)
library(furrr)
# Simulate portfolio returns
simulate_portfolio <- function(seed) {
  set.seed(seed)
  # Random walk for 252 trading days
  returns <- rnorm(252, mean = 0.0003, sd = 0.02)
  prices <- cumprod(1 + returns)
  list(
    final_value = prices[252],
    max_drawdown = max((cummax(prices) - prices) / cummax(prices)),  # largest peak-to-trough decline
    sharpe_ratio = mean(returns) / sd(returns) * sqrt(252)
  )
}
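# A quick sanity check of the drawdown line on a hand-picked toy path:
# cummax() tracks the running peak, so (cummax(prices) - prices) / cummax(prices)
# is the decline from the most recent peak, and max() keeps the worst one.
toy_prices <- c(1.00, 1.10, 0.99, 1.05)
cummax(toy_prices)                                           # running peaks: 1.00 1.10 1.10 1.10
max((cummax(toy_prices) - toy_prices) / cummax(toy_prices))  # 0.1, i.e. a 10% peak-to-trough drawdown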
# Run 10,000 simulations on 100 workers
plan(future_starburst, workers = 100)
results <- future_map(1:10000, simulate_portfolio, .options = furrr_options(seed = TRUE))
# Analyze results
final_values <- sapply(results, `[[`, "final_value")
hist(final_values, breaks = 50, main = "Distribution of Portfolio Final Values")
# 95% confidence interval
quantile(final_values, c(0.025, 0.975))
Performance:
- Local (single core): ~4 hours
- Cloud (100 workers): ~3 minutes
- Cost: ~$1.80
Example 2: Bootstrap Resampling
library(starburst)
library(furrr)
# Your data
data <- read.csv("my_data.csv")
# Bootstrap function
bootstrap_regression <- function(i, data) {
  # Resample with replacement
  boot_indices <- sample(nrow(data), replace = TRUE)
  boot_data <- data[boot_indices, ]
  # Fit model
  model <- lm(y ~ x1 + x2 + x3, data = boot_data)
  # Return coefficients
  coef(model)
}
# Run 10,000 bootstrap samples
plan(future_starburst, workers = 50)
boot_results <- future_map(1:10000, bootstrap_regression, data = data, .options = furrr_options(seed = TRUE))
# Convert to matrix
boot_coefs <- do.call(rbind, boot_results)
# 95% confidence intervals for each coefficient
apply(boot_coefs, 2, quantile, probs = c(0.025, 0.975))
Example 3: Genomics Pipeline
library(starburst)
library(furrr)
# Process one sample (read_fastq, align_reads, and call_variants stand in for
# your own genomics toolchain)
process_sample <- function(sample_id) {
  # Read from S3 (data already in cloud)
  fastq_path <- sprintf("s3://my-genomics-data/samples/%s.fastq", sample_id)
  data <- read_fastq(fastq_path)
  # Align reads
  aligned <- align_reads(data, reference = "hg38")
  # Call variants
  variants <- call_variants(aligned)
  # Return summary
  list(
    sample_id = sample_id,
    num_variants = nrow(variants),
    variants = variants
  )
}
# Process 1000 samples on 100 workers
# (list.files() cannot read s3:// paths directly; in practice, list the objects
# with an S3 client such as the aws.s3 or paws packages)
sample_ids <- sub("\\.fastq$", "", list.files("s3://my-genomics-data/samples/", pattern = "\\.fastq$"))
plan(future_starburst, workers = 100)
results <- future_map(sample_ids, process_sample, .progress = TRUE)
# Combine results
all_variants <- do.call(rbind, lapply(results, `[[`, "variants"))
Performance:
- Local (sequential): ~208 hours (8.7 days)
- Cloud (100 workers): ~2 hours
- Cost: ~$47
Working with Data
Large Data Optimization
For very large objects, pre-upload to S3:
# Upload once
large_data <- read.csv("huge_file.csv")
s3_path <- starburst_upload(large_data, "s3://my-bucket/large_data.rds")
# Workers read from S3
plan(future_starburst, workers = 100)
results <- future_map(1:1000, function(i) {
  # Read from S3 inside the worker
  data <- readRDS(s3_path)
  process(data, i)
})
Cost Management
Estimate Costs
# Check cost before running
plan(future_starburst, workers = 100, cpu = 4, memory = "8GB")
#> Estimated cost: ~$3.50/hour
Set Cost Limits
# Set maximum cost per job
starburst_config(
  max_cost_per_job = 10,     # Don't start jobs that would cost >$10
  cost_alert_threshold = 5   # Warn when approaching $5
)
# Now jobs exceeding limit will error before starting
plan(future_starburst, workers = 1000) # Would cost ~$35/hour
#> Error: Estimated cost ($35/hr) exceeds limit ($10/hr)
Track Actual Costs
plan(future_starburst, workers = 50)
results <- future_map(data, process)
#> Cluster runtime: 23 minutes
#> Total cost: $1.34
Quota Management
Check Your Quota
starburst_quota_status()
#> Fargate vCPU Quota: 100 / 100 used
#> Allows: ~25 workers with 4 vCPUs each
#>
#> Recommended: Request increase to 500 vCPUs
Request Quota Increase
starburst_request_quota_increase(vcpus = 500)
#> Requesting Fargate vCPU quota increase:
#> Current: 100 vCPUs
#> Requested: 500 vCPUs
#>
#> ✓ Quota increase requested (Case ID: 12345678)
#> ✓ AWS typically approves within 1-24 hours
Wave-Based Execution
If you request more workers than your quota allows, staRburst automatically uses wave-based execution:
# Quota allows 25 workers, but you request 100
plan(future_starburst, workers = 100, cpu = 4)
#> ⚠ Requested: 100 workers (400 vCPUs)
#> ⚠ Current quota: 100 vCPUs (allows 25 workers max)
#>
#> 📋 Execution plan:
#> • Running in 4 waves of 25 workers each
#>
#> 💡 Request quota increase to 500 vCPUs? [y/n]: y
#>
#> ✓ Quota increase requested
#> ⚡ Starting wave 1 (25 workers)...
results <- future_map(1:1000, expensive_function)
#> ⚡ Wave 1: 100% complete (250 tasks)
#> ⚡ Wave 2: 100% complete (500 tasks)
#> ⚡ Wave 3: 100% complete (750 tasks)
#> ⚡ Wave 4: 100% complete (1000 tasks)
Troubleshooting
View Worker Logs
# View logs from most recent cluster
starburst_logs()
# View logs from specific task
starburst_logs(task_id = "abc-123")
# View last 100 log lines
starburst_logs(last_n = 100)
Check Cluster Status
starburst_status()
#> Active Clusters:
#> • starburst-xyz123: 50 workers running
#> • starburst-abc456: 25 workers running
Common Issues
Environment mismatch: Packages not found on workers
# Rebuild environment
starburst_rebuild_environment()
Task failures: Some tasks failing
# Check logs
starburst_logs(task_id = "failed-task-id")
# Often due to memory limits - increase worker memory
plan(future_starburst, workers = 50, memory = "16GB") # Default is 8GB
Slow data transfer: Large objects taking too long
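The usual remedy is the pattern shown under Large Data Optimization: upload the object to S3 once with starburst_upload() and have each worker read it back, instead of shipping it with every task (a sketch; large_data and the bucket path are placeholders).
# Upload once, before calling future_map()
s3_path <- starburst_upload(large_data, "s3://my-bucket/large_data.rds")
# ...then call readRDS(s3_path) inside the worker function rather than
# capturing large_data in it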
Best Practices
1. Use for Right-Sized Workloads
✅ Good: Each task takes >5 minutes
# 100 tasks, each takes 10 minutes
# Local: 1000 minutes, Cloud: ~10 minutes
❌ Bad: Each task takes <1 minute
# 10000 tasks, each takes 30 seconds
# Startup (45s) and per-task overhead dominate the actual compute
2. Batch Small Tasks
Instead of:
# 10,000 tiny tasks
results <- future_map(1:10000, small_function)
Do:
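For example, one way to batch (a sketch: the chunk size of 100 is arbitrary and small_function is the same placeholder as above):
# 100 batches of 100 tasks each: one future per batch instead of one per task
batches <- split(1:10000, ceiling(seq_along(1:10000) / 100))
results <- future_map(batches, function(idx) lapply(idx, small_function))
results <- unlist(results, recursive = FALSE)  # back to one result per task
furrr's furrr_options() also accepts a chunk_size argument that achieves similar batching automatically.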
3. Use S3 for Large Data
Don’t:
big_data <- read.csv("10GB_file.csv")
results <- future_map(1:1000, function(i) process(big_data, i)) # big_data gets uploaded for every task
Do:
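Mirroring Large Data Optimization above, upload the data once and read it inside each worker (a sketch; process is the same placeholder function):
big_data <- read.csv("10GB_file.csv")
s3_path <- starburst_upload(big_data, "s3://my-bucket/big_data.rds")  # upload once
results <- future_map(1:1000, function(i) {
  big_data <- readRDS(s3_path)  # each worker pulls from S3 instead of receiving a 10 GB export
  process(big_data, i)
})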
4. Set Reasonable Limits
starburst_config(
  max_cost_per_job = 50,      # Prevent accidents
  cost_alert_threshold = 25   # Get warned early
)
5. Clean Up
# staRburst auto-cleans, but you can force it
plan(sequential) # Switch back to local
# Old cluster resources are cleaned up automatically
Advanced: Custom Configuration
Timeout
# Increase timeout for long-running tasks (default 1 hour)
plan(future_starburst, workers = 10, timeout = 7200) # 2 hours
Region
# Use specific region (default from config)
plan(future_starburst, workers = 50, region = "us-west-2")
Next Steps
- Check out the Advanced Usage vignette
- Review Performance Tuning guide
- See Example Workflows for real-world patterns
- Read Troubleshooting Guide when stuck
Getting Help
- GitHub Issues: https://github.com/yourname/starburst/issues
- Discussions: https://github.com/yourname/starburst/discussions
- Email: your.email@example.com
