
Introduction

staRburst makes it straightforward to scale your parallel R code from your laptop to 100+ AWS workers. This vignette walks through setup and common usage patterns.

Installation

# Install from GitHub
remotes::install_github("yourname/starburst")

One-Time Setup

Before using staRburst, you need to configure AWS resources. This only needs to be done once.

library(starburst)

# Interactive setup wizard (takes ~2 minutes)
starburst_setup()

This will:

- Validate your AWS credentials
- Create an S3 bucket for data transfer
- Create an ECR repository for Docker images
- Set up ECS cluster and VPC resources
- Check Fargate quotas and offer to request increases
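Before running the wizard, you can confirm that your AWS credentials resolve. A minimal check using the AWS CLI (assuming it is installed and configured; this is not part of staRburst itself):

# Verify AWS credentials from R via the AWS CLI (assumed to be installed);
# `sts get-caller-identity` is a read-only call that reports the account ID
# and the user/role ARN currently in use
system2("aws", c("sts", "get-caller-identity"), stdout = TRUE)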

Basic Usage

The simplest way to use staRburst is with the furrr package:

library(furrr)
library(starburst)

# Define your work
expensive_simulation <- function(i) {
  # Some computation that takes roughly ten seconds per call
  results <- replicate(1000, {
    x <- rnorm(10000)
    mean(x^2)
  })
  mean(results)
}

# Local execution (single core); a fixed furrr seed keeps results reproducible
plan(sequential)
system.time({
  results_local <- future_map(1:100, expensive_simulation,
                              .options = furrr_options(seed = 123))
})
#> ~16 minutes on a typical laptop

# Cloud execution (50 workers), same seed
plan(future_starburst, workers = 50)
system.time({
  results_cloud <- future_map(1:100, expensive_simulation,
                              .options = furrr_options(seed = 123))
})
#> ~2 minutes (including 45s startup)
#> Cost: ~$0.85

# Results are identical
identical(results_local, results_cloud)
#> [1] TRUE

Example 1: Monte Carlo Simulation

library(starburst)
library(furrr)

# Simulate portfolio returns
simulate_portfolio <- function(seed) {
  set.seed(seed)
  
  # Random walk for 252 trading days
  returns <- rnorm(252, mean = 0.0003, sd = 0.02)
  prices <- cumprod(1 + returns)
  
  list(
    final_value = prices[252],
    max_drawdown = max((cummax(prices) - prices) / cummax(prices)),
    sharpe_ratio = mean(returns) / sd(returns) * sqrt(252)
  )
}

# Run 10,000 simulations on 100 workers
plan(future_starburst, workers = 100)

results <- future_map(1:10000, simulate_portfolio, .options = furrr_options(seed = TRUE))

# Analyze results
final_values <- sapply(results, `[[`, "final_value")
hist(final_values, breaks = 50, main = "Distribution of Portfolio Final Values")

# 95% confidence interval
quantile(final_values, c(0.025, 0.975))

Performance:

- Local (single core): ~4 hours
- Cloud (100 workers): ~3 minutes
- Cost: ~$1.80

Example 2: Bootstrap Resampling

library(starburst)
library(furrr)

# Your data
data <- read.csv("my_data.csv")

# Bootstrap function
bootstrap_regression <- function(i, data) {
  # Resample with replacement
  boot_indices <- sample(nrow(data), replace = TRUE)
  boot_data <- data[boot_indices, ]
  
  # Fit model
  model <- lm(y ~ x1 + x2 + x3, data = boot_data)
  
  # Return coefficients
  coef(model)
}

# Run 10,000 bootstrap samples
plan(future_starburst, workers = 50)

boot_results <- future_map(1:10000, bootstrap_regression, data = data,
                           .options = furrr_options(seed = TRUE))

# Convert to matrix
boot_coefs <- do.call(rbind, boot_results)

# 95% confidence intervals for each coefficient
apply(boot_coefs, 2, quantile, probs = c(0.025, 0.975))

Example 3: Genomics Pipeline

library(starburst)
library(furrr)

# Process one sample (read_fastq(), align_reads(), and call_variants() below
# are placeholders for your own pipeline functions)
process_sample <- function(sample_id) {
  # Read from S3 (data already in cloud)
  fastq_path <- sprintf("s3://my-genomics-data/samples/%s.fastq", sample_id)
  data <- read_fastq(fastq_path)
  
  # Align reads
  aligned <- align_reads(data, reference = "hg38")
  
  # Call variants
  variants <- call_variants(aligned)
  
  # Return summary
  list(
    sample_id = sample_id,
    num_variants = nrow(variants),
    variants = variants
  )
}

# Process 1000 samples on 100 workers
# (base list.files() cannot list s3:// paths; use an S3 listing helper
# such as aws.s3::get_bucket_df(), then strip the prefix and extension)
keys <- aws.s3::get_bucket_df("my-genomics-data", prefix = "samples/")$Key
sample_ids <- sub("\\.fastq$", "", basename(keys))

plan(future_starburst, workers = 100)

results <- future_map(sample_ids, process_sample, .progress = TRUE)

# Combine results
all_variants <- do.call(rbind, lapply(results, `[[`, "variants"))

Performance:

- Local (sequential): ~208 hours (8.7 days)
- Cloud (100 workers): ~2 hours
- Cost: ~$47

Working with Data

Data Already in S3

If your data is already in S3, workers can read it directly:

plan(future_starburst, workers = 50)

results <- future_map(file_list, function(file) {
  # Workers read directly from S3 (base read.csv() cannot open s3:// paths;
  # use an S3-aware reader such as arrow::read_csv_arrow() or aws.s3::s3read_using())
  data <- arrow::read_csv_arrow(sprintf("s3://my-bucket/%s", file))
  process(data)
})

Uploading Local Data

For smaller datasets, you can pass data as arguments:

# Load data locally
data <- read.csv("local_file.csv")

# staRburst automatically uploads to S3 and distributes
plan(future_starburst, workers = 50)

results <- future_map(1:1000, function(i) {
  # Each worker gets a copy of 'data'
  bootstrap_analysis(data, i)
})

Large Data Optimization

For very large objects, pre-upload to S3:

# Upload once
large_data <- read.csv("huge_file.csv")
s3_path <- starburst_upload(large_data, "s3://my-bucket/large_data.rds")

# Workers read from S3
plan(future_starburst, workers = 100)

results <- future_map(1:1000, function(i) {
  # Read from S3 inside the worker
  # (base readRDS() cannot open s3:// paths by itself; if your worker image
  # does not handle this, use aws.s3::s3readRDS(object = s3_path) instead)
  data <- readRDS(s3_path)
  process(data, i)
})

Cost Management

Estimate Costs

# Check cost before running
plan(future_starburst, workers = 100, cpu = 4, memory = "8GB")
#> Estimated cost: ~$3.50/hour
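If you want a rough, package-independent sanity check of that figure, Fargate is billed essentially per vCPU-hour plus per GB-hour of memory. A minimal sketch, with illustrative placeholder rates that you should replace with current Fargate pricing for your region:

# Back-of-the-envelope Fargate cost per hour
# (rate_vcpu_hr and rate_gb_hr are illustrative placeholders; look up
# current AWS Fargate pricing for your region before relying on them)
estimate_hourly_cost <- function(workers, cpu, memory_gb,
                                 rate_vcpu_hr = 0.04, rate_gb_hr = 0.0045) {
  workers * (cpu * rate_vcpu_hr + memory_gb * rate_gb_hr)
}

estimate_hourly_cost(workers = 50, cpu = 2, memory_gb = 4)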

Set Cost Limits

# Set maximum cost per job
starburst_config(
  max_cost_per_job = 10,      # Don't start jobs that would cost >$10
  cost_alert_threshold = 5     # Warn when approaching $5
)

# Now jobs exceeding limit will error before starting
plan(future_starburst, workers = 1000)  # Would cost ~$35/hour
#> Error: Estimated cost ($35/hr) exceeds limit ($10/hr)

Track Actual Costs

plan(future_starburst, workers = 50)

results <- future_map(data, process)

#> Cluster runtime: 23 minutes
#> Total cost: $1.34

Quota Management

Check Your Quota

starburst_quota_status()
#> Fargate vCPU Quota: 100 / 100 used
#> Allows: ~25 workers with 4 vCPUs each
#>
#> Recommended: Request increase to 500 vCPUs

Request Quota Increase

starburst_request_quota_increase(vcpus = 500)
#> Requesting Fargate vCPU quota increase:
#>   Current: 100 vCPUs
#>   Requested: 500 vCPUs
#>
#> ✓ Quota increase requested (Case ID: 12345678)
#> ✓ AWS typically approves within 1-24 hours

Wave-Based Execution

If you request more workers than your quota allows, staRburst automatically uses wave-based execution:

# Quota allows 25 workers, but you request 100
plan(future_starburst, workers = 100, cpu = 4)

#> ⚠ Requested: 100 workers (400 vCPUs)
#> ⚠ Current quota: 100 vCPUs (allows 25 workers max)
#>
#> 📋 Execution plan:
#>   • Running in 4 waves of 25 workers each
#>
#> 💡 Request quota increase to 500 vCPUs? [y/n]: y
#>
#> ✓ Quota increase requested
#> ⚡ Starting wave 1 (25 workers)...

results <- future_map(1:1000, expensive_function)

#> ⚡ Wave 1: 100% complete (250 tasks)
#> ⚡ Wave 2: 100% complete (500 tasks)
#> ⚡ Wave 3: 100% complete (750 tasks)
#> ⚡ Wave 4: 100% complete (1000 tasks)

Troubleshooting

View Worker Logs

# View logs from most recent cluster
starburst_logs()

# View logs from specific task
starburst_logs(task_id = "abc-123")

# View last 100 log lines
starburst_logs(last_n = 100)

Check Cluster Status

starburst_status()
#> Active Clusters:
#>   • starburst-xyz123: 50 workers running
#>   • starburst-abc456: 25 workers running

Common Issues

Environment mismatch: Packages not found on workers

# Rebuild environment
starburst_rebuild_environment()

Task failures: Some tasks failing

# Check logs
starburst_logs(task_id = "failed-task-id")

# Often due to memory limits - increase worker memory
plan(future_starburst, workers = 50, memory = "16GB")  # Default is 8GB

Slow data transfer: Large objects taking too long

# Use Arrow for data frames
library(arrow)
write_parquet(my_data, "s3://bucket/data.parquet")

# Workers read the Parquet file from S3
results <- future_map(1:100, function(i) {
  data <- read_parquet("s3://bucket/data.parquet")
  process(data, i)
})
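Note that reading and writing s3:// paths with arrow requires an arrow build compiled with S3 support; you can check this before relying on it:

# Returns TRUE if this arrow installation was built with S3 support
arrow::arrow_with_s3()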

Best Practices

1. Use for Right-Sized Workloads

Good: Each task takes >5 minutes

# 100 tasks, each takes 10 minutes
# Local: 1000 minutes, Cloud: ~10 minutes

Bad: Each task takes <1 minute

# 10000 tasks, each takes 30 seconds
# Startup overhead (45s) dominates
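A quick way to decide is to time a single task locally before scaling out. For example, using expensive_simulation() from the Basic Usage section:

# Time one task locally; compare against the ~45s cluster startup overhead
elapsed <- system.time(expensive_simulation(1))["elapsed"]
elapsed
# If a task runs for minutes, startup overhead is negligible; if it runs for
# seconds, batch tasks first (see the next section)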

2. Batch Small Tasks

Instead of:

# 10,000 tiny tasks
results <- future_map(1:10000, small_function)

Do:

# 100 batches of 100 tasks each
batches <- split(1:10000, ceiling(seq_along(1:10000) / 100))

results <- future_map(batches, function(batch) {
  lapply(batch, small_function)
})

# Flatten results
results <- unlist(results, recursive = FALSE)

3. Use S3 for Large Data

Don’t:

big_data <- read.csv("10GB_file.csv")  # Upload for every task
results <- future_map(1:1000, function(i) process(big_data, i))

Do:

# Upload once to S3 with starburst_upload() (base write.csv() cannot
# write to s3:// paths directly)
s3_path <- starburst_upload(big_data, "s3://bucket/big_data.rds")

# Workers read from S3 (see "Large Data Optimization" above)
results <- future_map(1:1000, function(i) {
  data <- readRDS(s3_path)
  process(data, i)
})

4. Set Reasonable Limits

starburst_config(
  max_cost_per_job = 50,           # Prevent accidents
  cost_alert_threshold = 25        # Get warned early
)

5. Clean Up

# staRburst auto-cleans, but you can force it
plan(sequential)  # Switch back to local
# Old cluster resources are cleaned up automatically

Advanced: Custom Configuration

CPU and Memory

# High CPU, low memory (CPU-bound work)
plan(future_starburst, workers = 50, cpu = 8, memory = "16GB")

# Low CPU, high memory (memory-bound work)
plan(future_starburst, workers = 25, cpu = 4, memory = "32GB")

Timeout

# Increase timeout for long-running tasks (default 1 hour)
plan(future_starburst, workers = 10, timeout = 7200)  # 2 hours

Region

# Use specific region (default from config)
plan(future_starburst, workers = 50, region = "us-west-2")

Next Steps