Skip to contents

Overview

Generating personalized reports for many customers is a common bottleneck. This example demonstrates parallelizing RMarkdown/Quarto report generation using staRburst.

Use Case: Monthly customer reports, automated analytics, personalized dashboards, regulatory reporting

Computational Pattern: I/O-bound parallel processing with document rendering

The Problem

You need to generate 50 customized monthly reports for different customers:

  • Each report includes data analysis, visualizations, and summary statistics
  • Each report takes 1-2 minutes to render
  • Sequential generation would take 50-100 minutes
  • Reports must be delivered by end of business day

Report Template

Create a simple RMarkdown template:

# Create report template.
# NOTE: the template is a single-quoted R string, so "\`\`\`" is written to
# the .Rmd file as a literal ``` code fence. Every chunk fence below uses
# this escaped form consistently (the original had a garbled first fence).
report_template <- '
---
title: "Monthly Analytics Report"
output: html_document
params:
  customer_id: ""
  customer_name: ""
  month: ""
  data: NULL
---

\`\`\`{r template-setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)
\`\`\`

# Monthly Report for `r params$customer_name`

**Customer ID:** `r params$customer_id`
**Period:** `r params$month`
**Report Generated:** `r format(Sys.time(), "%Y-%m-%d %H:%M")`

---

## Executive Summary

This report summarizes activity for `r params$customer_name` during `r params$month`.

\`\`\`{r summary}
data <- params$data
cat(sprintf("Total transactions: %d\\n", nrow(data)))
cat(sprintf("Total revenue: $%.2f\\n", sum(data$revenue)))
cat(sprintf("Average order value: $%.2f\\n", mean(data$revenue)))
cat(sprintf("Active days: %d\\n", length(unique(data$date))))
\`\`\`

## Revenue Trend

\`\`\`{r revenue-plot, fig.width=8, fig.height=4}
daily_revenue <- aggregate(revenue ~ date, data, sum)
plot(daily_revenue$date, daily_revenue$revenue,
     type = "l", lwd = 2, col = "steelblue",
     main = "Daily Revenue Trend",
     xlab = "Date", ylab = "Revenue ($)")
grid()
\`\`\`

## Top Products

\`\`\`{r top-products}
top_products <- head(
  aggregate(revenue ~ product, data, sum),
  10
)
top_products <- top_products[order(-top_products$revenue), ]
knitr::kable(top_products, format.args = list(big.mark = ","))
\`\`\`

## Summary Statistics

\`\`\`{r stats}
stats <- data.frame(
  Metric = c("Total Orders", "Avg Order Value", "Max Order",
             "Min Order", "Std Dev"),
  Value = c(
    nrow(data),
    round(mean(data$revenue), 2),
    round(max(data$revenue), 2),
    round(min(data$revenue), 2),
    round(sd(data$revenue), 2)
  )
)
knitr::kable(stats, format.args = list(big.mark = ","))
\`\`\`

---

*This report was automatically generated using staRburst parallel processing.*
'

# Save template to file
writeLines(report_template, "report_template.Rmd")

Generate Sample Data

Create synthetic customer data:

# Simulate one month of transactions for a single customer.
#
# The RNG is seeded with the customer id so a given customer always gets
# the same synthetic data -- important for reproducible parallel runs.
# RNG draws happen in a fixed order (count, dates, products, revenue),
# so results are stable for a given seed.
#
# @param customer_id Integer id; also used as the RNG seed.
# @return data.frame with columns customer_id, date, product, revenue.
generate_customer_data <- function(customer_id) {
  set.seed(customer_id)

  # How many transactions this customer had (100-500).
  n_tx <- sample(100:500, 1)

  # Which January 2026 days they occurred on (sorted, with repeats).
  jan_days <- seq.Date(
    from = as.Date("2026-01-01"),
    to = as.Date("2026-01-31"),
    by = "day"
  )
  tx_dates <- sort(sample(jan_days, n_tx, replace = TRUE))

  catalog <- c("Product A", "Product B", "Product C",
               "Product D", "Product E")

  data.frame(
    customer_id = customer_id,
    date = tx_dates,
    product = sample(catalog, n_tx, replace = TRUE),
    revenue = round(rnorm(n_tx, mean = 150, sd = 50), 2),
    stringsAsFactors = FALSE
  )
}

# Generate customer list
n_customers <- 50

# Display names cycle A-Z with a numeric suffix:
# customer 1 -> "Company A 1", 26 -> "Company Z 1", 27 -> "Company A 2".
# The original arithmetic (`1:n %% 26 + 1`) was off by one: customer 1
# started at "B" and the letter/suffix rollover points disagreed. Shift
# to a 0-based index before %% / %/% so both cycle together.
idx <- seq_len(n_customers) - 1L
customers <- data.frame(
  customer_id = sprintf("CUST%03d", seq_len(n_customers)),
  customer_name = paste("Company", LETTERS[idx %% 26 + 1],
                        idx %/% 26 + 1),
  stringsAsFactors = FALSE
)

head(customers)

Report Generation Function

Define function to generate one report:

# Render one customer's monthly report from the shared template.
#
# @param customer_info One-row data.frame (or list) with customer_id
#   and customer_name.
# @return A list describing the outcome: always customer_id, success,
#   and render_time; plus output_file and file_size on success, or
#   error text on failure.
generate_report <- function(customer_info) {
  customer_id <- customer_info$customer_id
  customer_name <- customer_info$customer_name

  # Generate data for this customer (seeded by the numeric part of the
  # id, so each report is reproducible).
  data <- generate_customer_data(as.numeric(gsub("CUST", "", customer_id)))

  # Output file path
  output_file <- sprintf("report_%s.html", customer_id)

  # Give each render its own intermediates directory. Concurrent renders
  # of the same input .Rmd in one working directory collide on knitr's
  # intermediate files, which intermittently breaks parallel runs.
  intermediates <- tempfile(sprintf("render_%s_", customer_id))
  dir.create(intermediates, recursive = TRUE)
  on.exit(unlink(intermediates, recursive = TRUE), add = TRUE)

  tryCatch({
    # Render report
    rmarkdown::render(
      input = "report_template.Rmd",
      output_file = output_file,
      intermediates_dir = intermediates,
      params = list(
        customer_id = customer_id,
        customer_name = customer_name,
        month = "January 2026",
        data = data
      ),
      quiet = TRUE
    )

    list(
      customer_id = customer_id,
      success = TRUE,
      output_file = output_file,
      file_size = file.size(output_file),
      render_time = Sys.time()
    )
  }, error = function(e) {
    list(
      customer_id = customer_id,
      success = FALSE,
      error = as.character(e),
      render_time = Sys.time()
    )
  })
}

Local Execution

Test with a few reports locally:

# Test with a small local batch before going to the cloud.
test_customers <- head(customers, 5)
n_test <- nrow(test_customers)

cat(sprintf("Rendering %d reports locally...\n", n_test))
local_start <- Sys.time()

# One list element per customer row; lapply keeps this sequential.
# seq_len() is safe even for an empty data frame (1:nrow would be c(1, 0)).
local_results <- lapply(
  split(test_customers, seq_len(n_test)),
  generate_report
)

local_time <- as.numeric(difftime(Sys.time(), local_start, units = "secs"))

# Derive per-report figures from n_test rather than a hardcoded 5, so
# changing the test batch size keeps the estimates correct.
cat(sprintf("✓ Completed in %.1f seconds\n", local_time))
cat(sprintf("  Average: %.1f seconds per report\n", local_time / n_test))
cat(sprintf("  Estimated time for %d reports: %.1f minutes\n\n",
            n_customers, (local_time / n_test * n_customers) / 60))

Typical output:

Rendering 5 reports locally...
✓ Completed in 23.4 seconds
  Average: 4.7 seconds per report
  Estimated time for 50 reports: 3.9 minutes

Cloud Execution with staRburst

Render all reports in parallel:

cat(sprintf("Rendering %d reports on AWS...\n", n_customers))

# Convert data frame rows to a list of one-row data frames for
# starburst_map. seq_len() instead of 1:nrow() guards the degenerate
# empty case (1:0 would yield c(1, 0)).
customer_list <- split(customers, seq_len(nrow(customers)))

# 25 workers over 50 reports -> ~2 reports per worker; rendering is
# I/O-bound, so modest cpu/memory per worker is enough.
results <- starburst_map(
  customer_list,
  generate_report,
  workers = 25,
  cpu = 2,
  memory = "4GB"
)

Typical output:

🚀 Starting starburst cluster with 25 workers
💰 Estimated cost: ~$2.00/hour
📊 Processing 50 items with 25 workers
📦 Created 25 chunks (avg 2 items per chunk)
🚀 Submitting tasks...
✓ Submitted 25 tasks
⏳ Progress: 25/25 tasks (0.4 minutes elapsed)

✓ Completed in 0.4 minutes
💰 Actual cost: $0.01

Results Processing

Analyze the generation results:

# Check success rate. vapply is type-stable (always logical), even when
# `results` is empty, and the mask is computed once instead of twice.
succeeded <- vapply(results, function(x) isTRUE(x$success), logical(1))
success_count <- sum(succeeded)
failure_count <- sum(!succeeded)

cat("\n=== Report Generation Summary ===\n\n")
cat(sprintf("Total reports: %d\n", length(results)))
cat(sprintf("Successfully generated: %d (%.1f%%)\n",
            success_count, (success_count / length(results)) * 100))
cat(sprintf("Failed: %d\n\n", failure_count))

# File size summary over successful renders only.
successful_results <- results[succeeded]
file_sizes <- vapply(successful_results,
                     function(x) as.numeric(x$file_size), numeric(1))

if (length(file_sizes) > 0) {
  cat("File size statistics:\n")
  cat(sprintf("  Total size: %.2f MB\n", sum(file_sizes) / 1024^2))
  cat(sprintf("  Average size: %.1f KB\n", mean(file_sizes) / 1024))
  cat(sprintf("  Range: %.1f - %.1f KB\n\n",
              min(file_sizes) / 1024, max(file_sizes) / 1024))

  # Show up to 10 generated reports. head() handles < 10 successes; the
  # original indexed [1:10], which injects NULLs when fewer succeeded.
  cat("Generated reports:\n")
  report_files <- vapply(head(successful_results, 10),
                         function(x) x$output_file, character(1))
  print(report_files)
} else {
  cat("No reports were generated successfully.\n")
}

Typical output:

=== Report Generation Summary ===

Total reports: 50
Successfully generated: 50 (100.0%)
Failed: 0

File size statistics:
  Total size: 2.45 MB
  Average size: 50.2 KB
  Range: 45.3 - 55.8 KB

Generated reports:
 [1] "report_CUST001.html" "report_CUST002.html"
 [3] "report_CUST003.html" "report_CUST004.html"
 [5] "report_CUST005.html" "report_CUST006.html"
 [7] "report_CUST007.html" "report_CUST008.html"
 [9] "report_CUST009.html" "report_CUST010.html"

Performance Comparison

| Method                  | Reports | Time    | Cost   | Speedup |
|-------------------------|---------|---------|--------|---------|
| Local                   | 50      | 3.9 min | $0     | 1x      |
| staRburst (10 workers)  | 50      | 1.2 min | $0.004 | 3.3x    |
| staRburst (25 workers)  | 50      | 0.4 min | $0.01  | 9.8x    |
| staRburst (50 workers)  | 50      | 0.3 min | $0.02  | 13x     |

Key Insights:

  • Near-linear scaling with worker count
  • Sweet spot: 25-50 workers for this workload
  • Minimal cost ($0.01) for significant time savings
  • Can easily scale to 500+ reports

Advanced: Custom Report Distribution

Automatically distribute reports after generation:

# Generate a report and (optionally) upload it to S3.
#
# @param customer_info One-row data.frame with customer_id / customer_name.
# @return generate_report()'s result list, augmented on the success path
#   with uploaded and s3_url, or with uploaded = FALSE and upload_error
#   if the upload fails.
generate_and_distribute <- function(customer_info) {
  # Generate report
  result <- generate_report(customer_info)

  if (isTRUE(result$success)) {
    # The tryCatch must RETURN the updated list and be assigned back to
    # `result`: assigning fields inside the error handler only mutates a
    # copy local to the handler function, so the original version
    # silently dropped uploaded/upload_error on failure.
    result <- tryCatch({
      # Upload to S3 (example)
      # paws::s3()$put_object(
      #   Bucket = "my-reports-bucket",
      #   Key = sprintf("reports/2026-01/%s", result$output_file),
      #   Body = readBin(result$output_file, "raw",
      #                  file.size(result$output_file))
      # )

      result$uploaded <- TRUE
      result$s3_url <- sprintf("s3://my-reports-bucket/reports/2026-01/%s",
                              result$output_file)
      result
    }, error = function(e) {
      result$uploaded <- FALSE
      result$upload_error <- as.character(e)
      result
    })
  }

  result
}

When to Use This Pattern

Good fit:

  • Many independent reports (> 10)
  • Report rendering takes > 30 seconds
  • Time-sensitive delivery requirements
  • CPU or I/O intensive rendering

Not ideal:

  • Very simple reports (< 10 seconds to render)
  • Reports with shared state or dependencies
  • Interactive report generation
  • Real-time reporting

Running the Full Example

The complete runnable script is available at:

system.file("examples/reports.R", package = "starburst")

Run it with:

source(system.file("examples/reports.R", package = "starburst"))

Next Steps

  • Use real customer data from database
  • Add more complex visualizations
  • Implement automated email distribution
  • Create PDF reports instead of HTML
  • Add report templating system
  • Schedule monthly report generation

Related examples:

  • API Calls — another I/O-bound parallel task
  • Feature Engineering — data processing patterns