---
title: "Using greenAlgoR with targets Pipelines"
author: "Adrien Taudière"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Using greenAlgoR with targets Pipelines}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
bibliography: ../pkgdown/assets/bibliography.bib
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "figures/targets-",
  out.width = "100%",
  message = FALSE,
  warning = FALSE
)
```

## Introduction

The `targets` package [@landau_targets_2021] provides a powerful framework for
reproducible computational workflows in R. The `greenAlgoR` package seamlessly
integrates with `targets` to help you understand the environmental impact of
your entire data analysis pipeline.

This vignette demonstrates how to:

- Calculate carbon footprint for complete `targets` pipelines
- Identify the most carbon-intensive steps in your workflow
- Optimize pipelines for environmental efficiency

## Setup

```{r setup}
library(greenAlgoR)
library(targets)
library(ggplot2)
```

## Basic targets Integration

The `ga_targets()` function analyzes your `targets` pipeline and calculates the
total carbon footprint based on:

- Runtime of each target
- Memory usage patterns
- Storage requirements (optional)

### Simple Example

```{r targets-example}
# Create a simple targets example
tar_dir({ # tar_dir() runs code from a temp dir for CRAN compatibility
  # Define a simple pipeline
  tar_script(
    {
      library(targets)
      list(
        tar_target(
          name = data_prep,
          command = {
            # Simulate data preparation (2 seconds)
            Sys.sleep(2)
            data.frame(x = rnorm(1000), y = rnorm(1000))
          }
        ),
        tar_target(
          name = analysis,
          command = {
            # Simulate analysis (1 second)
            Sys.sleep(1)
            lm(y ~ x, data = data_prep)
          }
        ),
        tar_target(
          name = visualization,
          command = {
            # Simulate plotting (0.5 seconds)
            Sys.sleep(0.5)
            plot(data_prep$x, data_prep$y)
            "plot_completed"
          }
        )
      )
    },
    ask = FALSE
  )

  # Run the pipeline
  tar_make()

  # Get metadata
  metadata <- tar_meta()
  print(metadata[, c("name", "seconds", "bytes")])

  # Calculate carbon footprint
  pipeline_footprint <- ga_targets(
    tar_meta_raw = metadata,
    location_code = "WORLD",
    n_cores = 2,
    TDP_per_core = 15,
    memory_ram = 8
  )

  cat(
    "Pipeline carbon footprint:",
    pipeline_footprint$carbon_footprint_total_gCO2,
    "g CO2\n"
  )
  cat("Total runtime:", pipeline_footprint$runtime_h * 3600, "seconds\n")
})
```

## Advanced Pipeline Analysis

For more complex pipelines, you can get detailed insights:

```{r advanced-targets}
tar_dir({
  # Create a more complex pipeline
  tar_script(
    {
      library(targets)

      simulate_computation <- function(duration, size = 1000) {
        Sys.sleep(duration)
        matrix(rnorm(size * size), nrow = size)
      }

      list(
        tar_target(small_task, simulate_computation(0.5, 100)),
        tar_target(medium_task, simulate_computation(2, 500)),
        tar_target(large_task, simulate_computation(5, 1000)),
        tar_target(
          combined_analysis,
          {
            # Combine results
            result <- list(
              small = summary(small_task),
              medium = summary(medium_task),
              large = summary(large_task)
            )
            Sys.sleep(1) # Additional processing time
            result
          }
        )
      )
    },
    ask = FALSE
  )

  tar_make()
  metadata <- tar_meta()

  # Calculate footprint with storage estimation
  detailed_footprint <- ga_targets(
    tar_meta_raw = metadata,
    location_code = "FR", # France has relatively low carbon intensity
    n_cores = 4,
    TDP_per_core = 20,
    memory_ram = 16,
    add_storage_estimation = TRUE
  )

  # Display breakdown
  cat(
    "Total CO2 emissions:",
    detailed_footprint$carbon_footprint_total_gCO2,
    "g\n"
  )
  cat("CPU contribution:", detailed_footprint$carbon_footprint_cores, "g\n")
  cat(
    "Memory contribution:",
    detailed_footprint$carbon_footprint_memory,
    "g\n"
  )

  # The storage term is only reported when the result carries it
  if (!is.null(detailed_footprint$power_draw_storage_kWh)) {
    storage_co2 <- detailed_footprint$carbon_intensity *
      detailed_footprint$power_draw_storage_kWh
    cat("Storage contribution:", storage_co2, "g\n")
  }
})
```

## Comparing Different Configurations

You can compare how different hardware configurations affect your pipeline's
carbon footprint:

```{r config-comparison}
tar_dir({
  # Define a small three-task pipeline to compare configurations on
  tar_script(
    {
      library(targets)

      simulate_computation <- function(duration, size = 1000) {
        Sys.sleep(duration)
        matrix(rnorm(size * size), nrow = size)
      }

      list(
        tar_target(task1, simulate_computation(1, 200)),
        tar_target(task2, simulate_computation(2, 300)),
        tar_target(task3, simulate_computation(1.5, 250))
      )
    },
    ask = FALSE
  )

  tar_make()
  metadata <- tar_meta()

  # Compare different configurations
  configs <- data.frame(
    Config = c("Laptop", "Desktop", "Server"),
    Cores = c(2, 8, 16),
    TDP = c(10, 15, 25),
    RAM = c(8, 16, 64),
    Location = c("WORLD", "FR", "NO") # Different locations
  )

  # Calculate footprint for each configuration (one call per row of `configs`)
  configs$CO2_emissions <- mapply(
    function(cores, tdp, ram, loc) {
      ga_targets(
        tar_meta_raw = metadata,
        n_cores = cores,
        TDP_per_core = tdp,
        memory_ram = ram,
        location_code = loc
      )$carbon_footprint_total_gCO2
    },
    configs$Cores,
    configs$TDP,
    configs$RAM,
    configs$Location
  )

  print(configs)
})
```

## Visualizing Pipeline Carbon Footprint

Create comprehensive visualizations of your pipeline's environmental impact:

```{r pipeline-visualization, fig.alt="Footprint value compared to reference values such as 1 hour of Netflix streaming and a flight from London to Paris.", fig.width=10, fig.height=8}
tar_dir({
  # Create pipeline and get footprint with reference values
  tar_script(
    {
      library(targets)
      list(
        tar_target(data_load, {
          Sys.sleep(1)
          rnorm(5000)
        }),
        tar_target(preprocessing, {
          Sys.sleep(3)
          scale(data_load)
        }),
        tar_target(modeling, {
          Sys.sleep(5)
          lm(data_load ~ seq_along(data_load))
        }),
        tar_target(postprocessing, {
          Sys.sleep(2)
          summary(modeling)
        })
      )
    },
    ask = FALSE
  )

  tar_make()
  metadata <- tar_meta()

  pipeline_result <- ga_targets(
    tar_meta_raw = metadata,
    location_code = "WORLD",
    n_cores = 4,
    memory_ram = 16,
    add_ref_values = TRUE
  )

  # Create comprehensive visualization
  ref_data <- pipeline_result$ref_value
  ref_data$category <- "Reference Activities"

  # Add pipeline data
  pipeline_data <- data.frame(
    variable = "Your Pipeline",
    value = pipeline_result$carbon_footprint_total_gCO2,
    prop_footprint = NA,
    category = "Your Computation"
  )

  # Combine data (only the columns shared by both tables are kept)
  plot_data <- rbind(
    ref_data[, c("variable", "value", "category")],
    pipeline_data[, c("variable", "value", "category")]
  )
  plot_data$value <- as.numeric(plot_data$value)

  # Create the plot
  ggplot(plot_data, aes(
    x = reorder(variable, value),
    y = value,
    fill = category
  )) +
    geom_col(alpha = 0.8) +
    scale_fill_manual(
      values = c(
        "Reference Activities" = "lightblue",
        "Your Computation" = "darkgreen"
      ),
      name = "Type"
    ) +
    scale_y_log10() +
    coord_flip() +
    labs(
      title = "Carbon Footprint of targets Pipeline",
      subtitle = paste(
        "Total emissions:",
        round(pipeline_result$carbon_footprint_total_gCO2, 2),
        "g CO2"
      ),
      x = "Activity",
      y = "CO2 Emissions (g, log scale)",
      caption = "Comparison with everyday reference activities"
    ) +
    theme_minimal() +
    theme(
      legend.position = "bottom",
      plot.title = element_text(size = 14, face = "bold"),
      plot.subtitle = element_text(size = 12)
    )
})
```

## Target-Level Analysis

For detailed optimization, you might want to analyze individual targets:

```{r target-analysis}
tar_dir({
  tar_script(
    {
      library(targets)
      list(
        tar_target(quick_task, {
          Sys.sleep(0.5)
          "done"
        }),
        tar_target(slow_task, {
          Sys.sleep(10)
          "done"
        }),
        tar_target(memory_intensive, {
          # Simulate memory-intensive task
          big_matrix <- matrix(rnorm(1000 * 1000), nrow = 1000)
          Sys.sleep(3)
          summary(big_matrix)
        })
      )
    },
    ask = FALSE
  )

  tar_make()
  metadata <- tar_meta()

  # Analyze each target separately.
  # `bytes` in tar_meta() is the size of the target's stored data on disk,
  # not its RAM usage, hence the column is labelled as storage.
  target_analysis <- data.frame(
    Target = metadata$name,
    Runtime_sec = metadata$seconds,
    Storage_MB = metadata$bytes / (1024^2),
    stringsAsFactors = FALSE
  )

  # Calculate individual footprints (simplified).
  # vapply() guarantees one numeric value per target, whatever the input.
  target_analysis$CO2_estimate <- vapply(
    metadata$seconds,
    function(sec) {
      ga_footprint(
        runtime_h = sec / 3600,
        n_cores = 2,
        memory_ram = 8
      )$carbon_footprint_total_gCO2
    },
    numeric(1)
  )

  print(target_analysis)

  # Identify the most carbon-intensive target
  most_intensive <- target_analysis[which.max(target_analysis$CO2_estimate), ]
  cat(
    "\nMost carbon-intensive target:",
    most_intensive$Target,
    "(",
    round(most_intensive$CO2_estimate, 3),
    "g CO2 )\n"
  )
})
```

## Best Practices for Sustainable Pipelines

1. **Profile your pipeline**: Use `tar_meta()` to identify bottlenecks
2. **Optimize slow targets**: Focus on reducing runtime of carbon-intensive steps
3. **Cache efficiently**: Use `targets` caching to avoid re-running expensive computations
4. **Choose appropriate hardware**: Match computational resources to task requirements
5. **Consider location**: Run pipelines in regions with cleaner energy if possible

## Integration with Workflow

Include carbon footprint reporting as part of your standard pipeline:

```{r workflow-integration, eval=FALSE}
# Add to your _targets.R file
# NOTE(review): this target declares no upstream dependencies, so its position
# in the execution order is presumably not guaranteed -- confirm that it runs
# after the targets it is meant to measure.
tar_target(
  pipeline_footprint,
  {
    # Calculate footprint after pipeline completion
    footprint <- ga_targets(location_code = "FR")

    # Log the results
    cat(
      "Pipeline carbon footprint:",
      footprint$carbon_footprint_total_gCO2,
      "g CO2\n"
    )

    # Save for reporting (create the output directory first: saveRDS() does
    # not create missing directories)
    dir.create("results", showWarnings = FALSE, recursive = TRUE)
    saveRDS(footprint, "results/carbon_footprint.rds")

    footprint
  }
)
```

## Conclusion

Integrating `greenAlgoR` with `targets` provides a powerful way to understand
and optimize the environmental impact of your computational workflows. By
regularly monitoring carbon footprint, you can make informed decisions about
computational efficiency and contribute to more sustainable research practices.

## References