---
title: "Benchmarking Chimera Detection Methods"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Benchmarking Chimera Detection Methods}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE
)
```

```{r setup}
library(tidypq)
library(MiscMetabar)
library(phyloseq)
library(ggplot2)
library(patchwork)
```

## Overview

This vignette demonstrates how to benchmark chimera detection methods using
synthetic chimeric sequences. The `create_chimera_pq()` function generates
artificial chimeras with known parent sequences, allowing you to evaluate
detection sensitivity and specificity.

## Data Preparation

To reduce computation time, we use a subset of `data_fungi` filtered by
taxonomy. You can adjust the filtering to test on larger or smaller datasets.

```{r prepare-data}
data(data_fungi)

# Filter to a single Class for faster computation
# Original data_fungi has 1420 taxa - this reduces it to ~200-400 taxa
data_fungi_subset <- subset_taxa(
  data_fungi,
  Class %in% c("Agaricomycetes", "Dacrymycetes","Eurotiomycetes", "Lecanoromycetes", "Tremellomycetes")
)

cat("Original taxa:", ntaxa(data_fungi), "\n")
cat("Subset taxa:", ntaxa(data_fungi_subset), "\n")
```

## Creating Test Data with Synthetic Chimeras

Create a phyloseq object with known chimeric sequences. The
`ensure_distinct_parents = TRUE` (default) ensures that parent sequences are
sufficiently different, making the chimeras detectable.

```{r create-chimeras}
df_chim <- create_chimera_pq(
  data_fungi_subset,
  n_chimeras = 50,
  median_abundance_multiplier = 0.05,
  min_parent_distance = 0.05
)
data_fungi_test <- df_chim$physeq
known_chimeras <- df_chim$chimera_names

# View parent information including sequence distance
print(df_chim$parent_info)
cat(
  "\nMinimum parent distance:",
  min(df_chim$parent_info$parent_distance),
  "\n"
)
cat("\nMean parent distance:", mean(df_chim$parent_info$parent_distance), "\n")
```

## Comparing Detection Methods

### Helper function for benchmarking

```{r benchmark-helper}
# Helper function to benchmark a detection method
benchmark_detection <- function(
  physeq,
  method_fn,
  method_name,
  known_chimeras,
  ...
) {
  # Measure time and memory
  gc() # Clean up before measurement
  mem_before <- sum(gc()[, 2])

  start_time <- Sys.time()
  nochim <- method_fn(physeq, ...)
  end_time <- Sys.time()

  mem_after <- sum(gc()[, 2])

  # Calculate detection metrics
  detected <- known_chimeras[!known_chimeras %in% taxa_names(nochim)]
  missed <- known_chimeras[known_chimeras %in% taxa_names(nochim)]

  list(
    method = method_name,
    physeq_nochim = nochim,
    n_chimeras = length(known_chimeras),
    n_detected = length(detected),
    n_missed = length(missed),
    detection_rate = 100 * length(detected) / length(known_chimeras),
    total_removed = ntaxa(physeq) - ntaxa(nochim),
    time_seconds = as.numeric(difftime(end_time, start_time, units = "secs")),
    memory_mb = mem_after - mem_before,
    detected_names = detected,
    missed_names = missed
  )
}

# Helper function to calculate detection rate
calc_detection_rate <- function(physeq_nochim, known_chimeras) {
  detected <- sum(!known_chimeras %in% taxa_names(physeq_nochim))
  100 * detected / length(known_chimeras)
}
```

### Run benchmarks

```{r run-benchmarks}
# vsearch-based detection
result_vs <- benchmark_detection(
  data_fungi_test,
  chimera_removal_vs,
  "vsearch",
  known_chimeras
)

# dada2-based detection
result_dada2 <- benchmark_detection(
  data_fungi_test,
  chimera_removal_dada2,
  "dada2",
  known_chimeras
)
```

## Benchmark Summary Table

```{r summary-table}
# Create summary table
benchmark_summary <- data.frame(
  Method = c(result_vs$method, result_dada2$method),
  `N Chimeras` = c(result_vs$n_chimeras, result_dada2$n_chimeras),
  Detected = c(result_vs$n_detected, result_dada2$n_detected),
  `Detection Rate (%)` = round(
    c(result_vs$detection_rate, result_dada2$detection_rate),
    1
  ),
  `Total Removed` = c(result_vs$total_removed, result_dada2$total_removed),
  `Time (s)` = round(c(result_vs$time_seconds, result_dada2$time_seconds), 2),
  `Memory (MB)` = round(c(result_vs$memory_mb, result_dada2$memory_mb), 1),
  check.names = FALSE
)

knitr::kable(benchmark_summary, caption = "Chimera Detection Benchmark Summary")
```

## Evaluating Detection Performance

```{r evaluate}
cat("=== Chimera Detection Benchmark ===\n")
cat("Known chimeras:", length(known_chimeras), "\n\n")
cat(
  "vsearch detected:",
  result_vs$n_detected,
  "/",
  result_vs$n_chimeras,
  "(",
  round(result_vs$detection_rate, 1),
  "%)\n"
)
cat(
  "dada2 detected:",
  result_dada2$n_detected,
  "/",
  result_dada2$n_chimeras,
  "(",
  round(result_dada2$detection_rate, 1),
  "%)\n\n"
)

cat("vsearch total removed:", result_vs$total_removed, "\n")
cat("dada2 total removed:", result_dada2$total_removed, "\n")
```

## Analyzing Missed Chimeras

Examine which chimeras were missed and their parent sequences:

```{r missed-chimeras}
if (length(result_vs$missed_names) > 0) {
  cat("\nMissed by vsearch:\n")
  missed_info <- df_chim$parent_info[
    df_chim$parent_info$chimera %in% result_vs$missed_names,
  ]
  print(missed_info)
  cat(
    "\nMean parent distance of missed chimeras:",
    mean(missed_info$parent_distance),
    "\n"
  )
}

if (length(result_dada2$missed_names) > 0) {
  cat("\nMissed by dada2:\n")
  missed_info <- df_chim$parent_info[
    df_chim$parent_info$chimera %in% result_dada2$missed_names,
  ]
  print(missed_info)
  cat(
    "\nMean parent distance of missed chimeras:",
    mean(missed_info$parent_distance),
    "\n"
  )
}
```

## Comparing Detection Across Different Chimera Types

Create different types of synthetic chimeras to test detection robustness:

```{r create-variants}
# More variable proportions (harder to detect)
result_hard <- create_chimera_pq(
  data_fungi_subset,
  n_chimeras = 50,
  prop_mean = 0.5,
  prop_sd = 0.25 # wider distribution
)

# Biased proportions (70/30 splits)
result_biased <- create_chimera_pq(
  data_fungi_subset,
  n_chimeras = 50,
  prop_mean = 0.7,
  prop_sd = 0.1
)

# Lower abundance chimeras (harder to detect)
result_low_abund <- create_chimera_pq(
  data_fungi_subset,
  n_chimeras = 50,
  median_abundance_multiplier = 0.01
)
```

Run both detection methods on all chimera types:

```{r detect-all-variants}
# Run detection on all variants
datasets <- list(
  default = list(physeq = data_fungi_test, chimeras = known_chimeras),
  hard = list(
    physeq = result_hard$physeq,
    chimeras = result_hard$chimera_names
  ),
  biased = list(
    physeq = result_biased$physeq,
    chimeras = result_biased$chimera_names
  ),
  low_abund = list(
    physeq = result_low_abund$physeq,
    chimeras = result_low_abund$chimera_names
  )
)

results <- data.frame(
  dataset = character(),
  method = character(),
  detection_rate = numeric(),
  time_seconds = numeric(),
  stringsAsFactors = FALSE
)

for (name in names(datasets)) {
  ds <- datasets[[name]]

  # vsearch detection
  start_time <- Sys.time()
  nochim_vs <- chimera_removal_vs(ds$physeq)
  time_vs <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
  rate_vs <- calc_detection_rate(nochim_vs, ds$chimeras)

  # dada2 detection
  start_time <- Sys.time()
  nochim_dada2 <- chimera_removal_dada2(ds$physeq)
  time_dada2 <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
  rate_dada2 <- calc_detection_rate(nochim_dada2, ds$chimeras)

  results <- rbind(
    results,
    data.frame(
      dataset = name,
      method = c("vsearch", "dada2"),
      detection_rate = c(rate_vs, rate_dada2),
      time_seconds = c(time_vs, time_dada2),
      stringsAsFactors = FALSE
    )
  )
}

print(results)
```

## Summary Table: All Scenarios

```{r summary-all-scenarios}
# Reshape for summary table
results_wide <- reshape(
  results,
  direction = "wide",
  idvar = "dataset",
  timevar = "method",
  v.names = c("detection_rate", "time_seconds")
)
names(results_wide) <- gsub("\\.", " ", names(results_wide))

knitr::kable(
  results_wide,
  caption = "Detection Rate and Time Across Scenarios",
  digits = 1
)
```

## Visualizing Results

```{r visualize}
# Combine parent info from all datasets for histograms
prop_data <- rbind(
  data.frame(
    dataset = "Default (sd=0.15)",
    prop = df_chim$parent_info$prop_parent1,
    min_dist = min(df_chim$parent_info$parent_distance)
  ),
  data.frame(
    dataset = "Hard (sd=0.25)",
    prop = result_hard$parent_info$prop_parent1,
    min_dist = min(result_hard$parent_info$parent_distance)
  ),
  data.frame(
    dataset = "Biased (mean=0.7)",
    prop = result_biased$parent_info$prop_parent1,
    min_dist = min(result_biased$parent_info$parent_distance)
  ),
  data.frame(
    dataset = "Low abundance",
    prop = result_low_abund$parent_info$prop_parent1,
    min_dist = min(result_low_abund$parent_info$parent_distance)
  )
)
prop_data$dataset <- factor(
  prop_data$dataset,
  levels = c(
    "Default (sd=0.15)",
    "Hard (sd=0.25)",
    "Biased (mean=0.7)",
    "Low abundance"
  )
)

# Distribution of parent1 proportions for each dataset
p_hist <- ggplot(prop_data, aes(x = prop)) +
  geom_histogram(
    binwidth = 0.05,
    fill = "steelblue",
    color = "white",
    alpha = 0.7
  ) +
  facet_wrap(~dataset, ncol = 2) +
  scale_x_continuous(limits = c(0, 1)) +
  labs(
    x = "Proportion from parent1",
    y = "Count",
    title = "Distribution of Parent1 Proportions"
  ) +
  theme_bw()

print(p_hist)

# Barplot comparing all detection rates
results$dataset <- factor(
  results$dataset,
  levels = c(
    "Default (sd=0.15)",
    "Hard (sd=0.25)",
    "Biased (mean=0.7)",
    "Low abundance"
  )
)

p_bar <- ggplot(results, aes(x = dataset, y = detection_rate, fill = method)) +
  geom_violin(position = position_dodge(width = 0.8), width = 0.7) +
  geom_hline(
    yintercept = seq(0, 100, 20),
    color = "gray",
    linetype = "dashed",
    alpha = 0.5
  ) +
  scale_fill_manual(values = c("vsearch" = "steelblue", "dada2" = "coral")) +
  scale_y_continuous(limits = c(0, 100), expand = c(0, 0)) +
  labs(
    x = "Dataset",
    y = "Detection rate (%)",
    title = "Chimera Detection Across Different Scenarios",
    fill = "Method"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p_bar)
```

## Effect of Chimera Abundance on Detection

Chimera abundance relative to parent sequences affects detection sensitivity.

```{r abundance-effect}
abundance_multipliers <- c(0.01, 0.05, 0.1, 0.5)

abundance_results <- data.frame(
  multiplier = numeric(),
  method = character(),
  detection_rate = numeric(),
  time_seconds = numeric(),
  stringsAsFactors = FALSE
)

for (mult in abundance_multipliers) {
  # Create chimeras with this abundance level
  result_abund <- create_chimera_pq(
    data_fungi_subset,
    n_chimeras = 50,
    median_abundance_multiplier = mult,
    seed = 123
  )

  # vsearch detection
  start_time <- Sys.time()
  nochim_vs <- chimera_removal_vs(result_abund$physeq)
  time_vs <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
  rate_vs <- calc_detection_rate(nochim_vs, result_abund$chimera_names)

  # dada2 detection
  start_time <- Sys.time()
  nochim_dada2 <- chimera_removal_dada2(result_abund$physeq)
  time_dada2 <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
  rate_dada2 <- calc_detection_rate(nochim_dada2, result_abund$chimera_names)

  abundance_results <- rbind(
    abundance_results,
    data.frame(
      multiplier = c(mult, mult),
      method = c("vsearch", "dada2"),
      detection_rate = c(rate_vs, rate_dada2),
      time_seconds = c(time_vs, time_dada2),
      stringsAsFactors = FALSE
    )
  )
}

print(abundance_results)
```

### Abundance Summary Table

```{r abundance-summary-table}
knitr::kable(
  abundance_results,
  caption = "Detection by Chimera Abundance Level",
  digits = c(2, 0, 1, 2)
)
```

### Visualizing Abundance Effect

```{r abundance-plot}
# Detection rate vs abundance multiplier
p_abund_detection <- ggplot(
  abundance_results,
  aes(x = multiplier, y = detection_rate, color = method)
) +
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  scale_x_log10() +
  scale_y_continuous(limits = c(0, 100)) +
  scale_color_manual(values = c("vsearch" = "steelblue", "dada2" = "coral")) +
  labs(
    x = "median_abundance_multiplier",
    y = "Detection rate (%)",
    title = "Detection vs Chimera Abundance",
    color = "Method"
  ) +
  theme_bw()

print(p_abund_detection)

# Barplot comparison
abundance_results$multiplier_label <- paste0(
  abundance_results$multiplier * 100,
  "%"
)
abundance_results$multiplier_label <- factor(
  abundance_results$multiplier_label,
  levels = paste0(sort(unique(abundance_results$multiplier)) * 100, "%")
)

p_abund_bar <- ggplot(
  abundance_results,
  aes(x = multiplier_label, y = detection_rate, fill = method)
) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_hline(
    yintercept = seq(0, 100, 20),
    color = "gray",
    linetype = "dashed",
    alpha = 0.5
  ) +
  scale_fill_manual(values = c("vsearch" = "steelblue", "dada2" = "coral")) +
  scale_y_continuous(limits = c(0, 100), expand = c(0, 0)) +
  labs(
    x = "Chimera abundance (% of median)",
    y = "Detection rate (%)",
    title = "Detection Rate by Chimera Abundance Level",
    fill = "Method"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p_abund_bar)
```

## Effect of Parent Sequence Distance

Chimeras created from very similar parent sequences are harder to detect.
The `ensure_distinct_parents` parameter (default TRUE) helps avoid this issue.

```{r parent-distance-effect}
# Compare chimeras with different minimum parent distances
distance_results <- data.frame(
  min_param_distance = numeric(),
  method = character(),
  mean_parent_distance = numeric(),
  min_parent_distance = numeric(),
  max_parent_distance = numeric(),
  detection_rate = numeric(),
  nb_detected = numeric(),
  stringsAsFactors = FALSE
)

for (min_dist in c(0, 0.001, 0.01, 0.05, 0.1, 0.2, 0.4)) {
  result_dist <- create_chimera_pq(
    data_fungi_subset,
    n_chimeras = 50,
    min_parent_distance = min_dist,
    seed = 456
  )

  mean_dist <- mean(result_dist$parent_info$parent_distance)
  min_dist_parent <- min(result_dist$parent_info$parent_distance)

  # vsearch detection
  nochim_vs <- chimera_removal_vs(result_dist$physeq)
  rate_vs <- calc_detection_rate(nochim_vs, result_dist$chimera_names)

  # dada2 detection
  nochim_dada2 <- chimera_removal_dada2(result_dist$physeq)
  rate_dada2 <- calc_detection_rate(nochim_dada2, result_dist$chimera_names)

  distance_results <- rbind(
    distance_results,
    data.frame(
      min_param_distance = min_dist,
      method = c("vsearch", "dada2"),
      mean_parent_distance = c(mean_dist, mean_dist),
      min_parent_distance = c(min_dist_parent, min_dist_parent),
      max_parent_distance = c(
        max(result_dist$parent_info$parent_distance),
        max(result_dist$parent_info$parent_distance)
      ),
      detection_rate = c(rate_vs, rate_dada2),
      nb_detected = c(
       ntaxa(result_dist$physeq) - ntaxa(nochim_vs),
        ntaxa(result_dist$physeq) - ntaxa(nochim_dada2)
      ),
      stringsAsFactors = FALSE
    )
  )
}

knitr::kable(
  distance_results,
  caption = "Detection Rate by Minimum Parent Distance",
  digits = c(2, 0, 3, 4, 0)
)
```

```{r parent-distance-plot}
# Plot detection rate vs mean parent distance
p_dist <- ggplot(
  distance_results,
  aes(x = min_param_distance, y = detection_rate, color = method)
) +
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  scale_x_continuous(limits = c(0, 0.5)) +
  scale_y_continuous(limits = c(0, 100)) +  
  scale_color_manual(values = c("vsearch" = "steelblue", "dada2" = "coral")) +
  labs(
    x = "Min parameter distance",
    y = "Detection rate (%)",
    title = "Detection vs Min Parameter  Distance",
    color = "Method"
  ) +
  theme_bw() +
  ggrepel::geom_label_repel(aes(label=nb_detected), size = 3, show.legend = FALSE)
print(p_dist)
```