From 806107e280814eda4e9db8147597183014d1aeff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Wed, 1 Apr 2026 12:08:08 +0200 Subject: [PATCH 1/3] refactor: simplify all-sites filling --- workflow/scripts/fill_all_sites.R | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/workflow/scripts/fill_all_sites.R b/workflow/scripts/fill_all_sites.R index 045ec61..4e74d48 100644 --- a/workflow/scripts/fill_all_sites.R +++ b/workflow/scripts/fill_all_sites.R @@ -1,6 +1,4 @@ #!/usr/bin/env Rscript - -# Write stdout and stderr to log file log <- file(snakemake@log[[1]], open = "wt") sink(log, type = "message") sink(log, type = "output") @@ -15,11 +13,6 @@ log_threshold(INFO) log_info("Reading variants") variants <- read_tsv(snakemake@input[["variants"]]) -# Create a mapping of variant names to their genomic position -variant_coords <- variants %>% - select(VARIANT_NAME, CHROM, POS) %>% - distinct() - log_info("Reading filtered sites") sites <- read_tsv(snakemake@input[["sites"]]) %>% distinct(SAMPLE, POS) %>% # TODO: consider region/chrom @@ -28,14 +21,9 @@ sites <- read_tsv(snakemake@input[["sites"]]) %>% log_info("Processing variants") all_variants <- variants %>% # Select minimal columns - distinct(VARIANT_NAME, CHROM, SAMPLE, ALT_FREQ) %>% - # Handle duplicates - group_by(SAMPLE, VARIANT_NAME, CHROM) %>% - summarise(ALT_FREQ = sum(ALT_FREQ, na.rm = TRUE), .groups = "drop") %>% + distinct(VARIANT_NAME, CHROM, POS, SAMPLE, ALT_FREQ) %>% # Complete with NA - complete(SAMPLE, VARIANT_NAME, CHROM) %>% - # Assign genomic positions for all combinations - left_join(variant_coords, by = c("CHROM", "VARIANT_NAME")) %>% + complete(SAMPLE, nesting(VARIANT_NAME, CHROM, POS)) %>% # Merge filtered sites # TODO: consider region/chrom left_join(sites, by = c("SAMPLE", "POS")) %>% From 2579cf94e142add6f4b1f31348ef2e21e1f3e28d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Wed, 1 Apr 2026 
12:08:41 +0200 Subject: [PATCH 2/3] refactor: add warning for ambiguous variant calls --- workflow/scripts/fill_all_sites.R | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/workflow/scripts/fill_all_sites.R b/workflow/scripts/fill_all_sites.R index 4e74d48..7d064ad 100644 --- a/workflow/scripts/fill_all_sites.R +++ b/workflow/scripts/fill_all_sites.R @@ -13,6 +13,19 @@ log_threshold(INFO) log_info("Reading variants") variants <- read_tsv(snakemake@input[["variants"]]) +# Check if each sample, variant and position have 1 frequency +variants %>% + distinct(VARIANT_NAME, CHROM, POS, SAMPLE, ALT_FREQ) %>% + group_by(VARIANT_NAME, CHROM, POS, SAMPLE) %>% + filter(n() > 1) %>% + { + if (nrow(.) > 0) { + log_warn( + "Found {nrow(.)} ambiguous (SAMPLE, VARIANT_NAME, POS) combinations" + ) + } + } + log_info("Reading filtered sites") sites <- read_tsv(snakemake@input[["sites"]]) %>% distinct(SAMPLE, POS) %>% # TODO: consider region/chrom From eecf88e0df8f6edab89755071cecc5a79278792c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Wed, 1 Apr 2026 12:39:45 +0200 Subject: [PATCH 3/3] fix: collapse multi-site variant names this is an edge case, but it's relevant for variants that share the same variant name but have different sites (multi-site variants with the same effect) --- .../report/pairwise_trajectory_correlation_data.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/workflow/scripts/report/pairwise_trajectory_correlation_data.R b/workflow/scripts/report/pairwise_trajectory_correlation_data.R index 8ca4cf1..fc92f86 100644 --- a/workflow/scripts/report/pairwise_trajectory_correlation_data.R +++ b/workflow/scripts/report/pairwise_trajectory_correlation_data.R @@ -1,6 +1,4 @@ #!/usr/bin/env Rscript - -# Write stdout and stderr to log file log <- file(snakemake@log[[1]], open = "wt") sink(log, type = "message") sink(log, type = "output") @@ -17,7 +15,7 @@ log_threshold(INFO) log_info("Reading 
variants") variants <- read_tsv(snakemake@input[["variants"]]) -# Obtain sample names ordered by CollectionDate +log_info("Sorting dates") date_order <- read_csv(snakemake@input[["metadata"]]) %>% arrange(CollectionDate) %>% pull(ID) %>% @@ -25,6 +23,14 @@ date_order <- read_csv(snakemake@input[["metadata"]]) %>% log_info("Formatting variants") all_variants_wider <- variants %>% + # Collapse positions (treat as uncertain if filter is inconsistent) + group_by(SAMPLE, VARIANT_NAME) %>% + summarise( + ALT_FREQ = ifelse(n_distinct(ALT_FREQ, na.rm = TRUE) > 1, NA, first(ALT_FREQ)), + FILTER_PASS = ifelse(n_distinct(FILTER_PASS) > 1, NA, first(FILTER_PASS)), + .groups = "drop" + ) %>% + mutate(ALT_FREQ = if_else(is.na(FILTER_PASS), NA, ALT_FREQ)) %>% distinct(SAMPLE, VARIANT_NAME, ALT_FREQ) %>% pivot_wider( names_from = VARIANT_NAME,