diff --git a/workflow/scripts/fill_all_sites.R b/workflow/scripts/fill_all_sites.R index 045ec61..7d064ad 100644 --- a/workflow/scripts/fill_all_sites.R +++ b/workflow/scripts/fill_all_sites.R @@ -1,6 +1,4 @@ #!/usr/bin/env Rscript - -# Write stdout and stderr to log file log <- file(snakemake@log[[1]], open = "wt") sink(log, type = "message") sink(log, type = "output") @@ -15,10 +13,18 @@ log_threshold(INFO) log_info("Reading variants") variants <- read_tsv(snakemake@input[["variants"]]) -# Create a mapping of variant names to their genomic position -variant_coords <- variants %>% - select(VARIANT_NAME, CHROM, POS) %>% - distinct() +# Check if each sample, variant and position have 1 frequency +variants %>% + distinct(VARIANT_NAME, CHROM, POS, SAMPLE, ALT_FREQ) %>% + group_by(VARIANT_NAME, CHROM, POS, SAMPLE) %>% + filter(n() > 1) %>% + { + if (nrow(.) > 0) { + log_warn( + "Found {nrow(.)} ambiguous (SAMPLE, VARIANT_NAME, POS) combinations" + ) + } + } log_info("Reading filtered sites") sites <- read_tsv(snakemake@input[["sites"]]) %>% @@ -28,14 +34,9 @@ sites <- read_tsv(snakemake@input[["sites"]]) %>% log_info("Processing variants") all_variants <- variants %>% # Select minimal columns - distinct(VARIANT_NAME, CHROM, SAMPLE, ALT_FREQ) %>% - # Handle duplicates - group_by(SAMPLE, VARIANT_NAME, CHROM) %>% - summarise(ALT_FREQ = sum(ALT_FREQ, na.rm = TRUE), .groups = "drop") %>% + distinct(VARIANT_NAME, CHROM, POS, SAMPLE, ALT_FREQ) %>% # Complete with NA - complete(SAMPLE, VARIANT_NAME, CHROM) %>% - # Assign genomic positions for all combinations - left_join(variant_coords, by = c("CHROM", "VARIANT_NAME")) %>% + complete(SAMPLE, nesting(VARIANT_NAME, CHROM, POS)) %>% # Merge filtered sites # TODO: consider region/chrom left_join(sites, by = c("SAMPLE", "POS")) %>% diff --git a/workflow/scripts/report/pairwise_trajectory_correlation_data.R b/workflow/scripts/report/pairwise_trajectory_correlation_data.R index 8ca4cf1..fc92f86 100644 --- a/workflow/scripts/report/pairwise_trajectory_correlation_data.R +++ b/workflow/scripts/report/pairwise_trajectory_correlation_data.R @@ -1,6 +1,4 @@ #!/usr/bin/env Rscript - -# Write stdout and stderr to log file log <- file(snakemake@log[[1]], open = "wt") sink(log, type = "message") sink(log, type = "output") @@ -17,7 +15,7 @@ log_threshold(INFO) log_info("Reading variants") variants <- read_tsv(snakemake@input[["variants"]]) -# Obtain sample names ordered by CollectionDate +log_info("Sorting dates") date_order <- read_csv(snakemake@input[["metadata"]]) %>% arrange(CollectionDate) %>% pull(ID) %>% @@ -25,6 +23,14 @@ date_order <- read_csv(snakemake@input[["metadata"]]) %>% log_info("Formatting variants") all_variants_wider <- variants %>% + # Collapse positions (treat as uncertain if filter is inconsistent) + group_by(SAMPLE, VARIANT_NAME) %>% + summarise( + ALT_FREQ = ifelse(n_distinct(ALT_FREQ, na.rm = TRUE) > 1, NA, first(ALT_FREQ)), + FILTER_PASS = ifelse(n_distinct(FILTER_PASS) > 1, NA, first(FILTER_PASS)), + .groups = "drop" + ) %>% + mutate(ALT_FREQ = if_else(is.na(FILTER_PASS), NA, ALT_FREQ)) %>% distinct(SAMPLE, VARIANT_NAME, ALT_FREQ) %>% pivot_wider( names_from = VARIANT_NAME,