Skip to content

Commit b9fee30

Browse files
authored
fix: collapse multi-site variant names (#42)
1 parent de3c21c commit b9fee30

File tree

2 files changed

+23
-16
lines changed

2 files changed

+23
-16
lines changed

workflow/scripts/fill_all_sites.R

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
#!/usr/bin/env Rscript
2-
3-
# Write stdout and stderr to log file
42
log <- file(snakemake@log[[1]], open = "wt")
53
sink(log, type = "message")
64
sink(log, type = "output")
@@ -15,10 +13,18 @@ log_threshold(INFO)
1513
log_info("Reading variants")
1614
variants <- read_tsv(snakemake@input[["variants"]])
1715

18-
# Create a mapping of variant names to their genomic position
19-
variant_coords <- variants %>%
20-
select(VARIANT_NAME, CHROM, POS) %>%
21-
distinct()
16+
# Check if each sample, variant and position have 1 frequency
17+
variants %>%
18+
distinct(VARIANT_NAME, CHROM, POS, SAMPLE, ALT_FREQ) %>%
19+
group_by(VARIANT_NAME, CHROM, POS, SAMPLE) %>%
20+
filter(n() > 1) %>%
21+
{
22+
if (nrow(.) > 0) {
23+
log_warn(
24+
"Found {nrow(.)} ambiguous (SAMPLE, VARIANT_NAME, POS) combinations"
25+
)
26+
}
27+
}
2228

2329
log_info("Reading filtered sites")
2430
sites <- read_tsv(snakemake@input[["sites"]]) %>%
@@ -28,14 +34,9 @@ sites <- read_tsv(snakemake@input[["sites"]]) %>%
2834
log_info("Processing variants")
2935
all_variants <- variants %>%
3036
# Select minimal columns
31-
distinct(VARIANT_NAME, CHROM, SAMPLE, ALT_FREQ) %>%
32-
# Handle duplicates
33-
group_by(SAMPLE, VARIANT_NAME, CHROM) %>%
34-
summarise(ALT_FREQ = sum(ALT_FREQ, na.rm = TRUE), .groups = "drop") %>%
37+
distinct(VARIANT_NAME, CHROM, POS, SAMPLE, ALT_FREQ) %>%
3538
# Complete with NA
36-
complete(SAMPLE, VARIANT_NAME, CHROM) %>%
37-
# Assign genomic positions for all combinations
38-
left_join(variant_coords, by = c("CHROM", "VARIANT_NAME")) %>%
39+
complete(SAMPLE, nesting(VARIANT_NAME, CHROM, POS)) %>%
3940
# Merge filtered sites
4041
# TODO: consider region/chrom
4142
left_join(sites, by = c("SAMPLE", "POS")) %>%

workflow/scripts/report/pairwise_trajectory_correlation_data.R

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
#!/usr/bin/env Rscript
2-
3-
# Write stdout and stderr to log file
42
log <- file(snakemake@log[[1]], open = "wt")
53
sink(log, type = "message")
64
sink(log, type = "output")
@@ -17,14 +15,22 @@ log_threshold(INFO)
1715
log_info("Reading variants")
1816
variants <- read_tsv(snakemake@input[["variants"]])
1917

20-
# Obtain sample names ordered by CollectionDate
18+
log_info("Sorting dates")
2119
date_order <- read_csv(snakemake@input[["metadata"]]) %>%
2220
arrange(CollectionDate) %>%
2321
pull(ID) %>%
2422
unique()
2523

2624
log_info("Formatting variants")
2725
all_variants_wider <- variants %>%
26+
# Collapse positions (treat as uncertain if filter is inconsistent)
27+
group_by(SAMPLE, VARIANT_NAME) %>%
28+
summarise(
29+
ALT_FREQ = ifelse(n_distinct(ALT_FREQ, na.rm = TRUE) > 1, NA, first(ALT_FREQ)),
30+
FILTER_PASS = ifelse(n_distinct(FILTER_PASS) > 1, NA, first(FILTER_PASS)),
31+
.groups = "drop"
32+
) %>%
33+
mutate(ALT_FREQ = if_else(is.na(FILTER_PASS), NA, ALT_FREQ)) %>%
2834
distinct(SAMPLE, VARIANT_NAME, ALT_FREQ) %>%
2935
pivot_wider(
3036
names_from = VARIANT_NAME,

0 commit comments

Comments
 (0)