11# !/usr/bin/env Rscript
2-
3- # Write stdout and stderr to log file
42log <- file(snakemake @ log [[1 ]], open = " wt" )
53sink(log , type = " message" )
64sink(log , type = " output" )
@@ -15,10 +13,18 @@ log_threshold(INFO)
1513log_info(" Reading variants" )
1614variants <- read_tsv(snakemake @ input [[" variants" ]])
1715
18- # Create a mapping of variant names to their genomic position
19- variant_coords <- variants %> %
20- select(VARIANT_NAME , CHROM , POS ) %> %
21- distinct()
16+ # Check if each sample, variant and position have 1 frequency
17+ variants %> %
18+ distinct(VARIANT_NAME , CHROM , POS , SAMPLE , ALT_FREQ ) %> %
19+ group_by(VARIANT_NAME , CHROM , POS , SAMPLE ) %> %
20+ filter(n() > 1 ) %> %
21+ {
22+ if (nrow(. ) > 0 ) {
23+ log_warn(
24+ " Found {nrow(.)} ambiguous (SAMPLE, VARIANT_NAME, POS) combinations"
25+ )
26+ }
27+ }
2228
2329log_info(" Reading filtered sites" )
2430sites <- read_tsv(snakemake @ input [[" sites" ]]) %> %
@@ -28,14 +34,9 @@ sites <- read_tsv(snakemake@input[["sites"]]) %>%
2834log_info(" Processing variants" )
2935all_variants <- variants %> %
3036 # Select minimal columns
31- distinct(VARIANT_NAME , CHROM , SAMPLE , ALT_FREQ ) %> %
32- # Handle duplicates
33- group_by(SAMPLE , VARIANT_NAME , CHROM ) %> %
34- summarise(ALT_FREQ = sum(ALT_FREQ , na.rm = TRUE ), .groups = " drop" ) %> %
37+ distinct(VARIANT_NAME , CHROM , POS , SAMPLE , ALT_FREQ ) %> %
3538 # Complete with NA
36- complete(SAMPLE , VARIANT_NAME , CHROM ) %> %
37- # Assign genomic positions for all combinations
38- left_join(variant_coords , by = c(" CHROM" , " VARIANT_NAME" )) %> %
39+ complete(SAMPLE , nesting(VARIANT_NAME , CHROM , POS )) %> %
3940 # Merge filtered sites
4041 # TODO: consider region/chrom
4142 left_join(sites , by = c(" SAMPLE" , " POS" )) %> %
0 commit comments