shared/scripts/assign-colors at main · nextstrain/shared · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python3
"""
Generate colors.tsv for augur export based on ordering, color schemes, and
traits that exist in the metadata.
"""
import argparse
import pandas as pd

def _read_ordering(path):
    """Parse the color ordering TSV into a dict of field name -> list of traits.

    Each useful line holds exactly two tab-separated columns: the field name
    and one trait of that field. Traits are kept in file order. Blank lines
    and lines starting with '#' are skipped; lines with any other number of
    columns are reported with a warning and skipped.
    """
    assignment = {}
    with open(path) as f:
        for line in f:
            array = line.strip().split("\t")
            # Ignore empty lines or commented lines
            if not array[0] or array[0].startswith('#'):
                continue
            # Warn about (and ignore) lines that do not have exactly 2 columns
            if len(array) != 2:
                print(f"WARNING: Could not decode color ordering line: {line}")
                continue
            name, trait = array
            assignment.setdefault(name, []).append(trait)
    return assignment


def _restrict_to_metadata(assignment, metadata_path):
    """Return a copy of `assignment` limited to traits present in the metadata.

    Fields that are not metadata columns are passed through unchanged. If the
    metadata has a 'focal' column that pandas reads as boolean, only rows
    where 'focal' is True are considered.
    """
    # Traits from the ordering file are always strings, so the columns they
    # are compared against must be read as text. Letting pandas infer a
    # numeric dtype (e.g. for years) would make every comparison fail and
    # silently drop all traits of that field. 'focal' is left to type
    # inference because the boolean check below depends on it.
    header = set(pd.read_csv(metadata_path, delimiter='\t', nrows=0).columns)
    text_columns = {
        name: str
        for name in assignment
        if name in header and name != 'focal'
    }
    metadata = pd.read_csv(metadata_path, delimiter='\t', dtype=text_columns or None)

    rows = metadata
    if 'focal' in metadata and metadata['focal'].dtype == 'bool':
        rows = metadata.loc[metadata['focal']]

    restricted = {}
    for name, traits in assignment.items():
        if name in rows:
            # Build the lookup once per field rather than once per trait.
            present = set(rows[name].dropna())
            traits = [x for x in traits if x in present]
        restricted[name] = traits
    return restricted


def _read_color_schemes(path):
    """Read the color schemes file into a dict of line number -> list of colors.

    Line numbers start at 1. The scheme on line N is the one later used for a
    field with N traits, so the position of each line matters and interior
    lines are kept as they are. Trailing blank lines are dropped: they cannot
    be schemes and would otherwise be looked up as schemes with a single
    empty color.
    """
    with open(path) as f:
        lines = f.readlines()
    while lines and not lines[-1].strip():
        lines.pop()
    return {
        number: line.strip().split("\t")
        for number, line in enumerate(lines, start=1)
    }


def _pick_colors(schemes, count):
    """Return the list of colors to use for a field with `count` traits.

    When a scheme exists for `count` it is returned directly. Otherwise colors
    are reused: the last scheme is repeated while more than `len(schemes)`
    traits remain, and the scheme matching the remainder fills the rest.
    `schemes` must not be empty.
    """
    largest = len(schemes)
    if count <= largest:
        return schemes[count]

    colors = []
    remain = count
    while remain > 0:
        if remain > largest:
            colors.extend(schemes[largest])
            remain -= largest
        else:
            colors.extend(schemes[remain])
            remain = 0
    return colors


def main(argv=None):
    """Command-line entry point: write a colors TSV for augur export.

    `argv` is the list of command-line arguments to parse; when None,
    argparse falls back to `sys.argv[1:]`. Exits with an error message if
    colors are needed but the color schemes file contains no schemes.
    """
    parser = argparse.ArgumentParser(
        description="Assign colors based on defined ordering of traits.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument('--ordering', type=str, required=True,
        help="""Input TSV file defining the color ordering where the first
        column is the field and the second column is the trait in that field.
        Blank lines are ignored. Lines starting with '#' will be ignored as comments.""")
    parser.add_argument('--color-schemes', type=str, required=True,
        help="Input color schemes where each line is a different color scheme separated by tabs.")
    parser.add_argument('--metadata', type=str,
        help="""If provided, restrict colors to only those traits found in
        metadata. If the metadata includes a 'focal' column that only contains
        boolean values, then restrict colors to traits for rows where 'focal'
        is set to True.""")
    parser.add_argument('--output', type=str, required=True,
        help="Output colors TSV file to be passed to augur export.")
    args = parser.parse_args(argv)

    assignment = _read_ordering(args.ordering)

    # If metadata is supplied, drop traits that do not occur in it
    # (or do not occur in its focal rows).
    if args.metadata:
        assignment = _restrict_to_metadata(assignment, args.metadata)

    schemes = _read_color_schemes(args.color_schemes)

    # Fail with a clear message before touching the output file, instead of
    # hitting a KeyError halfway through writing it.
    if not schemes and any(assignment.values()):
        raise SystemExit(f"ERROR: no color schemes found in {args.color_schemes}")

    with open(args.output, 'w') as f:
        for trait_name, trait_array in assignment.items():
            if not trait_array:
                print(f"No traits found for {trait_name}")
                continue
            if len(schemes) < len(trait_array):
                print(f"WARNING: insufficient colours available for trait {trait_name} - reusing colours!")
            color_array = _pick_colors(schemes, len(trait_array))

            for trait_value, color in zip(trait_array, color_array):
                f.write(trait_name + "\t" + trait_value + "\t" + color + "\n")
            # Blank line separates the fields in the output
            f.write("\n")


if __name__ == '__main__':
    main()