Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,7 @@ if CALCULATE_GROUPS_OVERRIDE_JSON:
if [[ "{params.use_mirror}" == "True" ]]; then
curl {params.mirror_url} -o {params.compressed_package_tzst} && \
mkdir -p {params.package} && \
tar xvf {params.compressed_package_tzst} -C {params.package}
tar xf {params.compressed_package_tzst} -C {params.package}
else
datasets download genome taxon {params.taxon_id} \
--assembly-source {wildcards.source} \
Expand Down
87 changes: 64 additions & 23 deletions kubernetes/loculus/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1406,21 +1406,6 @@ defaultOrganismConfig: &defaultOrganismConfig
label: Length
preprocessing:
inputs: {input: nextclade.coverage}
- name: variant
isSequenceFilter: true
perSegment: true
header: "Alignment and QC metrics"
displayName: "Variant"
type: boolean
noInput: true
autocomplete: true
initiallyVisible: false
includeInDownloadsByDefault: false
preprocessing:
function: is_above_threshold
args:
threshold: 50
inputs: {input: "nextclade.privateNucMutations.totalPrivateSubstitutions"}
website: &website
tableColumns:
- sampleCollectionDate
Expand Down Expand Up @@ -1614,6 +1599,21 @@ defaultOrganisms:
includeInDownloadsByDefault: false
preprocessing:
inputs: {input: "nextclade.cladeFounderInfo.aaMutations.*.privateSubstitutions"}
- name: variant
isSequenceFilter: true
perSegment: true
header: "Alignment and QC metrics"
displayName: "Variant"
type: boolean
noInput: true
autocomplete: true
initiallyVisible: false
includeInDownloadsByDefault: false
preprocessing:
function: is_variant
args:
mu: 0.002
inputs: {numMutations: "nextclade.privateNucMutations.totalPrivateSubstitutions", length: processed.length}
website:
<<: *website
tableColumns:
Expand Down Expand Up @@ -2089,26 +2089,67 @@ defaultOrganisms:
header: "Host"
ingest: ncbiHostName
initiallyVisible: true
- name: variant
- name: variant_L
isSequenceFilter: true
perSegment: true
header: "Clade & Lineage"
oneHeader: true
displayName: "Variant"
displayName: "Variant L"
Comment thread
anna-parker marked this conversation as resolved.
relatesToSegment: L
type: boolean
noInput: true
autocomplete: true
initiallyVisible: false
includeInDownloadsByDefault: false
customDisplay:
type: variantReference
displayGroup: reference
label: Closest reference
displayGroup: reference_L
label: Closest reference L
preprocessing:
function: is_variant
args:
mu: 0.004
inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_L}
- name: variant_M
isSequenceFilter: true
header: "Clade & Lineage"
oneHeader: true
displayName: "Variant M"
relatesToSegment: M
type: boolean
noInput: true
autocomplete: true
initiallyVisible: false
includeInDownloadsByDefault: false
customDisplay:
type: variantReference
displayGroup: reference_M
label: Closest reference M
preprocessing:
function: is_variant
args:
mu: 0.008
inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_M}
- name: variant_S
isSequenceFilter: true
header: "Clade & Lineage"
oneHeader: true
displayName: "Variant S"
relatesToSegment: S
type: boolean
noInput: true
autocomplete: true
initiallyVisible: false
includeInDownloadsByDefault: false
customDisplay:
type: variantReference
displayGroup: reference_S
label: Closest reference S
preprocessing:
function: is_above_threshold
function: is_variant
args:
threshold: 1000
inputs: {input: "nextclade.totalSubstitutions"} #custom nextclade dataset does not have private mutations, so using total substitutions as a proxy for distance from reference
mu: 0.004
# custom nextclade dataset does not have private mutations, so using total substitutions as a proxy for distance from reference
inputs: {numMutations: "nextclade.totalSubstitutions", length: processed.length_S}
- name: reference
Comment thread
anna-parker marked this conversation as resolved.
oneHeader: true
header: "Clade & Lineage"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1250,6 +1250,154 @@ def is_above_threshold(
)
return ProcessingResult(datum=(input > threshold), warnings=[], errors=[])

@staticmethod
def is_variant(
input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs
) -> ProcessingResult:
"""Flag if number of mutations is above mutation rate (specified in args) times length"""
if "mu" not in args:
return ProcessingResult(
datum=None,
warnings=[],
errors=[
ProcessingAnnotation.from_fields(
input_fields,
[output_field],
AnnotationSourceType.METADATA,
message=(
f"Field {output_field} is missing mu argument."
" Please report this error to the administrator."
),
)
],
)
length_datum = input_data.get("length")
num_mutations_datum = input_data.get("numMutations")
if length_datum is None or num_mutations_datum is None:
return ProcessingResult(datum=None, warnings=[], errors=[])
try:
mu = float(args["mu"]) # type: ignore
length = float(length_datum)
threshold = mu * length
is_above_threshold_result = ProcessingFunctions.is_above_threshold(
input_data={"input": num_mutations_datum},
output_field=output_field,
input_fields=input_fields,
args={"threshold": threshold},
)
except (ValueError, TypeError):
return ProcessingResult(
datum=None,
warnings=[],
errors=[
ProcessingAnnotation.from_fields(
input_fields,
[output_field],
AnnotationSourceType.METADATA,
message=(
f"Field {output_field} has non-numeric length or numMutations value."
),
)
],
)
return ProcessingResult(
datum=is_above_threshold_result.datum,
warnings=is_above_threshold_result.warnings,
errors=is_above_threshold_result.errors,
)

@staticmethod
def assign_custom_lineage( # noqa: C901
input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs
) -> ProcessingResult:
"""
Assign flu lineage based on seg4 and seg6.
Add reassortant flag if different lineages are detected for internal segments,
add and variant flag if any segment is a variant.
It expects the following input_data fields to be present:
- subtype_seg4: the subtype assigned to seg4
- subtype_seg6: the subtype assigned to seg6
- reference_seg1,...,reference_seg8: the reference sequence assigned to each segment
- variant_seg1,...,variant_seg8: boolean flag indicating whether a segment is a variant
It expects the following args to be present:
- pattern: regex pattern to extract lineage from reference
e.g. ^(?P<segment>[^-]+)-(?P<lineage>[^-]+)$
- uppercase: boolean flag indicating whether to uppercase the extracted lineage
- capture_group: the name of the capture group in the regex pattern to extract
e.g. "lineage"
"""
logger.debug(
f"Starting custom lineage assignment with input_data: {input_data} and args: {args}"
)
if not input_data:
return ProcessingResult(datum=None, warnings=[], errors=[])
ha_subtype = input_data.get("subtype_seg4")
na_subtype = input_data.get("subtype_seg6")
references: dict[str, str | None] = {}
extracted_lineages: dict[str, str | None] = {}
variant: dict[str, bool | None] = {}
for i in range(1, 9):
segment = f"seg{i}"
reference_field = f"reference_seg{i}"
variant_field = f"variant_seg{i}"
if reference_field in input_data:
references[segment] = input_data.get(reference_field)
variant[segment] = (
bool(input_data.get(variant_field)) if variant_field in input_data else None
)
try:
for i in range(1, 9):
segment = f"seg{i}"
extracted_lineages[segment] = ProcessingFunctions.call_function( # type: ignore
"extract_regex",
{
"pattern": args["pattern"],
"uppercase": args["uppercase"],
"capture_group": args["capture_group"],
},
{"regex_field": references.get(segment, "")},
"output_field",
["segment_name"],
).datum
logger.debug(f"Extracted lineages: {extracted_lineages} from references: {references}")
if not ha_subtype or not na_subtype:
return ProcessingResult(datum=None, warnings=[], errors=[])
lineage = f"{ha_subtype}{na_subtype}"
if (
extracted_lineages.get("seg4") == "H1N1PDM"
and extracted_lineages.get("seg6") == "H1N1PDM"
):
lineage = "H1N1pdm"
logger.debug(
f"Determined preliminary lineage {lineage} based on segments seg4 and seg6"
)
if lineage in {"H1N1", "H3N2", "H2N2", "H1N1pdm"}:
logger.debug(
f"Lineage {lineage} is a human lineage, checking for reassortment and variants"
)
# only assign human lineages
if len({v for v in extracted_lineages.values() if v is not None}) > 1:
lineage += " reassortant"
if any(v for v in variant.values() if v):
lineage += " (variant)"
return ProcessingResult(datum=lineage, warnings=[], errors=[])
except (ValueError, TypeError):
return ProcessingResult(
datum=None,
warnings=[],
errors=[
ProcessingAnnotation.from_fields(
input_fields,
[output_field],
AnnotationSourceType.METADATA,
message=(
f"Internal error processing custom lineage for field {output_field}."
),
)
],
)
return ProcessingResult(datum=None, warnings=[], errors=[])

@staticmethod
def build_display_name( # noqa: C901
input_data: InputMetadata,
Expand Down
Loading
Loading