{
"level": "info",
"event": {
"EventID": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064",
"Kind": "ASG_LAUNCH_LIFECYCLE",
"Monitor": "SQS_MONITOR",
"Description": "ASG Lifecycle Launch event received. Instance was started at 2025-10-28 15:43:14.87 +0000 UTC \n",
"State": "",
"AutoScalingGroupName": "on-demand-huge-workers-01-2024112121084029580000002c",
"NodeName": "ip-10-31-16-42.eu-west-1.compute.internal",
"NodeLabels": null,
"Pods": null,
"InstanceID": "i-004eafb13e0bbb439",
"ProviderID": "aws:///eu-west-1a/i-004eafb13e0bbb439",
"InstanceType": "",
"IsManaged": true,
"StartTime": "2025-10-28T15:43:14.87Z",
"EndTime": "0001-01-01T00:00:00Z",
"NodeProcessed": false,
"InProgress": false
},
"time": "2025-10-28T15:43:15Z",
"message": "Adding new event to the event store"
}
{
"level": "info",
"event-id": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064",
"kind": "ASG_LAUNCH_LIFECYCLE",
"node-name": "ip-10-31-16-42.eu-west-1.compute.internal",
"instance-id": "i-004eafb13e0bbb439",
"provider-id": "aws:///eu-west-1a/i-004eafb13e0bbb439",
"time": "2025-10-28T15:43:15Z",
"message": "Requesting instance drain"
}
{
"level": "info",
"instanceID": "i-004eafb13e0bbb439",
"time": "2025-10-28T15:43:15Z",
"message": "EC2 instance not found in Kubernetes cluster - waiting for instance to join as a node"
}
{
"level": "info",
"instanceID": "i-004eafb13e0bbb439",
"eventID": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064",
"time": "2025-10-28T15:43:15Z",
"message": "Node not ready - canceling launch lifecycle event (ASG will retry or timeout)"
}
{
"level": "info",
"event": {
"EventID": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064",
"Kind": "ASG_LAUNCH_LIFECYCLE",
"Monitor": "SQS_MONITOR",
"Description": "ASG Lifecycle Launch event received. Instance was started at 2025-10-28 15:43:14.87 +0000 UTC \n",
"State": "",
"AutoScalingGroupName": "on-demand-huge-workers-01-2024112121084029580000002c",
"NodeName": "ip-10-31-16-42.eu-west-1.compute.internal",
"NodeLabels": null,
"Pods": null,
"InstanceID": "i-004eafb13e0bbb439",
"ProviderID": "aws:///eu-west-1a/i-004eafb13e0bbb439",
"InstanceType": "",
"IsManaged": true,
"StartTime": "2025-10-28T15:43:14.87Z",
"EndTime": "0001-01-01T00:00:00Z",
"NodeProcessed": false,
"InProgress": false
},
"time": "2025-10-28T15:43:35Z",
"message": "Adding new event to the event store"
}
{
"level": "info",
"event-id": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064",
"kind": "ASG_LAUNCH_LIFECYCLE",
"node-name": "ip-10-31-16-42.eu-west-1.compute.internal",
"instance-id": "i-004eafb13e0bbb439",
"provider-id": "aws:///eu-west-1a/i-004eafb13e0bbb439",
"time": "2025-10-28T15:43:35Z",
"message": "Requesting instance drain"
}
{
"level": "info",
"instanceID": "i-004eafb13e0bbb439",
"time": "2025-10-28T15:43:36Z",
"message": "EC2 instance not found in Kubernetes cluster - waiting for instance to join as a node"
}
{
"level": "info",
"instanceID": "i-004eafb13e0bbb439",
"eventID": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064",
"time": "2025-10-28T15:43:36Z",
"message": "Node not ready - canceling launch lifecycle event (ASG will retry or timeout)"
}
{
"level": "info",
"event": {
"EventID": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064",
"Kind": "ASG_LAUNCH_LIFECYCLE",
"Monitor": "SQS_MONITOR",
"Description": "ASG Lifecycle Launch event received. Instance was started at 2025-10-28 15:43:14.87 +0000 UTC \n",
"State": "",
"AutoScalingGroupName": "on-demand-huge-workers-01-2024112121084029580000002c",
"NodeName": "ip-10-31-16-42.eu-west-1.compute.internal",
"NodeLabels": null,
"Pods": null,
"InstanceID": "i-004eafb13e0bbb439",
"ProviderID": "aws:///eu-west-1a/i-004eafb13e0bbb439",
"InstanceType": "",
"IsManaged": true,
"StartTime": "2025-10-28T15:43:14.87Z",
"EndTime": "0001-01-01T00:00:00Z",
"NodeProcessed": false,
"InProgress": false
},
"time": "2025-10-28T15:43:55Z",
"message": "Adding new event to the event store"
}
{
"level": "info",
"event-id": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064",
"kind": "ASG_LAUNCH_LIFECYCLE",
"node-name": "ip-10-31-16-42.eu-west-1.compute.internal",
"instance-id": "i-004eafb13e0bbb439",
"provider-id": "aws:///eu-west-1a/i-004eafb13e0bbb439",
"time": "2025-10-28T15:43:55Z",
"message": "Requesting instance drain"
}
{
"level": "info",
"instanceID": "i-004eafb13e0bbb439",
"nodeName": "ip-10-31-16-42.eu-west-1.compute.internal",
"readyStatus": "False",
"readyReason": "KubeletNotReady",
"readyMessage": "container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: cni plugin not initialized",
"time": "2025-10-28T15:43:56Z",
"message": "EC2 instance found in cluster but node not ready - waiting for ready status"
}
{
"level": "info",
"instanceID": "i-004eafb13e0bbb439",
"eventID": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064",
"time": "2025-10-28T15:43:56Z",
"message": "Node not ready - canceling launch lifecycle event (ASG will retry or timeout)"
}
{
"level": "info",
"event": {
"EventID": "asg-lifecycle-term-36306636363861652d313831622d656261632d653565342d303132353936316261396332",
"Kind": "ASG_LIFECYCLE",
"Monitor": "SQS_MONITOR",
"Description": "ASG Lifecycle Termination event received. Instance will be interrupted at 2025-10-28 15:53:15.796 +0000 UTC \n",
"State": "",
"AutoScalingGroupName": "on-demand-huge-workers-01-2024112121084029580000002c",
"NodeName": "ip-10-31-16-42.eu-west-1.compute.internal",
"NodeLabels": null,
"Pods": null,
"InstanceID": "i-004eafb13e0bbb439",
"ProviderID": "aws:///eu-west-1a/i-004eafb13e0bbb439",
"InstanceType": "r6a.xlarge",
"IsManaged": true,
"StartTime": "2025-10-28T15:53:15.796Z",
"EndTime": "0001-01-01T00:00:00Z",
"NodeProcessed": false,
"InProgress": false
},
"time": "2025-10-28T15:53:15Z",
"message": "Adding new event to the event store"
}
{
"level": "info",
"event-id": "asg-lifecycle-term-36306636363861652d313831622d656261632d653565342d303132353936316261396332",
"kind": "ASG_LIFECYCLE",
"node-name": "ip-10-31-16-42.eu-west-1.compute.internal",
"instance-id": "i-004eafb13e0bbb439",
"provider-id": "aws:///eu-west-1a/i-004eafb13e0bbb439",
"time": "2025-10-28T15:53:16Z",
"message": "Requesting instance drain"
}
{
"level": "info",
"lifecycleHookName": "nth-termination-request",
"instanceID": "i-004eafb13e0bbb439",
"time": "2025-10-28T15:53:20Z",
"message": "Completed ASG Lifecycle Hook"
}
Describe the bug
We use ASG instance launch lifecycle hook and noticed NTH logs node readiness checks only 3 times for the first ~40 seconds (since initial
ASG_LAUNCH_LIFECYCLEmessage is received), before the node becomes ready, and then it no longer checks.Even though node reaches
Readystatus (soon after last NTH check), ultimately the lifecycle hook heartbeat timeout happens and the ASG has to terminate the instance (hook default action isABANDON).This seems to be similar to the report in #1007.
Steps to reproduce
ABANDONdefault action)Note
The logs below were retrieved running a local build with this small patch applied, to have more insights:
patch git diff (click to expand)
Expected outcome
NTH should retry checking node readiness without cancelling the event, allowing SQS retries to continue until the node becomes Ready, then send CONTINUE to complete the lifecycle hook successfully.
Application Logs
k logs aws-nth-6668f87c7b-brdvp | rg i-004eafb13e0bbb43 (click to expand)
{ "level": "info", "event": { "EventID": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064", "Kind": "ASG_LAUNCH_LIFECYCLE", "Monitor": "SQS_MONITOR", "Description": "ASG Lifecycle Launch event received. Instance was started at 2025-10-28 15:43:14.87 +0000 UTC \n", "State": "", "AutoScalingGroupName": "on-demand-huge-workers-01-2024112121084029580000002c", "NodeName": "ip-10-31-16-42.eu-west-1.compute.internal", "NodeLabels": null, "Pods": null, "InstanceID": "i-004eafb13e0bbb439", "ProviderID": "aws:///eu-west-1a/i-004eafb13e0bbb439", "InstanceType": "", "IsManaged": true, "StartTime": "2025-10-28T15:43:14.87Z", "EndTime": "0001-01-01T00:00:00Z", "NodeProcessed": false, "InProgress": false }, "time": "2025-10-28T15:43:15Z", "message": "Adding new event to the event store" } { "level": "info", "event-id": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064", "kind": "ASG_LAUNCH_LIFECYCLE", "node-name": "ip-10-31-16-42.eu-west-1.compute.internal", "instance-id": "i-004eafb13e0bbb439", "provider-id": "aws:///eu-west-1a/i-004eafb13e0bbb439", "time": "2025-10-28T15:43:15Z", "message": "Requesting instance drain" } { "level": "info", "instanceID": "i-004eafb13e0bbb439", "time": "2025-10-28T15:43:15Z", "message": "EC2 instance not found in Kubernetes cluster - waiting for instance to join as a node" } { "level": "info", "instanceID": "i-004eafb13e0bbb439", "eventID": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064", "time": "2025-10-28T15:43:15Z", "message": "Node not ready - canceling launch lifecycle event (ASG will retry or timeout)" } { "level": "info", "event": { "EventID": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064", "Kind": "ASG_LAUNCH_LIFECYCLE", "Monitor": "SQS_MONITOR", "Description": "ASG Lifecycle Launch event received. Instance was started at 2025-10-28 15:43:14.87 +0000 UTC \n", "State": "", "AutoScalingGroupName": "on-demand-huge-workers-01-2024112121084029580000002c", "NodeName": "ip-10-31-16-42.eu-west-1.compute.internal", "NodeLabels": null, "Pods": null, "InstanceID": "i-004eafb13e0bbb439", "ProviderID": "aws:///eu-west-1a/i-004eafb13e0bbb439", "InstanceType": "", "IsManaged": true, "StartTime": "2025-10-28T15:43:14.87Z", "EndTime": "0001-01-01T00:00:00Z", "NodeProcessed": false, "InProgress": false }, "time": "2025-10-28T15:43:35Z", "message": "Adding new event to the event store" } { "level": "info", "event-id": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064", "kind": "ASG_LAUNCH_LIFECYCLE", "node-name": "ip-10-31-16-42.eu-west-1.compute.internal", "instance-id": "i-004eafb13e0bbb439", "provider-id": "aws:///eu-west-1a/i-004eafb13e0bbb439", "time": "2025-10-28T15:43:35Z", "message": "Requesting instance drain" } { "level": "info", "instanceID": "i-004eafb13e0bbb439", "time": "2025-10-28T15:43:36Z", "message": "EC2 instance not found in Kubernetes cluster - waiting for instance to join as a node" } { "level": "info", "instanceID": "i-004eafb13e0bbb439", "eventID": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064", "time": "2025-10-28T15:43:36Z", "message": "Node not ready - canceling launch lifecycle event (ASG will retry or timeout)" } { "level": "info", "event": { "EventID": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064", "Kind": "ASG_LAUNCH_LIFECYCLE", "Monitor": "SQS_MONITOR", "Description": "ASG Lifecycle Launch event received. Instance was started at 2025-10-28 15:43:14.87 +0000 UTC \n", "State": "", "AutoScalingGroupName": "on-demand-huge-workers-01-2024112121084029580000002c", "NodeName": "ip-10-31-16-42.eu-west-1.compute.internal", "NodeLabels": null, "Pods": null, "InstanceID": "i-004eafb13e0bbb439", "ProviderID": "aws:///eu-west-1a/i-004eafb13e0bbb439", "InstanceType": "", "IsManaged": true, "StartTime": "2025-10-28T15:43:14.87Z", "EndTime": "0001-01-01T00:00:00Z", "NodeProcessed": false, "InProgress": false }, "time": "2025-10-28T15:43:55Z", "message": "Adding new event to the event store" } { "level": "info", "event-id": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064", "kind": "ASG_LAUNCH_LIFECYCLE", "node-name": "ip-10-31-16-42.eu-west-1.compute.internal", "instance-id": "i-004eafb13e0bbb439", "provider-id": "aws:///eu-west-1a/i-004eafb13e0bbb439", "time": "2025-10-28T15:43:55Z", "message": "Requesting instance drain" } { "level": "info", "instanceID": "i-004eafb13e0bbb439", "nodeName": "ip-10-31-16-42.eu-west-1.compute.internal", "readyStatus": "False", "readyReason": "KubeletNotReady", "readyMessage": "container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: cni plugin not initialized", "time": "2025-10-28T15:43:56Z", "message": "EC2 instance found in cluster but node not ready - waiting for ready status" } { "level": "info", "instanceID": "i-004eafb13e0bbb439", "eventID": "asg-lifecycle-term-64373436363861642d663331612d663838382d396561622d633831623861656561373064", "time": "2025-10-28T15:43:56Z", "message": "Node not ready - canceling launch lifecycle event (ASG will retry or timeout)" } { "level": "info", "event": { "EventID": "asg-lifecycle-term-36306636363861652d313831622d656261632d653565342d303132353936316261396332", "Kind": "ASG_LIFECYCLE", "Monitor": "SQS_MONITOR", "Description": "ASG Lifecycle Termination event received. Instance will be interrupted at 2025-10-28 15:53:15.796 +0000 UTC \n", "State": "", "AutoScalingGroupName": "on-demand-huge-workers-01-2024112121084029580000002c", "NodeName": "ip-10-31-16-42.eu-west-1.compute.internal", "NodeLabels": null, "Pods": null, "InstanceID": "i-004eafb13e0bbb439", "ProviderID": "aws:///eu-west-1a/i-004eafb13e0bbb439", "InstanceType": "r6a.xlarge", "IsManaged": true, "StartTime": "2025-10-28T15:53:15.796Z", "EndTime": "0001-01-01T00:00:00Z", "NodeProcessed": false, "InProgress": false }, "time": "2025-10-28T15:53:15Z", "message": "Adding new event to the event store" } { "level": "info", "event-id": "asg-lifecycle-term-36306636363861652d313831622d656261632d653565342d303132353936316261396332", "kind": "ASG_LIFECYCLE", "node-name": "ip-10-31-16-42.eu-west-1.compute.internal", "instance-id": "i-004eafb13e0bbb439", "provider-id": "aws:///eu-west-1a/i-004eafb13e0bbb439", "time": "2025-10-28T15:53:16Z", "message": "Requesting instance drain" } { "level": "info", "lifecycleHookName": "nth-termination-request", "instanceID": "i-004eafb13e0bbb439", "time": "2025-10-28T15:53:20Z", "message": "Completed ASG Lifecycle Hook" }k8s node ip-10-31-16-42.eu-west-1.compute.internal events (click to expand)
From K8s events timeline is:
15:43:38Z - Node starts joining and gets various status updates
15:43:44Z - Node synced successfully
15:43:59Z - Node became ready
15:44:00Z - Pods start getting scheduled to the node
But looking at the NTH logs:
15:43:15Z - First NTH check shows no node in K8s for the EC2 instance
15:43:36Z - Second NTH check shows same
15:43:56Z - Last NTH check shows EC2 as node "not ready" with CNI issues
15:53:15Z - Next log is the termination event (10 minutes later - lifecycle hook heartbeat timeout)
Somehow, the node was ready at 15:43:59Z, but NTH never checked again after 15:43:56Z, until the ASG lifecycle hook eventually times out.
Environment