Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions kiloclaw/controller/src/routes/gateway.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,19 @@
import os from 'node:os';
import type { Hono } from 'hono';
import { timingSafeTokenEqual } from '../auth';
import type { Supervisor } from '../supervisor';

// A shared-cpu-2x machine gives ~6% of 2 physical cores, so even a modest
// load average represents heavy pressure. Once boot has completed, an idle
// system sits near 0. Using 0.1 as the cut-off ensures boot CPU work has
// fully subsided before we tell the frontend it's safe to proceed.
const LOAD_SETTLED_THRESHOLD = 0.1;

/**
 * Take a snapshot of the host load averages.
 *
 * @returns `loadAverage` — the array reported by `os.loadavg()` — and
 *   `settled`, which is true when the first entry (the 1-minute average)
 *   is strictly below `LOAD_SETTLED_THRESHOLD`.
 */
function loadFields(): { loadAverage: number[]; settled: boolean } {
  const samples = os.loadavg();
  const oneMinute = samples[0];
  const settled = oneMinute < LOAD_SETTLED_THRESHOLD;
  return { loadAverage: samples, settled };
}

export function getBearerToken(header: string | undefined): string | null {
if (!header) return null;
const [scheme, token] = header.split(/\s+/, 2);
Expand Down Expand Up @@ -58,6 +70,30 @@ export function registerGatewayRoutes(
}
});

// GET /_kilo/gateway/ready
// Proxies the gateway's own readiness probe on the loopback interface and
// merges the current load fields into whatever the gateway answered.
//   - 503 when the supervisor does not report the gateway as 'running'
//   - 200 / 503 mirroring whether the gateway's response was ok
//   - 502 when the gateway could not be reached (or did not answer in time)
app.get('/_kilo/gateway/ready', async c => {
  if (supervisor.getState() !== 'running') {
    return c.json({ ready: false, error: 'Gateway not running', ...loadFields() }, 503);
  }
  try {
    // Bound the upstream probe: without a signal, a wedged gateway would
    // keep this request (and every poll stacked behind it) hanging. A
    // timeout rejects the fetch / body read and lands in the 502 path below.
    const res = await fetch('http://127.0.0.1:3001/ready', {
      signal: AbortSignal.timeout(5000),
    });
    const body = await res.text();
    let json: unknown;
    try {
      json = JSON.parse(body);
    } catch {
      // Not JSON — pass the raw text through instead of failing the probe.
      json = { raw: body };
    }
    // Only plain objects are merged field-by-field. Arrays are excluded
    // explicitly: spreading one would produce numeric keys ("0", "1", …)
    // rather than a meaningful envelope, so they go under `raw` like any
    // other non-object JSON value. The load fields are spread last so they
    // take precedence over same-named keys from the gateway.
    const envelope =
      typeof json === 'object' && json !== null && !Array.isArray(json)
        ? { ...json, ...loadFields() }
        : { raw: json, ...loadFields() };
    return c.json(envelope, res.ok ? 200 : 503);
  } catch (error) {
    console.error('[controller] /_kilo/gateway/ready failed:', error);
    return c.json({ ready: false, error: 'Failed to reach gateway', ...loadFields() }, 502);
  }
});

app.post('/_kilo/gateway/restart', async c => {
try {
const restarted = await supervisor.restart();
Expand Down
16 changes: 16 additions & 0 deletions kiloclaw/src/durable-objects/gateway-controller-types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,22 @@ export const ControllerVersionResponseSchema = z.object({
openclawCommit: z.string().nullable().optional(),
});

/**
 * Shape of a controller health payload, as validated by
 * `ControllerHealthResponseSchema`.
 */
export type ControllerHealthResponse = {
// Always the literal 'ok'.
status: 'ok';
// One of four fixed state values.
state: 'bootstrapping' | 'starting' | 'ready' | 'degraded';
// Optional free-form string; presumably the current boot phase — TODO confirm with the producer.
phase?: string;
// Optional error description.
error?: string;
};

/**
 * Zod schema for `ControllerHealthResponse`.
 *
 * The explicit `ZodType<ControllerHealthResponse>` annotation ties the schema
 * to the hand-written type, so the two declarations are checked against each
 * other by the compiler instead of drifting apart.
 */
export const ControllerHealthResponseSchema: ZodType<ControllerHealthResponse> = z.object({
status: z.literal('ok'),
state: z.enum(['bootstrapping', 'starting', 'ready', 'degraded']),
phase: z.string().optional(),
error: z.string().optional(),
});

/**
 * Schema for the gateway readiness response: any record with string keys and
 * unvalidated (`unknown`) values. No individual field is required or checked.
 */
export const GatewayReadyResponseSchema = z.record(z.string(), z.unknown());

export const EnvPatchResponseSchema = z.object({
ok: z.boolean(),
signaled: z.boolean(),
Expand Down
27 changes: 27 additions & 0 deletions kiloclaw/src/durable-objects/kiloclaw-instance/gateway.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import {
GatewayCommandResponseSchema,
ConfigRestoreResponseSchema,
ControllerVersionResponseSchema,
GatewayReadyResponseSchema,
EnvPatchResponseSchema,
OpenclawConfigResponseSchema,
GatewayControllerError,
Expand Down Expand Up @@ -239,6 +240,32 @@ export async function getControllerVersion(
}
}

/**
 * Query the controller's `GET /_kilo/gateway/ready` endpoint.
 *
 * @returns the controller response as validated by `GatewayReadyResponseSchema`;
 *   `null` when `isErrorUnknownRoute` recognises the failure; or a
 *   `{ ready: false, error, status }` object built from a
 *   `GatewayControllerError`.
 * @throws any other error, unchanged.
 */
export async function getGatewayReady(
  state: InstanceMutableState,
  env: KiloClawEnv
): Promise<Record<string, unknown> | null> {
  try {
    const readiness = await callGatewayController(
      state,
      env,
      '/_kilo/gateway/ready',
      'GET',
      GatewayReadyResponseSchema
    );
    return readiness;
  } catch (error) {
    // Checked before the generic controller-error case below.
    if (isErrorUnknownRoute(error)) {
      return null;
    }
    if (!(error instanceof GatewayControllerError)) {
      throw error;
    }
    // During startup the gateway process may not be running yet, producing
    // a 503 from the controller. Hand back a descriptive object rather than
    // throwing so the frontend poll doesn't see a wall of 500s.
    const { message, status } = error;
    return { ready: false, error: message, status };
  }
}

/** Returns null if the controller is too old to have the /_kilo/config/read endpoint. */
export async function getOpenclawConfig(
state: InstanceMutableState,
Expand Down
5 changes: 5 additions & 0 deletions kiloclaw/src/durable-objects/kiloclaw-instance/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1399,6 +1399,11 @@ export class KiloClawInstance extends DurableObject<KiloClawEnv> {
return gateway.getControllerVersion(this.s, this.env);
}

/**
 * Load the instance state, then delegate to `gateway.getGatewayReady`
 * with this instance's state and environment.
 */
async getGatewayReady(): Promise<Record<string, unknown> | null> {
  await this.loadState();
  const readiness = await gateway.getGatewayReady(this.s, this.env);
  return readiness;
}

async patchConfigOnMachine(patch: Record<string, unknown>): Promise<void> {
await this.loadState();
return gateway.patchConfigOnMachine(this.s, this.env, patch);
Expand Down
22 changes: 22 additions & 0 deletions kiloclaw/src/routes/platform.ts
Original file line number Diff line number Diff line change
Expand Up @@ -621,6 +621,28 @@ platform.get('/gateway/status', async c => {
}
});

// GET /api/platform/gateway/ready?userId=...
// Non-fatal polling endpoint. Once a userId is supplied, every outcome —
// including a controller that is too old or a failure while asking the
// instance — is answered with HTTP 200, so the frontend poll doesn't
// generate a wall of errors during startup. The only non-200 response is
// the 400 sent when the userId query parameter is missing.
platform.get('/gateway/ready', async c => {
  const userId = setValidatedQueryUserId(c);
  if (!userId) {
    return c.json({ error: 'userId query parameter is required' }, 400);
  }

  try {
    const stubSource = instanceStubFactory(c.env, userId);
    const readiness = await withDORetry(
      stubSource,
      stub => stub.getGatewayReady(),
      'getGatewayReady'
    );
    // A nullish result is reported to the caller as "controller too old".
    if (readiness == null) {
      return c.json({ ready: false, error: 'controller too old' }, 200);
    }
    return c.json(readiness, 200);
  } catch (err) {
    const sanitized = sanitizeError(err, 'gateway ready');
    return c.json({ ready: false, error: sanitized.message }, 200);
  }
});

// GET /api/platform/controller-version?userId=...
platform.get('/controller-version', async c => {
const userId = setValidatedQueryUserId(c);
Expand Down
Loading
Loading