From 554386435d9a977fe8c4e79ab641280b9558fbe9 Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Tue, 14 Apr 2026 14:05:09 -0500 Subject: [PATCH 01/10] feat: enrich telemetry event types with env fingerprint, start timestamps, and new event types Add environment fingerprint fields (OS, Node version, CI detection, shell) to session.start and session.end events. Add startTimestamp to step and agent.tool events for span reconstruction. Define command and crash event types with stub emission methods. Add discriminated union Zod schema validation tests mirroring the API schema. --- src/utils/analytics.spec.ts | 141 ++++++++++++++++ src/utils/analytics.ts | 97 +++++++++++ src/utils/telemetry-schema.spec.ts | 258 +++++++++++++++++++++++++++++ src/utils/telemetry-types.ts | 45 ++++- 4 files changed, 540 insertions(+), 1 deletion(-) create mode 100644 src/utils/telemetry-schema.spec.ts diff --git a/src/utils/analytics.spec.ts b/src/utils/analytics.spec.ts index 97dc2a4b..8c0f1bfe 100644 --- a/src/utils/analytics.spec.ts +++ b/src/utils/analytics.spec.ts @@ -206,6 +206,17 @@ describe('Analytics', () => { }), ); }); + + it('includes environment fingerprint fields', () => { + analytics.sessionStart('cli', '1.0.0'); + + const event = mockQueueEvent.mock.calls.find((c) => c[0].type === 'session.start')[0]; + expect(event.attributes).toHaveProperty('env.os'); + expect(event.attributes).toHaveProperty('env.os_version'); + expect(event.attributes).toHaveProperty('env.node_version'); + expect(event.attributes).toHaveProperty('env.shell'); + expect(typeof event.attributes['env.ci']).toBe('boolean'); + }); }); describe('shutdown', () => { @@ -260,6 +271,21 @@ describe('Analytics', () => { }), ); }); + + it('includes env fingerprint and installer.mode', async () => { + analytics.sessionStart('tui', '1.0.0'); + mockQueueEvent.mockClear(); + + await analytics.shutdown('success'); + + const event = mockQueueEvent.mock.calls.find((c) => c[0].type === 'session.end')[0]; + expect(event.attributes).toHaveProperty('env.os'); + expect(event.attributes).toHaveProperty('env.os_version'); + expect(event.attributes).toHaveProperty('env.node_version'); + expect(event.attributes).toHaveProperty('env.shell'); + expect(typeof event.attributes['env.ci']).toBe('boolean'); + expect(event.attributes['installer.mode']).toBe('tui'); + }); }); describe('getFeatureFlag', () => { @@ -306,6 +332,14 @@ describe('Analytics', () => { const event = mockQueueEvent.mock.calls.find((c) => c[0].type === 'step')[0]; expect(event.error).toBeUndefined(); }); + + it('includes startTimestamp as valid ISO 8601', () => { + analytics.stepCompleted('detect_framework', 150, true); + + const event = mockQueueEvent.mock.calls.find((c) => c[0].type === 'step')[0]; + expect(event.startTimestamp).toBeDefined(); + expect(new Date(event.startTimestamp).toISOString()).toBe(event.startTimestamp); + }); }); describe('toolCalled', () => { @@ -334,6 +368,14 @@ describe('Analytics', () => { }), ); }); + + it('includes startTimestamp as valid ISO 8601', () => { + analytics.toolCalled('Write', 50, true); + + const event = mockQueueEvent.mock.calls.find((c) => c[0].type === 'agent.tool')[0]; + expect(event.startTimestamp).toBeDefined(); + expect(new Date(event.startTimestamp).toISOString()).toBe(event.startTimestamp); + }); }); describe('llmRequest', () => { @@ -351,6 +393,13 @@ describe('Analytics', () => { ); }); + it('does NOT include startTimestamp (point-in-time marker)', () => { + analytics.llmRequest('claude-sonnet-4-20250514', 1000, 500); + + const event = mockQueueEvent.mock.calls.find((c) => c[0].type === 'agent.llm')[0]; + expect(event.startTimestamp).toBeUndefined(); + }); + it('accumulates tokens for session.end', async () => { analytics.llmRequest('claude-sonnet-4-20250514', 1000, 500); analytics.llmRequest('claude-sonnet-4-20250514', 800, 300); @@ -375,6 +424,80 @@ describe('Analytics', () => { expect(sessionEnd.attributes['installer.agent.iterations']).toBe(3); }); }); + + describe('commandExecuted', () => { + it('queues a command event with correct attributes', () => { + analytics.commandExecuted('org.list', 200, true); + + expect(mockQueueEvent).toHaveBeenCalledWith( + expect.objectContaining({ + type: 'command', + attributes: expect.objectContaining({ + 'command.name': 'org.list', + 'command.duration_ms': 200, + 'command.success': true, + 'env.os': expect.any(String), + 'env.node_version': expect.any(String), + }), + }), + ); + }); + + it('includes error info when provided', () => { + const error = new TypeError('Not found'); + analytics.commandExecuted('org.get', 50, false, { error }); + + const event = mockQueueEvent.mock.calls.find((c) => c[0].type === 'command')[0]; + expect(event.attributes['command.error_type']).toBe('TypeError'); + expect(event.attributes['command.error_message']).toBe('Not found'); + }); + + it('includes flags as comma-separated names', () => { + analytics.commandExecuted('org.list', 100, true, { flags: ['json', 'limit'] }); + + const event = mockQueueEvent.mock.calls.find((c) => c[0].type === 'command')[0]; + expect(event.attributes['command.flags']).toBe('json,limit'); + }); + }); + + describe('captureUnhandledCrash', () => { + it('queues a crash event with error details', () => { + const error = new Error('Unexpected failure'); + error.stack = 'Error: Unexpected failure\n at foo.ts:1'; + analytics.captureUnhandledCrash(error, { command: 'install', version: '1.0.0' }); + + expect(mockQueueEvent).toHaveBeenCalledWith( + expect.objectContaining({ + type: 'crash', + attributes: expect.objectContaining({ + 'crash.error_type': 'Error', + 'crash.error_message': 'Unexpected failure', + 'crash.stack': 'Error: Unexpected failure\n at foo.ts:1', + 'crash.command': 'install', + 'installer.version': '1.0.0', + 'env.os': expect.any(String), + 'env.node_version': expect.any(String), + }), + }), + ); + }); + + it('truncates stack traces to 4KB', () => { + const error = new Error('Big stack'); + error.stack = 'x'.repeat(5000); + analytics.captureUnhandledCrash(error); + + const event = mockQueueEvent.mock.calls.find((c) => c[0].type === 'crash')[0]; + expect(event.attributes['crash.stack'].length).toBe(4096); + }); + + it('defaults version to unknown when not provided', () => { + analytics.captureUnhandledCrash(new Error('test')); + + const event = mockQueueEvent.mock.calls.find((c) => c[0].type === 'crash')[0]; + expect(event.attributes['installer.version']).toBe('unknown'); + }); + }); }); describe('with telemetry disabled', () => { @@ -454,5 +577,23 @@ describe('Analytics', () => { expect(mockQueueEvent).not.toHaveBeenCalled(); }); + + it('commandExecuted does nothing', async () => { + const { Analytics } = await import('./analytics.js'); + const analytics = new Analytics(); + + analytics.commandExecuted('org.list', 100, true); + + expect(mockQueueEvent).not.toHaveBeenCalled(); + }); + + it('captureUnhandledCrash does nothing', async () => { + const { Analytics } = await import('./analytics.js'); + const analytics = new Analytics(); + + analytics.captureUnhandledCrash(new Error('test')); + + expect(mockQueueEvent).not.toHaveBeenCalled(); + }); }); }); diff --git a/src/utils/analytics.ts b/src/utils/analytics.ts index c0f348ea..1c9242bb 100644 --- a/src/utils/analytics.ts +++ b/src/utils/analytics.ts @@ -1,3 +1,4 @@ +import os from 'node:os'; import { v4 as uuidv4 } from 'uuid'; import { debug } from './debug.js'; import { telemetryClient } from './telemetry-client.js'; @@ -7,6 +8,8 @@ import type { StepEvent, AgentToolEvent, AgentLLMEvent, + CommandEvent, + CrashEvent, } from './telemetry-types.js'; import { WORKOS_TELEMETRY_ENABLED } from '../lib/constants.js'; @@ -15,6 +18,7 @@ export class Analytics { private sessionId: string; private sessionStartTime: Date; private distinctId?: string; + private mode?: 'cli' | 'tui' | 'headless'; // Agent metrics tracking private totalInputTokens = 0; @@ -71,9 +75,40 @@ export class Analytics { return undefined; } + private detectCiProvider(): string | undefined { + if (process.env.GITHUB_ACTIONS) return 'github-actions'; + if (process.env.BUILDKITE) return 'buildkite'; + if (process.env.CIRCLECI) return 'circleci'; + if (process.env.GITLAB_CI) return 'gitlab-ci'; + if (process.env.JENKINS_URL) return 'jenkins'; + return undefined; + } + + private getEnvFingerprint() { + let osVersion: string; + try { + osVersion = os.release(); + } catch { + osVersion = 'unknown'; + } + + const ciProvider = this.detectCiProvider(); + + return { + 'env.os': process.platform, + 'env.os_version': osVersion, + 'env.node_version': process.version, + 'env.shell': process.env.SHELL ?? process.env.COMSPEC ?? 'unknown', + 'env.ci': Boolean(process.env.CI || process.env.GITHUB_ACTIONS || process.env.BUILDKITE), + ...(ciProvider ? { 'env.ci_provider': ciProvider } : {}), + }; + } + sessionStart(mode: 'cli' | 'tui' | 'headless', version: string) { if (!WORKOS_TELEMETRY_ENABLED) return; + this.mode = mode; + const event: SessionStartEvent = { type: 'session.start', sessionId: this.sessionId, @@ -82,6 +117,7 @@ export class Analytics { 'installer.version': version, 'installer.mode': mode, 'workos.user_id': this.distinctId, + ...this.getEnvFingerprint(), }, }; @@ -96,6 +132,7 @@ export class Analytics { sessionId: this.sessionId, timestamp: new Date().toISOString(), name, + startTimestamp: new Date(Date.now() - durationMs).toISOString(), durationMs, success, error: error ? { type: error.name, message: error.message } : undefined, @@ -112,6 +149,7 @@ export class Analytics { sessionId: this.sessionId, timestamp: new Date().toISOString(), toolName, + startTimestamp: new Date(Date.now() - durationMs).toISOString(), durationMs, success, }; @@ -141,6 +179,61 @@ export class Analytics { this.agentIterations++; } + commandExecuted( + name: string, + durationMs: number, + success: boolean, + options?: { error?: Error; flags?: string[] }, + ) { + if (!WORKOS_TELEMETRY_ENABLED) return; + + const event: CommandEvent = { + type: 'command', + sessionId: this.sessionId, + timestamp: new Date().toISOString(), + attributes: { + 'command.name': name, + 'command.duration_ms': durationMs, + 'command.success': success, + ...(options?.error + ? { + 'command.error_type': options.error.name, + 'command.error_message': options.error.message, + } + : {}), + ...(options?.flags?.length + ? { 'command.flags': options.flags.join(',') } + : {}), + ...this.getEnvFingerprint(), + }, + }; + + telemetryClient.queueEvent(event); + } + + captureUnhandledCrash(error: Error, options?: { command?: string; version?: string }) { + if (!WORKOS_TELEMETRY_ENABLED) return; + + const stack = error.stack ?? ''; + const truncatedStack = stack.length > 4096 ? stack.slice(0, 4096) : stack; + + const event: CrashEvent = { + type: 'crash', + sessionId: this.sessionId, + timestamp: new Date().toISOString(), + attributes: { + 'crash.error_type': error.name, + 'crash.error_message': error.message, + 'crash.stack': truncatedStack, + ...(options?.command ? { 'crash.command': options.command } : {}), + 'installer.version': options?.version ?? 'unknown', + ...this.getEnvFingerprint(), + }, + }; + + telemetryClient.queueEvent(event); + } + async shutdown(status: 'success' | 'error' | 'cancelled') { if (!WORKOS_TELEMETRY_ENABLED) return; @@ -152,6 +245,8 @@ export class Analytics { string | number | boolean >; + const envFingerprint = this.getEnvFingerprint(); + const event: SessionEndEvent = { type: 'session.end', sessionId: this.sessionId, @@ -162,6 +257,8 @@ export class Analytics { 'installer.agent.iterations': this.agentIterations, 'installer.agent.tokens.input': this.totalInputTokens, 'installer.agent.tokens.output': this.totalOutputTokens, + ...envFingerprint, + ...(this.mode ? { 'installer.mode': this.mode } : {}), ...extraAttributes, }, }; diff --git a/src/utils/telemetry-schema.spec.ts b/src/utils/telemetry-schema.spec.ts new file mode 100644 index 00000000..45c27d3f --- /dev/null +++ b/src/utils/telemetry-schema.spec.ts @@ -0,0 +1,258 @@ +import { describe, it, expect } from 'vitest'; +import { z } from 'zod'; + +/** + * Mirror of the API's Zod discriminated union schema. + * These tests validate that the schema shape preserves top-level fields + * (not stripped by safeParse) and accepts all 7 event types. + * This ensures CLI and API stay in sync. + */ + +const attributesSchema = z + .record(z.string(), z.union([z.string(), z.number(), z.boolean()])) + .optional(); + +const baseFields = { + sessionId: z.string(), + timestamp: z.string(), + attributes: attributesSchema, +}; + +const SessionStartSchema = z + .object({ type: z.literal('session.start'), ...baseFields }) + .passthrough(); + +const SessionEndSchema = z + .object({ type: z.literal('session.end'), ...baseFields }) + .passthrough(); + +const StepSchema = z + .object({ + type: z.literal('step'), + ...baseFields, + name: z.string(), + startTimestamp: z.string().optional(), + durationMs: z.number(), + success: z.boolean(), + error: z.object({ type: z.string(), message: z.string() }).optional(), + }) + .passthrough(); + +const AgentToolSchema = z + .object({ + type: z.literal('agent.tool'), + ...baseFields, + toolName: z.string(), + startTimestamp: z.string().optional(), + durationMs: z.number(), + success: z.boolean(), + }) + .passthrough(); + +const AgentLlmSchema = z + .object({ + type: z.literal('agent.llm'), + ...baseFields, + model: z.string(), + inputTokens: z.number(), + outputTokens: z.number(), + }) + .passthrough(); + +const CommandSchema = z + .object({ type: z.literal('command'), ...baseFields }) + .passthrough(); + +const CrashSchema = z + .object({ type: z.literal('crash'), ...baseFields }) + .passthrough(); + +const TelemetryEventSchema = z.discriminatedUnion('type', [ + SessionStartSchema, + SessionEndSchema, + StepSchema, + AgentToolSchema, + AgentLlmSchema, + CommandSchema, + CrashSchema, +]); + +describe('TelemetryEventSchema (discriminated union)', () => { + const base = { sessionId: 'sess-1', timestamp: '2024-01-01T00:00:00Z' }; + + describe('accepts all 7 event types', () => { + it('accepts session.start', () => { + const result = TelemetryEventSchema.safeParse({ + type: 'session.start', + ...base, + attributes: { 'installer.version': '1.0.0' }, + }); + expect(result.success).toBe(true); + }); + + it('accepts session.end', () => { + const result = TelemetryEventSchema.safeParse({ + type: 'session.end', + ...base, + attributes: { 'installer.outcome': 'success' }, + }); + expect(result.success).toBe(true); + }); + + it('accepts step', () => { + const result = TelemetryEventSchema.safeParse({ + type: 'step', + ...base, + name: 'detect', + startTimestamp: '2024-01-01T00:00:00Z', + durationMs: 100, + success: true, + }); + expect(result.success).toBe(true); + }); + + it('accepts agent.tool', () => { + const result = TelemetryEventSchema.safeParse({ + type: 'agent.tool', + ...base, + toolName: 'Write', + startTimestamp: '2024-01-01T00:00:00Z', + durationMs: 50, + success: true, + }); + expect(result.success).toBe(true); + }); + + it('accepts agent.llm', () => { + const result = TelemetryEventSchema.safeParse({ + type: 'agent.llm', + ...base, + model: 'claude', + inputTokens: 100, + outputTokens: 50, + }); + expect(result.success).toBe(true); + }); + + it('accepts command', () => { + const result = TelemetryEventSchema.safeParse({ + type: 'command', + ...base, + attributes: { 'command.name': 'org.list' }, + }); + expect(result.success).toBe(true); + }); + + it('accepts crash', () => { + const result = TelemetryEventSchema.safeParse({ + type: 'crash', + ...base, + attributes: { 'crash.error_type': 'Error' }, + }); + expect(result.success).toBe(true); + }); + }); + + describe('preserves top-level fields via .passthrough()', () => { + it('preserves name and durationMs on step events', () => { + const result = TelemetryEventSchema.safeParse({ + type: 'step', + ...base, + name: 'detect', + durationMs: 100, + success: true, + }); + expect(result.success).toBe(true); + if (result.success) { + expect(result.data).toHaveProperty('name', 'detect'); + expect(result.data).toHaveProperty('durationMs', 100); + } + }); + + it('preserves toolName on agent.tool events', () => { + const result = TelemetryEventSchema.safeParse({ + type: 'agent.tool', + ...base, + toolName: 'Write', + durationMs: 50, + success: true, + }); + expect(result.success).toBe(true); + if (result.success) { + expect(result.data).toHaveProperty('toolName', 'Write'); + } + }); + + it('preserves model, inputTokens, outputTokens on agent.llm events', () => { + const result = TelemetryEventSchema.safeParse({ + type: 'agent.llm', + ...base, + model: 'claude', + inputTokens: 100, + outputTokens: 50, + }); + expect(result.success).toBe(true); + if (result.success) { + expect(result.data).toHaveProperty('model', 'claude'); + expect(result.data).toHaveProperty('inputTokens', 100); + expect(result.data).toHaveProperty('outputTokens', 50); + } + }); + + it('preserves startTimestamp on step events', () => { + const result = TelemetryEventSchema.safeParse({ + type: 'step', + ...base, + name: 'detect', + startTimestamp: '2024-01-01T00:00:00Z', + durationMs: 100, + success: true, + }); + expect(result.success).toBe(true); + if (result.success) { + expect(result.data).toHaveProperty('startTimestamp', '2024-01-01T00:00:00Z'); + } + }); + }); + + describe('backward compatibility', () => { + it('accepts step event without startTimestamp', () => { + const result = TelemetryEventSchema.safeParse({ + type: 'step', + ...base, + name: 'detect', + durationMs: 100, + success: true, + }); + expect(result.success).toBe(true); + }); + + it('accepts agent.tool event without startTimestamp', () => { + const result = TelemetryEventSchema.safeParse({ + type: 'agent.tool', + ...base, + toolName: 'Write', + durationMs: 50, + success: true, + }); + expect(result.success).toBe(true); + }); + }); + + describe('rejects invalid events', () => { + it('rejects events with unknown type', () => { + const result = TelemetryEventSchema.safeParse({ + type: 'unknown', + ...base, + }); + expect(result.success).toBe(false); + }); + + it('rejects events without type', () => { + const result = TelemetryEventSchema.safeParse({ + ...base, + }); + expect(result.success).toBe(false); + }); + }); +}); diff --git a/src/utils/telemetry-types.ts b/src/utils/telemetry-types.ts index b9a66798..93faa117 100644 --- a/src/utils/telemetry-types.ts +++ b/src/utils/telemetry-types.ts @@ -4,7 +4,7 @@ */ export interface TelemetryEvent { - type: 'session.start' | 'session.end' | 'step' | 'agent.tool' | 'agent.llm'; + type: 'session.start' | 'session.end' | 'step' | 'agent.tool' | 'agent.llm' | 'command' | 'crash'; sessionId: string; timestamp: string; attributes?: Record; @@ -17,6 +17,12 @@ export interface SessionStartEvent extends TelemetryEvent { 'installer.mode': 'cli' | 'tui' | 'headless'; 'workos.user_id'?: string; 'workos.org_id'?: string; + 'env.os': string; + 'env.os_version': string; + 'env.node_version': string; + 'env.shell': string; + 'env.ci': boolean; + 'env.ci_provider'?: string; }; } @@ -31,6 +37,7 @@ export interface SessionEndEvent extends TelemetryEvent { export interface StepEvent extends TelemetryEvent { type: 'step'; name: string; + startTimestamp: string; durationMs: number; success: boolean; error?: { @@ -42,6 +49,7 @@ export interface StepEvent extends TelemetryEvent { export interface AgentToolEvent extends TelemetryEvent { type: 'agent.tool'; toolName: string; + startTimestamp: string; durationMs: number; success: boolean; } @@ -53,6 +61,41 @@ export interface AgentLLMEvent extends TelemetryEvent { outputTokens: number; } +export interface CommandEvent extends TelemetryEvent { + type: 'command'; + attributes: { + 'command.name': string; + 'command.duration_ms': number; + 'command.success': boolean; + 'command.error_type'?: string; + 'command.error_message'?: string; + 'command.flags'?: string; + 'env.os': string; + 'env.os_version': string; + 'env.node_version': string; + 'env.shell': string; + 'env.ci': boolean; + 'env.ci_provider'?: string; + }; +} + +export interface CrashEvent extends TelemetryEvent { + type: 'crash'; + attributes: { + 'crash.error_type': string; + 'crash.error_message': string; + 'crash.stack': string; + 'crash.command'?: string; + 'installer.version': string; + 'env.os': string; + 'env.os_version': string; + 'env.node_version': string; + 'env.shell': string; + 'env.ci': boolean; + 'env.ci_provider'?: string; + }; +} + export interface TelemetryRequest { events: TelemetryEvent[]; } From 67ffc2fa77fba9be22d738e2d08ec205cb1dde79 Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Tue, 14 Apr 2026 14:25:12 -0500 Subject: [PATCH 02/10] feat: add command-level telemetry, crash reporting, and store-forward persistence Wire up yargs middleware that emits a provisional command event before each handler runs, then replaces it with actual duration/success on completion. This covers the ~25 process.exit() call sites without modifying them. - Command telemetry middleware with canonical name resolution and flag extraction - Crash reporter with sanitized stack traces (sync handlers, no async) - Store-forward: persist unsent events to temp file on exit, recover on next run - Fix flush() to retain events until HTTP success (was clearing before fetch) - Auto-wrap handlers in registerSubcommand() (single change point) - Shared COMMAND_ALIASES map for telemetry and help-json - analytics.initForNonInstaller() sets gatewayUrl + JWT from stored creds --- src/bin.ts | 18 ++- src/lib/command-aliases.ts | 10 ++ src/utils/analytics.spec.ts | 113 +++++++++++++++ src/utils/analytics.ts | 38 ++++++ src/utils/command-telemetry.spec.ts | 159 ++++++++++++++++++++++ src/utils/command-telemetry.ts | 86 ++++++++++++ src/utils/crash-reporter.spec.ts | 149 ++++++++++++++++++++ src/utils/crash-reporter.ts | 55 ++++++++ src/utils/register-subcommand.ts | 5 +- src/utils/telemetry-client.spec.ts | 106 ++++++++++++++- src/utils/telemetry-client.ts | 45 +++++- src/utils/telemetry-store-forward.spec.ts | 159 ++++++++++++++++++++++ src/utils/telemetry-store-forward.ts | 57 ++++++++ 13 files changed, 991 insertions(+), 9 deletions(-) create mode 100644 src/lib/command-aliases.ts create mode 100644 src/utils/command-telemetry.spec.ts create mode 100644 src/utils/command-telemetry.ts create mode 100644 src/utils/crash-reporter.spec.ts create mode 100644 src/utils/crash-reporter.ts create mode 100644 src/utils/telemetry-store-forward.spec.ts create mode 100644 src/utils/telemetry-store-forward.ts diff --git a/src/bin.ts b/src/bin.ts index 169811be..d8334468 100644 --- a/src/bin.ts +++ b/src/bin.ts @@ -31,6 +31,20 @@ import { isNonInteractiveEnvironment } from './utils/environment.js'; import { resolveOutputMode, setOutputMode, isJsonMode, outputJson, exitWithError } from './utils/output.js'; import clack from './utils/clack.js'; import { registerSubcommand } from './utils/register-subcommand.js'; +import { COMMAND_ALIASES } from './lib/command-aliases.js'; +import { installCrashReporter } from './utils/crash-reporter.js'; +import { installStoreForward, recoverPendingEvents } from './utils/telemetry-store-forward.js'; +import { commandTelemetryMiddleware } from './utils/command-telemetry.js'; +import { analytics } from './utils/analytics.js'; + +// Telemetry infrastructure: crash reporter, store-forward, and gateway init. +// Must be before yargs so crashes during startup are captured. +installCrashReporter(); +installStoreForward(); +analytics.initForNonInstaller(); +// Fire-and-forget: recover events from previous crashes/exits. +// NO await — must not block startup (flush timeout is 3s). +recoverPendingEvents(); // Resolve output mode early from raw argv (before yargs parses) const rawArgs = hideBin(process.argv); @@ -40,9 +54,8 @@ setOutputMode(resolveOutputMode(hasJsonFlag)); // Intercept --help --json before yargs parses (yargs exits on --help) if (hasJsonFlag && (rawArgs.includes('--help') || rawArgs.includes('-h'))) { const { buildCommandTree } = await import('./utils/help-json.js'); - const commandAliases: Record = { org: 'organization' }; const rawCommand = rawArgs.find((a) => !a.startsWith('-')); - const command = rawCommand ? (commandAliases[rawCommand] ?? rawCommand) : undefined; + const command = rawCommand ? (COMMAND_ALIASES[rawCommand] ?? rawCommand) : undefined; outputJson(buildCommandTree(command)); process.exit(0); } @@ -183,6 +196,7 @@ yargs(rawArgs) describe: 'Output results as JSON (auto-enabled in non-TTY)', global: true, }) + .middleware(commandTelemetryMiddleware(rawArgs)) .middleware(async (argv) => { // Warn about unclaimed environments before management commands. // Excluded: auth/claim/install/dashboard handle their own credential flows; diff --git a/src/lib/command-aliases.ts b/src/lib/command-aliases.ts new file mode 100644 index 00000000..8f515ae3 --- /dev/null +++ b/src/lib/command-aliases.ts @@ -0,0 +1,10 @@ +/** + * Shared canonical command alias map. + * Single source of truth for both telemetry and help-json. + * + * Keys are user-facing aliases, values are canonical command names. + * Adding an alias here updates both metrics aggregation and --help --json output. + */ +export const COMMAND_ALIASES: Record = { + org: 'organization', +}; diff --git a/src/utils/analytics.spec.ts b/src/utils/analytics.spec.ts index 8c0f1bfe..c98531b8 100644 --- a/src/utils/analytics.spec.ts +++ b/src/utils/analytics.spec.ts @@ -5,6 +5,7 @@ const mockSetGatewayUrl = vi.fn(); const mockSetAccessToken = vi.fn(); const mockQueueEvent = vi.fn(); const mockFlush = vi.fn().mockResolvedValue(undefined); +const mockReplaceLastEventOfType = vi.fn(); vi.mock('./telemetry-client.js', () => ({ telemetryClient: { @@ -12,6 +13,7 @@ vi.mock('./telemetry-client.js', () => ({ setAccessToken: mockSetAccessToken, queueEvent: mockQueueEvent, flush: mockFlush, + replaceLastEventOfType: (...args: unknown[]) => mockReplaceLastEventOfType(...args), }, })); @@ -25,6 +27,27 @@ vi.mock('uuid', () => ({ v4: () => 'test-session-id-123', })); +// Mock settings for initForNonInstaller +const mockGetLlmGatewayUrl = vi.fn(() => 'https://api.workos.com/llm-gateway'); +const mockSettingsConfig = { + nodeVersion: '>=18', + logging: { debugMode: false }, + telemetry: { enabled: true, eventName: 'installer_interaction' }, + documentation: { workosDocsUrl: 'https://workos.com/docs', dashboardUrl: 'https://dashboard.workos.com', issuesUrl: 'https://github.com' }, + legacy: { oauthPort: 3000 }, +}; +vi.mock('../lib/settings.js', () => ({ + getLlmGatewayUrl: () => mockGetLlmGatewayUrl(), + getConfig: () => mockSettingsConfig, + getVersion: () => '0.12.1', +})); + +// Mock credentials for initForNonInstaller +const mockGetCredentials = vi.fn(); +vi.mock('../lib/credentials.js', () => ({ + getCredentials: () => mockGetCredentials(), +})); + describe('Analytics', () => { // Need to handle WORKOS_TELEMETRY_ENABLED which is evaluated at import time const originalEnv = process.env.WORKOS_TELEMETRY; @@ -56,8 +79,17 @@ describe('Analytics', () => { setAccessToken: mockSetAccessToken, queueEvent: mockQueueEvent, flush: mockFlush, + replaceLastEventOfType: (...args: unknown[]) => mockReplaceLastEventOfType(...args), }, })); + vi.doMock('../lib/settings.js', () => ({ + getLlmGatewayUrl: () => mockGetLlmGatewayUrl(), + getConfig: () => mockSettingsConfig, + getVersion: () => '0.12.1', + })); + vi.doMock('../lib/credentials.js', () => ({ + getCredentials: () => mockGetCredentials(), + })); const module = await import('./analytics.js'); Analytics = module.Analytics; analytics = new Analytics(); @@ -498,6 +530,59 @@ describe('Analytics', () => { expect(event.attributes['installer.version']).toBe('unknown'); }); }); + + describe('initForNonInstaller', () => { + it('sets gatewayUrl from default config', () => { + mockGetLlmGatewayUrl.mockReturnValue('https://api.workos.com/llm-gateway'); + analytics.initForNonInstaller(); + + expect(mockSetGatewayUrl).toHaveBeenCalledWith('https://api.workos.com/llm-gateway'); + }); + + it('sets access token from stored credentials', () => { + mockGetCredentials.mockReturnValue({ accessToken: 'stored-jwt-token' }); + analytics.initForNonInstaller(); + + expect(mockSetAccessToken).toHaveBeenCalledWith('stored-jwt-token'); + }); + + it('skips access token when no credentials stored', () => { + mockGetCredentials.mockReturnValue(null); + analytics.initForNonInstaller(); + + expect(mockSetAccessToken).not.toHaveBeenCalled(); + }); + }); + + describe('replaceLastCommandEvent', () => { + it('removes last command event and queues a new one', () => { + analytics.replaceLastCommandEvent('organization.list', 150, true, { flags: ['json'] }); + + expect(mockReplaceLastEventOfType).toHaveBeenCalledWith('command'); + expect(mockQueueEvent).toHaveBeenCalledWith( + expect.objectContaining({ + type: 'command', + attributes: expect.objectContaining({ + 'command.name': 'organization.list', + 'command.duration_ms': 150, + 'command.success': true, + 'command.flags': 'json', + }), + }), + ); + }); + + it('includes error info on failure', () => { + const error = new Error('oops'); + error.name = 'CommandError'; + analytics.replaceLastCommandEvent('auth.login', 50, false, { error }); + + const event = mockQueueEvent.mock.calls[0][0]; + expect(event.attributes['command.success']).toBe(false); + expect(event.attributes['command.error_type']).toBe('CommandError'); + expect(event.attributes['command.error_message']).toBe('oops'); + }); + }); }); describe('with telemetry disabled', () => { @@ -510,8 +595,17 @@ describe('Analytics', () => { setAccessToken: mockSetAccessToken, queueEvent: mockQueueEvent, flush: mockFlush, + replaceLastEventOfType: (...args: unknown[]) => mockReplaceLastEventOfType(...args), }, })); + vi.doMock('../lib/settings.js', () => ({ + getLlmGatewayUrl: () => mockGetLlmGatewayUrl(), + getConfig: () => mockSettingsConfig, + getVersion: () => '0.12.1', + })); + vi.doMock('../lib/credentials.js', () => ({ + getCredentials: () => mockGetCredentials(), + })); }); it('capture does nothing', async () => { @@ -595,5 +689,24 @@ describe('Analytics', () => { expect(mockQueueEvent).not.toHaveBeenCalled(); }); + + it('initForNonInstaller does nothing', async () => { + const { Analytics } = await import('./analytics.js'); + const analytics = new Analytics(); + + analytics.initForNonInstaller(); + + expect(mockSetGatewayUrl).not.toHaveBeenCalled(); + }); + + it('replaceLastCommandEvent does nothing', async () => { + const { Analytics } = await import('./analytics.js'); + const analytics = new Analytics(); + + analytics.replaceLastCommandEvent('org.list', 100, true); + + expect(mockReplaceLastEventOfType).not.toHaveBeenCalled(); + expect(mockQueueEvent).not.toHaveBeenCalled(); + }); }); }); diff --git a/src/utils/analytics.ts b/src/utils/analytics.ts index 1c9242bb..9253b6e6 100644 --- a/src/utils/analytics.ts +++ b/src/utils/analytics.ts @@ -12,6 +12,8 @@ import type { CrashEvent, } from './telemetry-types.js'; import { WORKOS_TELEMETRY_ENABLED } from '../lib/constants.js'; +import { getLlmGatewayUrl } from '../lib/settings.js'; +import { getCredentials } from '../lib/credentials.js'; export class Analytics { private tags: Record = {}; @@ -43,6 +45,24 @@ export class Analytics { telemetryClient.setGatewayUrl(url); } + /** + * Initialize telemetry for non-installer commands. + * Sets gatewayUrl from default config and loads stored JWT if available. + * The installer flow sets these itself in run-with-core.ts; this covers + * management commands like `org list`, `auth login`, etc. + */ + initForNonInstaller(): void { + if (!WORKOS_TELEMETRY_ENABLED) return; + + const gatewayUrl = getLlmGatewayUrl(); + telemetryClient.setGatewayUrl(gatewayUrl); + + const creds = getCredentials(); + if (creds?.accessToken) { + telemetryClient.setAccessToken(creds.accessToken); + } + } + setTag(key: string, value: string | boolean | number | null | undefined) { this.tags[key] = value; } @@ -211,6 +231,24 @@ export class Analytics { telemetryClient.queueEvent(event); } + /** + * Replace the last queued command event with updated data. + * Used by the command handler wrapper to swap the provisional event + * (queued by middleware) with the real one after the handler completes. + */ + replaceLastCommandEvent( + name: string, + durationMs: number, + success: boolean, + options?: { error?: Error; flags?: string[] }, + ) { + if (!WORKOS_TELEMETRY_ENABLED) return; + + telemetryClient.replaceLastEventOfType('command'); + + this.commandExecuted(name, durationMs, success, options); + } + captureUnhandledCrash(error: Error, options?: { command?: string; version?: string }) { if (!WORKOS_TELEMETRY_ENABLED) return; diff --git a/src/utils/command-telemetry.spec.ts b/src/utils/command-telemetry.spec.ts new file mode 100644 index 00000000..d0095fe0 --- /dev/null +++ b/src/utils/command-telemetry.spec.ts @@ -0,0 +1,159 @@ +import { describe, it, expect, beforeEach, vi } from 'vitest'; +import { resolveCanonicalName, extractUserFlags, commandTelemetryMiddleware, wrapCommandHandler } from './command-telemetry.js'; + +const mockCommandExecuted = vi.fn(); +const mockReplaceLastCommandEvent = vi.fn(); + +vi.mock('./analytics.js', () => ({ + analytics: { + commandExecuted: (...args: unknown[]) => mockCommandExecuted(...args), + replaceLastCommandEvent: (...args: unknown[]) => mockReplaceLastCommandEvent(...args), + }, +})); + +vi.mock('../lib/constants.js', () => ({ + WORKOS_TELEMETRY_ENABLED: true, +})); + +describe('command-telemetry', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + describe('resolveCanonicalName', () => { + it('resolves aliased commands', () => { + expect(resolveCanonicalName(['org', 'list'])).toBe('organization.list'); + }); + + it('passes through non-aliased commands', () => { + expect(resolveCanonicalName(['auth', 'login'])).toBe('auth.login'); + }); + + it('returns root for empty parts', () => { + expect(resolveCanonicalName([])).toBe('root'); + }); + + it('handles single-part commands', () => { + expect(resolveCanonicalName(['install'])).toBe('install'); + }); + + it('only aliases the first part', () => { + expect(resolveCanonicalName(['org', 'org'])).toBe('organization.org'); + }); + }); + + describe('extractUserFlags', () => { + it('extracts long flags', () => { + expect(extractUserFlags(['org', 'list', '--json'])).toEqual(['json']); + }); + + it('extracts short flags', () => { + expect(extractUserFlags(['-v'])).toEqual(['v']); + }); + + it('handles flags with values', () => { + expect(extractUserFlags(['--env=staging'])).toEqual(['env']); + }); + + it('deduplicates flags', () => { + expect(extractUserFlags(['--json', '--json'])).toEqual(['json']); + }); + + it('ignores positionals', () => { + expect(extractUserFlags(['org', 'list', 'my-org'])).toEqual([]); + }); + + it('ignores multi-char short flags (not real flags)', () => { + expect(extractUserFlags(['-abc'])).toEqual([]); + }); + }); + + describe('commandTelemetryMiddleware', () => { + it('queues provisional event with duration=0', async () => { + const middleware = commandTelemetryMiddleware(['org', 'list', '--json']); + const argv: Record = { _: ['org', 'list'] }; + + await middleware(argv); + + expect(mockCommandExecuted).toHaveBeenCalledWith('organization.list', 0, true, { + flags: ['json'], + }); + }); + + it('stores telemetry metadata on argv', async () => { + const middleware = commandTelemetryMiddleware(['auth', 'login']); + const argv: Record = { _: ['auth', 'login'] }; + + await middleware(argv); + + expect(argv.__telemetryCommandName).toBe('auth.login'); + expect(argv.__telemetryStartTime).toBeTypeOf('number'); + expect(argv.__telemetryFlags).toEqual([]); + }); + }); + + describe('wrapCommandHandler', () => { + it('replaces provisional event on success', async () => { + const handler = vi.fn().mockResolvedValue(undefined); + const wrapped = wrapCommandHandler(handler); + const argv = { + __telemetryCommandName: 'organization.list', + __telemetryStartTime: Date.now() - 100, + __telemetryFlags: ['json'], + }; + + await wrapped(argv); + + expect(handler).toHaveBeenCalledWith(argv); + expect(mockReplaceLastCommandEvent).toHaveBeenCalledWith( + 'organization.list', + expect.any(Number), + true, + { flags: ['json'] }, + ); + const duration = mockReplaceLastCommandEvent.mock.calls[0][1] as number; + expect(duration).toBeGreaterThanOrEqual(100); + }); + + it('replaces provisional event on failure with error', async () => { + const error = new Error('command failed'); + const handler = vi.fn().mockRejectedValue(error); + const wrapped = wrapCommandHandler(handler); + const argv = { + __telemetryCommandName: 'organization.list', + __telemetryStartTime: Date.now(), + __telemetryFlags: [], + }; + + await expect(wrapped(argv)).rejects.toThrow('command failed'); + + expect(mockReplaceLastCommandEvent).toHaveBeenCalledWith( + 'organization.list', + expect.any(Number), + false, + { error, flags: [] }, + ); + }); + + it('re-throws the original error', async () => { + const error = new Error('original'); + const handler = vi.fn().mockRejectedValue(error); + const wrapped = wrapCommandHandler(handler); + + await expect(wrapped({ __telemetryStartTime: Date.now() })).rejects.toBe(error); + }); + + it('handles non-Error throws', async () => { + const handler = vi.fn().mockRejectedValue('string error'); + const wrapped = wrapCommandHandler(handler); + + await expect( + wrapped({ __telemetryCommandName: 'test', __telemetryStartTime: Date.now(), __telemetryFlags: [] }), + ).rejects.toBe('string error'); + + const errorArg = mockReplaceLastCommandEvent.mock.calls[0][3].error; + expect(errorArg).toBeInstanceOf(Error); + expect(errorArg.message).toBe('string error'); + }); + }); +}); diff --git a/src/utils/command-telemetry.ts b/src/utils/command-telemetry.ts new file mode 100644 index 00000000..57603b19 --- /dev/null +++ b/src/utils/command-telemetry.ts @@ -0,0 +1,86 @@ +import { analytics } from './analytics.js'; +import { COMMAND_ALIASES } from '../lib/command-aliases.js'; +import { WORKOS_TELEMETRY_ENABLED } from '../lib/constants.js'; + +/** + * Resolve user-typed command parts to their canonical name. + * Applies alias mapping to the top-level command only. + * + * Examples: + * ['org', 'list'] -> 'organization.list' + * ['auth', 'login'] -> 'auth.login' + * [] -> 'root' + */ +export function resolveCanonicalName(parts: string[]): string { + if (parts.length === 0) return 'root'; + const resolved = [...parts]; + resolved[0] = COMMAND_ALIASES[resolved[0]] ?? resolved[0]; + return resolved.join('.'); +} + +/** + * Extract only user-supplied flags (not positionals, not defaults). + * Parses rawArgs directly instead of argv to avoid positionals and camelCase dupes. + */ +export function extractUserFlags(rawArgs: string[]): string[] { + const passedFlags = rawArgs + .filter((arg) => arg.startsWith('--') || (arg.startsWith('-') && arg.length === 2)) + .map((arg) => arg.replace(/^-+/, '').split('=')[0]); + return [...new Set(passedFlags)]; +} + +/** + * Yargs middleware that queues a PROVISIONAL command event immediately. + * This ensures there's always an event to persist via store-forward, + * even if the handler calls process.exit() before returning. + * + * The provisional event has success=true and duration=0. It gets + * updated by the handler wrapper on normal completion/failure. + */ +export function commandTelemetryMiddleware(rawArgs: string[]) { + return async (argv: Record) => { + if (!WORKOS_TELEMETRY_ENABLED) return; + + const commandParts = (argv._ as string[]) || []; + const commandName = resolveCanonicalName(commandParts); + const flags = extractUserFlags(rawArgs); + const startTime = Date.now(); + + // Store metadata for the handler wrapper to update later + argv.__telemetryCommandName = commandName; + argv.__telemetryStartTime = startTime; + argv.__telemetryFlags = flags; + + // Queue provisional event NOW, before the handler runs. + // If the handler calls process.exit(), store-forward persists this. + analytics.commandExecuted(commandName, 0, true, { flags }); + }; +} + +/** + * Wraps a yargs command handler to UPDATE the provisional event + * with actual duration and success/failure on completion. + * Designed to be called inside registerSubcommand(), not at each call site. + */ +export function wrapCommandHandler( + handler: (argv: any) => Promise, +): (argv: any) => Promise { + return async (argv) => { + const commandName = String(argv.__telemetryCommandName ?? 'unknown'); + const startTime = Number(argv.__telemetryStartTime ?? Date.now()); + const flags = (argv.__telemetryFlags as string[]) ?? []; + + try { + await handler(argv); + // Replace the provisional event with the real one + analytics.replaceLastCommandEvent(commandName, Date.now() - startTime, true, { flags }); + } catch (error) { + const err = error instanceof Error ? error : new Error(String(error)); + analytics.replaceLastCommandEvent(commandName, Date.now() - startTime, false, { + error: err, + flags, + }); + throw error; + } + }; +} diff --git a/src/utils/crash-reporter.spec.ts b/src/utils/crash-reporter.spec.ts new file mode 100644 index 00000000..a36b918d --- /dev/null +++ b/src/utils/crash-reporter.spec.ts @@ -0,0 +1,149 @@ +import { describe, it, expect, beforeEach, vi, afterEach } from 'vitest'; +import os from 'node:os'; + +const mockCaptureUnhandledCrash = vi.fn(); + +vi.mock('./analytics.js', () => ({ + analytics: { + captureUnhandledCrash: (...args: unknown[]) => mockCaptureUnhandledCrash(...args), + }, +})); + +describe('crash-reporter', () => { + beforeEach(() => { + vi.clearAllMocks(); + vi.resetModules(); + }); + + describe('sanitizeStack', () => { + let sanitizeStack: typeof import('./crash-reporter.js').sanitizeStack; + + beforeEach(async () => { + const mod = await import('./crash-reporter.js'); + sanitizeStack = mod.sanitizeStack; + }); + + it('returns empty string for undefined', () => { + expect(sanitizeStack(undefined)).toBe(''); + }); + + it('replaces home directory with ~', () => { + const home = os.homedir(); + const stack = `Error: test\n at ${home}/project/src/index.ts:1:1`; + expect(sanitizeStack(stack)).toContain('~'); + expect(sanitizeStack(stack)).not.toContain(home); + }); + + it('strips absolute paths to node_modules/dist/src', () => { + const stack = 'Error\n at /long/absolute/path/to/src/file.ts:1:1'; + const result = sanitizeStack(stack); + expect(result).toContain('src/'); + expect(result).not.toContain('/long/absolute/path/to/'); + }); + + it('truncates stacks longer than 4096 chars', () => { + const longStack = 'Error: test\n' + 'x'.repeat(5000); + const result = sanitizeStack(longStack); + expect(result.length).toBeLessThanOrEqual(4096 + '\n...[truncated]'.length); + expect(result).toContain('...[truncated]'); + }); + + it('does not truncate short stacks', () => { + const shortStack = 'Error: test\n at file.ts:1:1'; + const result = sanitizeStack(shortStack); + expect(result).toBe(shortStack); + expect(result).not.toContain('truncated'); + }); + }); + + describe('installCrashReporter', () => { + let processOnSpy: ReturnType; + let processExitSpy: ReturnType; + + beforeEach(() => { + processOnSpy = vi.spyOn(process, 'on'); + processExitSpy = vi.spyOn(process, 'exit').mockImplementation((() => {}) as never); + }); + + afterEach(() => { + processOnSpy.mockRestore(); + processExitSpy.mockRestore(); + }); + + it('registers uncaughtException and unhandledRejection handlers', async () => { + const { installCrashReporter } = await import('./crash-reporter.js'); + installCrashReporter(); + + const eventNames = processOnSpy.mock.calls.map((call) => call[0]); + expect(eventNames).toContain('uncaughtException'); + expect(eventNames).toContain('unhandledRejection'); + }); + + it('uncaughtException handler queues crash event and exits', async () => { + const { installCrashReporter } = await import('./crash-reporter.js'); + installCrashReporter(); + + const uncaughtHandler = processOnSpy.mock.calls.find((c) => c[0] === 'uncaughtException')?.[1] as ( + err: Error, + ) => void; + + const error = new Error('boom'); + uncaughtHandler(error); + + expect(mockCaptureUnhandledCrash).toHaveBeenCalledTimes(1); + const capturedError = mockCaptureUnhandledCrash.mock.calls[0][0]; + expect(capturedError.message).toBe('boom'); + expect(processExitSpy).toHaveBeenCalledWith(1); + }); + + it('unhandledRejection handler wraps non-Error reasons', async () => { + const { installCrashReporter } = await import('./crash-reporter.js'); + installCrashReporter(); + + const rejectionHandler = processOnSpy.mock.calls.find((c) => c[0] === 'unhandledRejection')?.[1] as ( + reason: unknown, + ) => void; + + rejectionHandler('string reason'); + + expect(mockCaptureUnhandledCrash).toHaveBeenCalledTimes(1); + const capturedError = mockCaptureUnhandledCrash.mock.calls[0][0]; + expect(capturedError.message).toBe('string reason'); + }); + + it('isCrashing guard prevents recursive handling', async () => { + // Simulate the crash handler being called, then itself crashing + mockCaptureUnhandledCrash.mockImplementationOnce(() => { + // First call succeeds + }); + + const { installCrashReporter } = await import('./crash-reporter.js'); + installCrashReporter(); + + const uncaughtHandler = processOnSpy.mock.calls.find((c) => c[0] === 'uncaughtException')?.[1] as ( + err: Error, + ) => void; + + // First call sets isCrashing + uncaughtHandler(new Error('first')); + // Second call should be guarded (module-level isCrashing = true) + uncaughtHandler(new Error('second')); + + // Only first call should have reached analytics + expect(mockCaptureUnhandledCrash).toHaveBeenCalledTimes(1); + }); + + it('handlers are synchronous (no async in the critical path)', async () => { + const { installCrashReporter } = await import('./crash-reporter.js'); + installCrashReporter(); + + const uncaughtHandler = processOnSpy.mock.calls.find((c) => c[0] === 'uncaughtException')?.[1] as ( + err: Error, + ) => void; + + // Verify the handler returns void, not a Promise + const result = uncaughtHandler(new Error('sync test')); + expect(result).toBeUndefined(); + }); + }); +}); diff --git a/src/utils/crash-reporter.ts b/src/utils/crash-reporter.ts new file mode 100644 index 00000000..14207793 --- /dev/null +++ b/src/utils/crash-reporter.ts @@ -0,0 +1,55 @@ +import { analytics } from './analytics.js'; +import { homedir } from 'node:os'; + +const MAX_STACK_LENGTH = 4096; +let isCrashing = false; + +/** + * Sanitize stack trace: strip absolute paths to relative, remove home dir. + * Prevents leaking file system layout in telemetry events. + */ +export function sanitizeStack(stack: string | undefined): string { + if (!stack) return ''; + const home = homedir(); + let sanitized = stack; + sanitized = sanitized.replaceAll(home, '~'); + sanitized = sanitized.replace(/\/[^\s:]+\/(node_modules|dist|src)\//g, '$1/'); + return sanitized.length > MAX_STACK_LENGTH + ? sanitized.slice(0, MAX_STACK_LENGTH) + '\n...[truncated]' + : sanitized; +} + +/** + * Register global handlers for uncaughtException and unhandledRejection + * that capture crash details before the process exits. + * + * Handlers are SYNCHRONOUS. Node does NOT await async uncaughtException handlers. + * We queue the event synchronously; store-forward's process.on('exit') handler + * persists it to disk. The next CLI invocation recovers and sends. + */ +export function installCrashReporter(): void { + process.on('uncaughtException', (error) => { + reportCrashSync(error); + process.exit(1); + }); + + process.on('unhandledRejection', (reason) => { + const error = reason instanceof Error ? reason : new Error(String(reason)); + reportCrashSync(error); + process.exit(1); + }); +} + +function reportCrashSync(error: Error): void { + if (isCrashing) return; + isCrashing = true; + try { + // Sanitize the stack before passing to analytics + const sanitized = new Error(error.message); + sanitized.name = error.name; + sanitized.stack = sanitizeStack(error.stack); + analytics.captureUnhandledCrash(sanitized); + } catch { + // Telemetry must never prevent exit + } +} diff --git a/src/utils/register-subcommand.ts b/src/utils/register-subcommand.ts index 39cb8b2f..610f4f35 100644 --- a/src/utils/register-subcommand.ts +++ b/src/utils/register-subcommand.ts @@ -1,5 +1,7 @@ import yargs from 'yargs'; import type { Argv } from 'yargs'; +import { wrapCommandHandler } from './command-telemetry.js'; +import { WORKOS_TELEMETRY_ENABLED } from '../lib/constants.js'; interface YargsOptions { demandedOptions: Record; @@ -43,5 +45,6 @@ export function registerSubcommand( // Builder threw during probe — fall back to unenriched description } - return parentYargs.command(usage, enrichedDescription, builder, handler); + const telemetryHandler = WORKOS_TELEMETRY_ENABLED ? wrapCommandHandler(handler) : handler; + return parentYargs.command(usage, enrichedDescription, builder, telemetryHandler); } diff --git a/src/utils/telemetry-client.spec.ts b/src/utils/telemetry-client.spec.ts index c4781e5a..23609263 100644 --- a/src/utils/telemetry-client.spec.ts +++ b/src/utils/telemetry-client.spec.ts @@ -15,6 +15,18 @@ vi.mock('../lib/credentials.js', () => ({ getCredentials: () => mockGetCredentials(), })); +// Mock fs for persistToFile tests +const mockMkdirSync = vi.fn(); +const mockWriteFileSync = vi.fn(); +vi.mock('node:fs', async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + mkdirSync: (...args: unknown[]) => mockMkdirSync(...args), + writeFileSync: (...args: unknown[]) => mockWriteFileSync(...args), + }; +}); + // Import after mocks are set up const { TelemetryClient } = await import('./telemetry-client.js'); @@ -136,15 +148,26 @@ describe('TelemetryClient', () => { expect(mockFetch).toHaveBeenCalledTimes(1); }); - it('clears events even if flush fails', async () => { + it('retains events when flush fails (for store-forward)', async () => { mockFetch.mockRejectedValueOnce(new Error('Network error')); client.setGatewayUrl('http://localhost:8000'); client.queueEvent({ type: 'session.start', sessionId: '123', timestamp: new Date().toISOString() }); - await client.flush(); // Should not throw - await client.flush(); // Should be no-op + await client.flush(); // Should not throw, events retained + await client.flush(); // Should retry since events are still queued - expect(mockFetch).toHaveBeenCalledTimes(1); + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it('retains events on non-ok response (for store-forward)', async () => { + mockFetch.mockResolvedValueOnce({ ok: false, status: 500 }); + client.setGatewayUrl('http://localhost:8000'); + client.queueEvent({ type: 'session.start', sessionId: '123', timestamp: new Date().toISOString() }); + + await client.flush(); // Events retained on 500 + await client.flush(); // Should retry + + expect(mockFetch).toHaveBeenCalledTimes(2); }); it('handles network errors silently', async () => { @@ -182,4 +205,79 @@ describe('TelemetryClient', () => { ); }); }); + + describe('replaceLastEventOfType', () => { + it('removes the last event of the specified type', async () => { + client.setGatewayUrl('http://localhost:8000'); + client.queueEvent({ type: 'command', sessionId: '1', timestamp: '2024-01-01T00:00:00Z' }); + client.queueEvent({ type: 'session.start', sessionId: '1', timestamp: '2024-01-01T00:00:01Z' }); + client.queueEvent({ type: 'command', sessionId: '1', timestamp: '2024-01-01T00:00:02Z' }); + + client.replaceLastEventOfType('command'); + + await client.flush(); + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + expect(body.events).toHaveLength(2); + expect(body.events[0].type).toBe('command'); + expect(body.events[1].type).toBe('session.start'); + }); + + it('does nothing if no event of that type exists', () => { + client.queueEvent({ type: 'session.start', sessionId: '1', timestamp: '2024-01-01T00:00:00Z' }); + // Should not throw + client.replaceLastEventOfType('command'); + }); + }); + + describe('queueEvents', () => { + it('queues multiple events at once', async () => { + client.setGatewayUrl('http://localhost:8000'); + client.queueEvents([ + { type: 'command', sessionId: '1', timestamp: '2024-01-01T00:00:00Z' }, + { type: 'crash', sessionId: '1', timestamp: '2024-01-01T00:00:01Z' }, + ]); + + await client.flush(); + const body = JSON.parse(mockFetch.mock.calls[0][1].body); + expect(body.events).toHaveLength(2); + }); + }); + + describe('persistToFile', () => { + beforeEach(() => { + mockMkdirSync.mockReset(); + mockWriteFileSync.mockReset(); + }); + + it('writes events to file and clears queue', async () => { + client.queueEvent({ type: 'session.start', sessionId: '1', timestamp: '2024-01-01T00:00:00Z' }); + client.persistToFile('/tmp/test-persist.json'); + + expect(mockMkdirSync).toHaveBeenCalledWith('/tmp', { recursive: true }); + expect(mockWriteFileSync).toHaveBeenCalledWith( + '/tmp/test-persist.json', + expect.stringContaining('session.start'), + 'utf-8', + ); + + // Queue should be empty after persist + client.setGatewayUrl('http://localhost:8000'); + await client.flush(); + expect(mockFetch).not.toHaveBeenCalled(); + }); + + it('does nothing when no events queued', () => { + client.persistToFile('/tmp/test-persist.json'); + expect(mockWriteFileSync).not.toHaveBeenCalled(); + }); + + it('fails silently on write error', () => { + mockMkdirSync.mockImplementation(() => { + throw new Error('EACCES'); + }); + + client.queueEvent({ type: 'session.start', sessionId: '1', timestamp: '2024-01-01T00:00:00Z' }); + expect(() => client.persistToFile('/tmp/test-persist.json')).not.toThrow(); + }); + }); }); diff --git a/src/utils/telemetry-client.ts b/src/utils/telemetry-client.ts index f85aea1d..bf20dcc1 100644 --- a/src/utils/telemetry-client.ts +++ b/src/utils/telemetry-client.ts @@ -1,3 +1,5 @@ +import { mkdirSync, writeFileSync } from 'node:fs'; +import { dirname } from 'node:path'; import { debug } from './debug.js'; import type { TelemetryEvent, TelemetryRequest } from './telemetry-types.js'; import { getCredentials } from '../lib/credentials.js'; @@ -23,6 +25,26 @@ export class TelemetryClient { this.events.push(event); } + /** + * Remove the last queued event of a given type. + * Used to swap a provisional event with an updated one. + */ + replaceLastEventOfType(type: TelemetryEvent['type']): void { + for (let i = this.events.length - 1; i >= 0; i--) { + if (this.events[i].type === type) { + this.events.splice(i, 1); + return; + } + } + } + + /** + * Queue multiple pre-formed events (used by store-forward recovery). + */ + queueEvents(events: TelemetryEvent[]): void { + this.events.push(...events); + } + async flush(): Promise { if (this.events.length === 0) return; if (!this.gatewayUrl) { @@ -31,7 +53,7 @@ export class TelemetryClient { } const payload: TelemetryRequest = { events: [...this.events] }; - this.events = []; + // DO NOT clear this.events yet — retain until success const headers: Record = { 'Content-Type': 'application/json', @@ -56,15 +78,34 @@ export class TelemetryClient { signal: controller.signal, }); - if (!response.ok) { + if (response.ok) { + this.events = []; + } else { debug(`[Telemetry] Failed to send: ${response.status}`); + // Events remain in queue for store-forward to persist } } catch (error) { debug(`[Telemetry] Error sending events: ${error}`); + // Events remain in queue for store-forward to persist } finally { clearTimeout(timeout); } } + + /** + * Synchronously write pending events to a file. + * Used as last resort in process.on('exit') handler. + */ + persistToFile(filePath: string): void { + if (this.events.length === 0) return; + try { + mkdirSync(dirname(filePath), { recursive: true }); + writeFileSync(filePath, JSON.stringify(this.events), 'utf-8'); + this.events = []; + } catch { + // Silent failure — telemetry must never block exit + } + } } export const telemetryClient = new TelemetryClient(); diff --git a/src/utils/telemetry-store-forward.spec.ts b/src/utils/telemetry-store-forward.spec.ts new file mode 100644 index 00000000..28367b42 --- /dev/null +++ b/src/utils/telemetry-store-forward.spec.ts @@ -0,0 +1,159 @@ +import { describe, it, expect, beforeEach, vi } from 'vitest'; + +const mockPersistToFile = vi.fn(); +const mockQueueEvents = vi.fn(); +const mockFlush = vi.fn().mockResolvedValue(undefined); + +vi.mock('./telemetry-client.js', () => ({ + telemetryClient: { + persistToFile: (...args: unknown[]) => mockPersistToFile(...args), + queueEvents: (...args: unknown[]) => mockQueueEvents(...args), + flush: () => mockFlush(), + }, +})); + +vi.mock('./debug.js', () => ({ + debug: vi.fn(), +})); + +const mockExistsSync = vi.fn(); +const mockReaddirSync = vi.fn(); +const mockReadFileSync = vi.fn(); +const mockUnlinkSync = vi.fn(); + +vi.mock('node:fs', async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + existsSync: (...args: unknown[]) => mockExistsSync(...args), + readdirSync: (...args: unknown[]) => mockReaddirSync(...args), + readFileSync: (...args: unknown[]) => mockReadFileSync(...args), + unlinkSync: (...args: unknown[]) => mockUnlinkSync(...args), + }; +}); + +describe('telemetry-store-forward', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + describe('installStoreForward', () => { + it('registers a process exit handler', async () => { + const onSpy = vi.spyOn(process, 'on'); + vi.resetModules(); + + const { installStoreForward } = await import('./telemetry-store-forward.js'); + installStoreForward(); + + const exitHandlers = onSpy.mock.calls.filter((c) => c[0] === 'exit'); + expect(exitHandlers.length).toBeGreaterThanOrEqual(1); + + onSpy.mockRestore(); + }); + + it('exit handler calls persistToFile with PID-based path', async () => { + const onSpy = vi.spyOn(process, 'on'); + vi.resetModules(); + + const { installStoreForward } = await import('./telemetry-store-forward.js'); + installStoreForward(); + + const exitHandler = onSpy.mock.calls.find((c) => c[0] === 'exit')?.[1] as () => void; + exitHandler(); + + expect(mockPersistToFile).toHaveBeenCalledTimes(1); + const filePath = mockPersistToFile.mock.calls[0][0] as string; + expect(filePath).toContain('workos-cli-telemetry'); + expect(filePath).toContain(`pending-${process.pid}`); + + onSpy.mockRestore(); + }); + }); + + describe('recoverPendingEvents', () => { + it('does nothing if pending dir does not exist', async () => { + mockExistsSync.mockReturnValue(false); + vi.resetModules(); + + const { recoverPendingEvents } = await import('./telemetry-store-forward.js'); + await recoverPendingEvents(); + + expect(mockReaddirSync).not.toHaveBeenCalled(); + expect(mockQueueEvents).not.toHaveBeenCalled(); + }); + + it('reads and queues events from pending files', async () => { + mockExistsSync.mockReturnValue(true); + mockReaddirSync.mockReturnValue(['pending-1234.json', 'pending-5678.json']); + const events1 = [{ type: 'command', sessionId: '1', timestamp: '2024-01-01T00:00:00Z' }]; + const events2 = [{ type: 'crash', sessionId: '2', timestamp: '2024-01-01T00:00:01Z' }]; + mockReadFileSync + .mockReturnValueOnce(JSON.stringify(events1)) + .mockReturnValueOnce(JSON.stringify(events2)); + + vi.resetModules(); + const { recoverPendingEvents } = await import('./telemetry-store-forward.js'); + await recoverPendingEvents(); + + expect(mockQueueEvents).toHaveBeenCalledTimes(2); + expect(mockQueueEvents).toHaveBeenCalledWith(events1); + expect(mockQueueEvents).toHaveBeenCalledWith(events2); + expect(mockFlush).toHaveBeenCalledTimes(1); + }); + + it('deletes files immediately after reading (before send)', async () => { + mockExistsSync.mockReturnValue(true); + mockReaddirSync.mockReturnValue(['pending-1234.json']); + mockReadFileSync.mockReturnValue(JSON.stringify([{ type: 'command', sessionId: '1', timestamp: 'x' }])); + + vi.resetModules(); + const { recoverPendingEvents } = await import('./telemetry-store-forward.js'); + await recoverPendingEvents(); + + // unlinkSync should be called before flush + expect(mockUnlinkSync).toHaveBeenCalledTimes(1); + const unlinkPath = mockUnlinkSync.mock.calls[0][0] as string; + expect(unlinkPath).toContain('pending-1234.json'); + }); + + it('handles corrupted files gracefully', async () => { + mockExistsSync.mockReturnValue(true); + mockReaddirSync.mockReturnValue(['pending-bad.json']); + mockReadFileSync.mockReturnValue('not valid json{{{'); + + vi.resetModules(); + const { recoverPendingEvents } = await import('./telemetry-store-forward.js'); + + // Should not throw + await expect(recoverPendingEvents()).resolves.toBeUndefined(); + // Should try to delete the corrupted file + expect(mockUnlinkSync).toHaveBeenCalled(); + }); + + it('skips non-pending files', async () => { + mockExistsSync.mockReturnValue(true); + mockReaddirSync.mockReturnValue(['pending-1234.json', 'other-file.txt', 'readme.md']); + + const events = [{ type: 'command', sessionId: '1', timestamp: 'x' }]; + mockReadFileSync.mockReturnValue(JSON.stringify(events)); + + vi.resetModules(); + const { recoverPendingEvents } = await import('./telemetry-store-forward.js'); + await recoverPendingEvents(); + + expect(mockReadFileSync).toHaveBeenCalledTimes(1); + }); + + it('skips empty event arrays', async () => { + mockExistsSync.mockReturnValue(true); + mockReaddirSync.mockReturnValue(['pending-1234.json']); + mockReadFileSync.mockReturnValue('[]'); + + vi.resetModules(); + const { recoverPendingEvents } = await import('./telemetry-store-forward.js'); + await recoverPendingEvents(); + + expect(mockQueueEvents).not.toHaveBeenCalled(); + }); + }); +}); diff --git a/src/utils/telemetry-store-forward.ts b/src/utils/telemetry-store-forward.ts new file mode 100644 index 00000000..1339086f --- /dev/null +++ b/src/utils/telemetry-store-forward.ts @@ -0,0 +1,57 @@ +import { readFileSync, readdirSync, unlinkSync, existsSync } from 'node:fs'; +import { join } from 'node:path'; +import { tmpdir } from 'node:os'; +import { telemetryClient } from './telemetry-client.js'; +import { debug } from './debug.js'; + +const PENDING_DIR = join(tmpdir(), 'workos-cli-telemetry'); +const PENDING_FILE = join(PENDING_DIR, `pending-${process.pid}.json`); + +/** + * Register a sync exit handler that persists unsent events to disk. + * Called once at startup. Uses PID in filename to prevent concurrent + * CLI invocations from colliding. + */ +export function installStoreForward(): void { + process.on('exit', () => { + telemetryClient.persistToFile(PENDING_FILE); + }); +} + +/** + * On startup, check for ANY pending files from previous invocations + * (could be from different PIDs) and send them. Non-blocking, fire-and-forget. + */ +export async function recoverPendingEvents(): Promise { + try { + if (!existsSync(PENDING_DIR)) return; + const files = readdirSync(PENDING_DIR).filter( + (f) => f.startsWith('pending-') && f.endsWith('.json'), + ); + + for (const file of files) { + const filePath = join(PENDING_DIR, file); + try { + const raw = readFileSync(filePath, 'utf-8'); + unlinkSync(filePath); // Delete immediately to avoid double-send + + const events = JSON.parse(raw); + if (Array.isArray(events) && events.length > 0) { + telemetryClient.queueEvents(events); + } + } catch { + // Corrupted file — delete and move on + try { + unlinkSync(filePath); + } catch { + /* ignore */ + } + } + } + + // Flush all recovered events in one batch + await telemetryClient.flush(); + } catch { + debug('[Telemetry] Store-forward recovery failed silently'); + } +} From b641538414dc04fa54ab9726342ff392ee53ca7f Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Tue, 14 Apr 2026 14:42:17 -0500 Subject: [PATCH 03/10] feat: add WORKOS_DEBUG=1 env var for verbose logging on all commands - Enable debug output for non-installer commands via env var - Log telemetry event details (type, name, duration, attributes) on flush - Register in debug command's env var catalog --- src/bin.ts | 7 +++++++ src/commands/debug.ts | 1 + src/utils/telemetry-client.ts | 15 ++++++++++++++- 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/bin.ts b/src/bin.ts index d8334468..50698ed8 100644 --- a/src/bin.ts +++ b/src/bin.ts @@ -37,6 +37,13 @@ import { installStoreForward, recoverPendingEvents } from './utils/telemetry-sto import { commandTelemetryMiddleware } from './utils/command-telemetry.js'; import { analytics } from './utils/analytics.js'; +// Enable debug logging for all commands via env var. +// Subsumes the installer's --debug flag for non-installer commands. +if (process.env.WORKOS_DEBUG === '1') { + const { enableDebugLogs } = await import('./utils/debug.js'); + enableDebugLogs(); +} + // Telemetry infrastructure: crash reporter, store-forward, and gateway init. // Must be before yargs so crashes during startup are captured. installCrashReporter(); diff --git a/src/commands/debug.ts b/src/commands/debug.ts index c523d402..a711e448 100644 --- a/src/commands/debug.ts +++ b/src/commands/debug.ts @@ -321,6 +321,7 @@ interface EnvVarInfo { } const ENV_VAR_CATALOG: { name: string; effect: string }[] = [ + { name: 'WORKOS_DEBUG', effect: 'Set to "1" to enable verbose debug logging for all commands' }, { name: 'WORKOS_API_KEY', effect: 'Bypasses credential resolution — used directly for API calls' }, { name: 'WORKOS_FORCE_TTY', effect: 'Forces human (non-JSON) output mode, even when piped' }, { name: 'WORKOS_NO_PROMPT', effect: 'Forces non-interactive/JSON mode' }, diff --git a/src/utils/telemetry-client.ts b/src/utils/telemetry-client.ts index bf20dcc1..57f442f7 100644 --- a/src/utils/telemetry-client.ts +++ b/src/utils/telemetry-client.ts @@ -69,7 +69,20 @@ export class TelemetryClient { const timeout = setTimeout(() => controller.abort(), 3000); try { - debug(`[Telemetry] Sending ${payload.events.length} events to ${this.gatewayUrl}/telemetry`); + const eventSummary = payload.events.map((e) => { + const attrs = e.attributes ?? {}; + switch (e.type) { + case 'session.start': return `session.start(mode=${attrs['installer.mode']}, os=${attrs['env.os']})`; + case 'session.end': return `session.end(outcome=${attrs['installer.outcome']}, duration=${attrs['installer.duration_ms']}ms)`; + case 'step': return `step(${(e as any).name}, ${(e as any).durationMs}ms, success=${(e as any).success})`; + case 'agent.tool': return `agent.tool(${(e as any).toolName}, ${(e as any).durationMs}ms)`; + case 'agent.llm': return `agent.llm(${(e as any).model}, in=${(e as any).inputTokens}, out=${(e as any).outputTokens})`; + case 'command': return `command(${attrs['command.name']}, ${attrs['command.duration_ms']}ms, success=${attrs['command.success']})`; + case 'crash': return `crash(${attrs['crash.error_type']}: ${attrs['crash.error_message']})`; + default: return e.type; + } + }).join('\n '); + debug(`[Telemetry] Sending ${payload.events.length} events to ${this.gatewayUrl}/telemetry:\n ${eventSummary}`); const response = await fetch(`${this.gatewayUrl}/telemetry`, { method: 'POST', From f0170759e9955758638763c9dde2f5fda9c36123 Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Tue, 14 Apr 2026 15:31:03 -0500 Subject: [PATCH 04/10] fix: address review findings for telemetry implementation - Wrap inline command handlers (seed, setup-org, doctor, etc.) with wrapCommandHandler so they report real duration/success - Skip provisional telemetry event for install command (has own session telemetry) - Add claim -> env.claim to canonical alias map - Defer store-forward file deletion until after successful flush --- src/bin.ts | 38 ++++++++++++++-------------- src/lib/command-aliases.ts | 1 + src/utils/command-telemetry.ts | 7 +++++ src/utils/telemetry-store-forward.ts | 20 ++++++++++----- 4 files changed, 40 insertions(+), 26 deletions(-) diff --git a/src/bin.ts b/src/bin.ts index 50698ed8..691f92ff 100644 --- a/src/bin.ts +++ b/src/bin.ts @@ -34,7 +34,7 @@ import { registerSubcommand } from './utils/register-subcommand.js'; import { COMMAND_ALIASES } from './lib/command-aliases.js'; import { installCrashReporter } from './utils/crash-reporter.js'; import { installStoreForward, recoverPendingEvents } from './utils/telemetry-store-forward.js'; -import { commandTelemetryMiddleware } from './utils/command-telemetry.js'; +import { commandTelemetryMiddleware, wrapCommandHandler } from './utils/command-telemetry.js'; import { analytics } from './utils/analytics.js'; // Enable debug logging for all commands via env var. @@ -365,10 +365,10 @@ yargs(rawArgs) description: 'Copy report to clipboard', }, }), - async (argv) => { + wrapCommandHandler(async (argv) => { const { handleDoctor } = await import('./commands/doctor.js'); await handleDoctor(argv); - }, + }), ) // NOTE: When adding commands here, also update src/utils/help-json.ts .command('env', 'Manage environment configurations (API keys, endpoints, active environment)', (yargs) => { @@ -2040,7 +2040,7 @@ yargs(rawArgs) clean: { type: 'boolean', default: false, describe: 'Tear down seeded resources' }, init: { type: 'boolean', default: false, describe: 'Create an example workos-seed.yml file' }, }), - async (argv) => { + wrapCommandHandler(async (argv) => { await applyInsecureStorage(argv.insecureStorage); const { resolveApiKey, resolveApiBaseUrl } = await import('./lib/api-key.js'); const { runSeed } = await import('./commands/seed.js'); @@ -2049,7 +2049,7 @@ yargs(rawArgs) resolveApiKey({ apiKey: argv.apiKey }), resolveApiBaseUrl(), ); - }, + }), ) .command( 'setup-org ', @@ -2061,7 +2061,7 @@ yargs(rawArgs) domain: { type: 'string', describe: 'Domain to add and verify' }, roles: { type: 'string', describe: 'Comma-separated role slugs to create' }, }), - async (argv) => { + wrapCommandHandler(async (argv) => { await applyInsecureStorage(argv.insecureStorage); const { resolveApiKey, resolveApiBaseUrl } = await import('./lib/api-key.js'); const { runSetupOrg } = await import('./commands/setup-org.js'); @@ -2070,7 +2070,7 @@ yargs(rawArgs) resolveApiKey({ apiKey: argv.apiKey }), resolveApiBaseUrl(), ); - }, + }), ) .command( 'onboard-user ', @@ -2083,7 +2083,7 @@ yargs(rawArgs) role: { type: 'string', describe: 'Role slug to assign' }, wait: { type: 'boolean', default: false, describe: 'Wait for invitation acceptance' }, }), - async (argv) => { + wrapCommandHandler(async (argv) => { await applyInsecureStorage(argv.insecureStorage); const { resolveApiKey, resolveApiBaseUrl } = await import('./lib/api-key.js'); const { runOnboardUser } = await import('./commands/onboard-user.js'); @@ -2092,7 +2092,7 @@ yargs(rawArgs) resolveApiKey({ apiKey: argv.apiKey }), resolveApiBaseUrl(), ); - }, + }), ) .command( 'debug-sso ', @@ -2102,12 +2102,12 @@ yargs(rawArgs) ...insecureStorageOption, 'api-key': { type: 'string' as const, describe: 'WorkOS API key' }, }), - async (argv) => { + wrapCommandHandler(async (argv) => { await applyInsecureStorage(argv.insecureStorage); const { resolveApiKey, resolveApiBaseUrl } = await import('./lib/api-key.js'); const { runDebugSso } = await import('./commands/debug-sso.js'); await runDebugSso(argv.connectionId, resolveApiKey({ apiKey: argv.apiKey }), resolveApiBaseUrl()); - }, + }), ) .command( 'debug-sync ', @@ -2117,12 +2117,12 @@ yargs(rawArgs) ...insecureStorageOption, 'api-key': { type: 'string' as const, describe: 'WorkOS API key' }, }), - async (argv) => { + wrapCommandHandler(async (argv) => { await applyInsecureStorage(argv.insecureStorage); const { resolveApiKey, resolveApiBaseUrl } = await import('./lib/api-key.js'); const { runDebugSync } = await import('./commands/debug-sync.js'); await runDebugSync(argv.directoryId, resolveApiKey({ apiKey: argv.apiKey }), resolveApiBaseUrl()); - }, + }), ) // Alias — canonical command is `workos env claim` .command( @@ -2132,11 +2132,11 @@ yargs(rawArgs) yargs.options({ ...insecureStorageOption, }), - async (argv) => { + wrapCommandHandler(async (argv) => { await applyInsecureStorage(argv.insecureStorage); const { runClaim } = await import('./commands/claim.js'); await runClaim(); - }, + }), ) .command( 'install', @@ -2157,10 +2157,10 @@ yargs(rawArgs) port: { type: 'number', default: 4100, describe: 'Port to listen on' }, seed: { type: 'string', describe: 'Path to seed config file (YAML or JSON)' }, }), - async (argv) => { + wrapCommandHandler(async (argv) => { const { runEmulate } = await import('./commands/emulate.js'); await runEmulate({ port: argv.port, seed: argv.seed, json: argv.json as boolean }); - }, + }), ) .command( 'dev', @@ -2170,14 +2170,14 @@ yargs(rawArgs) port: { type: 'number', default: 4100, describe: 'Emulator port' }, seed: { type: 'string', describe: 'Path to seed config file' }, }), - async (argv) => { + wrapCommandHandler(async (argv) => { const { runDev } = await import('./commands/dev.js'); await runDev({ port: argv.port, seed: argv.seed, '--': argv['--'] as string[] | undefined, }); - }, + }), ) .command('debug', false, (yargs) => { yargs.options(insecureStorageOption); diff --git a/src/lib/command-aliases.ts b/src/lib/command-aliases.ts index 8f515ae3..7a8632ef 100644 --- a/src/lib/command-aliases.ts +++ b/src/lib/command-aliases.ts @@ -7,4 +7,5 @@ */ export const COMMAND_ALIASES: Record = { org: 'organization', + claim: 'env.claim', }; diff --git a/src/utils/command-telemetry.ts b/src/utils/command-telemetry.ts index 57603b19..887ba472 100644 --- a/src/utils/command-telemetry.ts +++ b/src/utils/command-telemetry.ts @@ -2,6 +2,9 @@ import { analytics } from './analytics.js'; import { COMMAND_ALIASES } from '../lib/command-aliases.js'; import { WORKOS_TELEMETRY_ENABLED } from '../lib/constants.js'; +/** Commands that have their own telemetry (e.g., installer session events). */ +const SKIP_TELEMETRY_COMMANDS = new Set(['install']); + /** * Resolve user-typed command parts to their canonical name. * Applies alias mapping to the top-level command only. @@ -51,6 +54,10 @@ export function commandTelemetryMiddleware(rawArgs: string[]) { argv.__telemetryStartTime = startTime; argv.__telemetryFlags = flags; + // Skip provisional event for commands with their own telemetry (e.g., install) + const topLevelCommand = commandParts[0] ?? ''; + if (SKIP_TELEMETRY_COMMANDS.has(topLevelCommand)) return; + // Queue provisional event NOW, before the handler runs. // If the handler calls process.exit(), store-forward persists this. analytics.commandExecuted(commandName, 0, true, { flags }); diff --git a/src/utils/telemetry-store-forward.ts b/src/utils/telemetry-store-forward.ts index 1339086f..1ed7a70d 100644 --- a/src/utils/telemetry-store-forward.ts +++ b/src/utils/telemetry-store-forward.ts @@ -29,28 +29,34 @@ export async function recoverPendingEvents(): Promise { (f) => f.startsWith('pending-') && f.endsWith('.json'), ); + const recoveredFiles: string[] = []; for (const file of files) { const filePath = join(PENDING_DIR, file); try { const raw = readFileSync(filePath, 'utf-8'); - unlinkSync(filePath); // Delete immediately to avoid double-send - const events = JSON.parse(raw); if (Array.isArray(events) && events.length > 0) { telemetryClient.queueEvents(events); + recoveredFiles.push(filePath); + } else { + // Empty file — delete immediately + try { unlinkSync(filePath); } catch { /* ignore */ } } } catch { // Corrupted file — delete and move on - try { - unlinkSync(filePath); - } catch { - /* ignore */ - } + try { unlinkSync(filePath); } catch { /* ignore */ } } } // Flush all recovered events in one batch await telemetryClient.flush(); + + // Only delete files AFTER successful flush. + // If flush fails, events are retained in memory and will be + // re-persisted by the exit handler (store-forward). + for (const filePath of recoveredFiles) { + try { unlinkSync(filePath); } catch { /* ignore */ } + } } catch { debug('[Telemetry] Store-forward recovery failed silently'); } From 77adb3391661ae33c9058b6e4b356ea79ed922da Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Tue, 14 Apr 2026 18:12:04 -0500 Subject: [PATCH 05/10] fix: drop telemetry events on 4xx responses to prevent accumulation Client errors (401, 403) are permanent failures that won't succeed on retry. Only retain events for 5xx (transient server errors) and network failures where store-forward retry is meaningful. --- src/utils/telemetry-client.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/utils/telemetry-client.ts b/src/utils/telemetry-client.ts index 57f442f7..877cdd81 100644 --- a/src/utils/telemetry-client.ts +++ b/src/utils/telemetry-client.ts @@ -95,7 +95,11 @@ export class TelemetryClient { this.events = []; } else { debug(`[Telemetry] Failed to send: ${response.status}`); - // Events remain in queue for store-forward to persist + // Clear on 4xx (permanent failures like 401/403 that won't succeed on retry). + // Retain only on 5xx (server errors that may be transient). + if (response.status >= 400 && response.status < 500) { + this.events = []; + } } } catch (error) { debug(`[Telemetry] Error sending events: ${error}`); From fc64eae4979fc3a0708e2f5e29e9833ec311fc65 Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Tue, 14 Apr 2026 18:21:33 -0500 Subject: [PATCH 06/10] fix: flush returns boolean, splice prevents race, in-process delivery - flush() returns true (sent/dropped) or false (retryable) so callers can act on the result - Use splice(0, count) instead of clearing all events, protecting events queued concurrently during the fetch - wrapCommandHandler flushes in-process so events are sent immediately instead of always deferring to next invocation via store-forward - Store-forward recovery deletes files after loading into memory (events are re-persisted by exit handler if flush fails) - Skip provisional events for dashboard and $0 (installer entry points) - Add 4xx drop test coverage --- src/utils/command-telemetry.ts | 10 ++++++++-- src/utils/telemetry-client.spec.ts | 23 +++++++++++++++++------ src/utils/telemetry-client.ts | 28 +++++++++++++++++++--------- src/utils/telemetry-store-forward.ts | 12 ++++++------ 4 files changed, 50 insertions(+), 23 deletions(-) diff --git a/src/utils/command-telemetry.ts b/src/utils/command-telemetry.ts index 887ba472..facd812c 100644 --- a/src/utils/command-telemetry.ts +++ b/src/utils/command-telemetry.ts @@ -1,9 +1,11 @@ import { analytics } from './analytics.js'; +import { telemetryClient } from './telemetry-client.js'; import { COMMAND_ALIASES } from '../lib/command-aliases.js'; import { WORKOS_TELEMETRY_ENABLED } from '../lib/constants.js'; -/** Commands that have their own telemetry (e.g., installer session events). */ -const SKIP_TELEMETRY_COMMANDS = new Set(['install']); +/** Commands that have their own telemetry (e.g., installer session events). + * 'root' is the default $0 handler which prompts to run the installer. */ +const SKIP_TELEMETRY_COMMANDS = new Set(['install', 'dashboard', 'root']); /** * Resolve user-typed command parts to their canonical name. @@ -88,6 +90,10 @@ export function wrapCommandHandler( flags, }); throw error; + } finally { + // Flush in-process so events are sent immediately, not deferred to next invocation. + // If flush fails, store-forward persists on exit. + await telemetryClient.flush().catch(() => {}); } }; } diff --git a/src/utils/telemetry-client.spec.ts b/src/utils/telemetry-client.spec.ts index 23609263..4746f598 100644 --- a/src/utils/telemetry-client.spec.ts +++ b/src/utils/telemetry-client.spec.ts @@ -170,22 +170,33 @@ describe('TelemetryClient', () => { expect(mockFetch).toHaveBeenCalledTimes(2); }); - it('handles network errors silently', async () => { + it('returns false on network errors (retryable)', async () => { mockFetch.mockRejectedValueOnce(new Error('Network error')); client.setGatewayUrl('http://localhost:8000'); client.queueEvent({ type: 'session.start', sessionId: '123', timestamp: new Date().toISOString() }); - // Should not throw - await expect(client.flush()).resolves.toBeUndefined(); + await expect(client.flush()).resolves.toBe(false); }); - it('handles non-ok responses silently', async () => { + it('returns false on 5xx (retryable, events retained)', async () => { mockFetch.mockResolvedValueOnce({ ok: false, status: 500 }); client.setGatewayUrl('http://localhost:8000'); client.queueEvent({ type: 'session.start', sessionId: '123', timestamp: new Date().toISOString() }); - // Should not throw - await expect(client.flush()).resolves.toBeUndefined(); + await expect(client.flush()).resolves.toBe(false); + }); + + it('drops events on 4xx and returns true (permanent failure)', async () => { + mockFetch.mockResolvedValueOnce({ ok: false, status: 401 }); + client.setGatewayUrl('http://localhost:8000'); + client.queueEvent({ type: 'session.start', sessionId: '123', timestamp: new Date().toISOString() }); + + const result = await client.flush(); + expect(result).toBe(true); + // Verify events were cleared — second flush should be a no-op + mockFetch.mockClear(); + await client.flush(); + expect(mockFetch).not.toHaveBeenCalled(); }); it('sends correct Content-Type header', async () => { diff --git a/src/utils/telemetry-client.ts b/src/utils/telemetry-client.ts index 877cdd81..5d20cf3d 100644 --- a/src/utils/telemetry-client.ts +++ b/src/utils/telemetry-client.ts @@ -45,15 +45,21 @@ export class TelemetryClient { this.events.push(...events); } - async flush(): Promise { - if (this.events.length === 0) return; + /** + * Flush queued events. Returns true if events were sent or intentionally + * dropped (4xx), false if they should be retried (5xx/network error). + * Uses splice to only remove the events that were in the snapshot, + * protecting any events queued concurrently during the fetch. + */ + async flush(): Promise { + if (this.events.length === 0) return true; if (!this.gatewayUrl) { debug('[Telemetry] No gateway URL configured, skipping flush'); - return; + return false; } - const payload: TelemetryRequest = { events: [...this.events] }; - // DO NOT clear this.events yet — retain until success + const count = this.events.length; + const payload: TelemetryRequest = { events: this.events.slice(0, count) }; const headers: Record = { 'Content-Type': 'application/json', @@ -92,18 +98,22 @@ export class TelemetryClient { }); if (response.ok) { - this.events = []; + this.events.splice(0, count); + return true; } else { debug(`[Telemetry] Failed to send: ${response.status}`); - // Clear on 4xx (permanent failures like 401/403 that won't succeed on retry). - // Retain only on 5xx (server errors that may be transient). + // Drop on 4xx (permanent failures like 401/403 won't succeed on retry). + // Retain on 5xx (transient server errors) for store-forward. if (response.status >= 400 && response.status < 500) { - this.events = []; + this.events.splice(0, count); + return true; // intentionally dropped } + return false; } } catch (error) { debug(`[Telemetry] Error sending events: ${error}`); // Events remain in queue for store-forward to persist + return false; } finally { clearTimeout(timeout); } diff --git a/src/utils/telemetry-store-forward.ts b/src/utils/telemetry-store-forward.ts index 1ed7a70d..bda57b4b 100644 --- a/src/utils/telemetry-store-forward.ts +++ b/src/utils/telemetry-store-forward.ts @@ -48,15 +48,15 @@ export async function recoverPendingEvents(): Promise { } } - // Flush all recovered events in one batch - await telemetryClient.flush(); - - // Only delete files AFTER successful flush. - // If flush fails, events are retained in memory and will be - // re-persisted by the exit handler (store-forward). + // Delete source files — events are now in memory regardless of flush outcome. + // If flush succeeds: events sent, done. + // If flush fails: events stay in memory, exit handler re-persists to new PID file. for (const filePath of recoveredFiles) { try { unlinkSync(filePath); } catch { /* ignore */ } } + + // Flush all recovered events in one batch + await telemetryClient.flush(); } catch { debug('[Telemetry] Store-forward recovery failed silently'); } From db821a62d4b0c3edfc90bf99c704945848d79771 Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Wed, 15 Apr 2026 09:27:42 -0500 Subject: [PATCH 07/10] docs: document telemetry wiring requirements for new commands Add a section to CLAUDE.md explaining which commands auto-emit telemetry (registerSubcommand) versus which need manual wrapCommandHandler wrapping (inline top-level .command() calls). Add a pointer comment in bin.ts near the workflow commands block. Prevents new top-level commands from silently emitting duration=0 telemetry. --- CLAUDE.md | 33 +++++++++++++++++++++++++++++++++ src/bin.ts | 4 ++++ 2 files changed, 37 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index c97a1678..0afd3004 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -48,6 +48,39 @@ pnpm typecheck # Type check 2. Register in `src/bin.ts` and update `src/utils/help-json.ts` command registry 3. Include JSON mode tests in spec file +## Telemetry Wiring for New Commands + +All commands auto-emit a `command` telemetry event with name, duration, and success/failure. How you register the command determines whether this is automatic: + +**Subcommands via `registerSubcommand()`** → auto-wired. Telemetry happens for free. + +```typescript +.command('user', 'Manage users', (yargs) => { + registerSubcommand(yargs, 'reset-password', '...', (y) => y, + async (argv) => { await runResetPassword(argv); }, // auto-wrapped + ); +}) +``` + +**Top-level `.command()` with inline handler** → MUST manually wrap with `wrapCommandHandler()`: + +```typescript +.command( + 'migrate', + 'Migrate from another provider', + (yargs) => yargs.options({...}), + wrapCommandHandler(async (argv) => { // <-- REQUIRED + await runMigrate(argv); + }), +) +``` + +If you forget `wrapCommandHandler`, the command still emits a telemetry event (queued by middleware), but duration will be `0` and success will always be `true` -- misleading data in dashboards. + +**Skip list**: commands in `SKIP_TELEMETRY_COMMANDS` (`command-telemetry.ts`) are excluded from command-level telemetry because they have their own session-based telemetry. Currently: `install`, `dashboard`, `root` (the default `$0` handler). Add to this set if you're building another installer entry point. + +**Aliases**: if you register a command with multiple names (e.g., `['organization', 'org']`), add the alias to `src/lib/command-aliases.ts` so metrics don't fragment across `org.list` and `organization.list`. + ## Do / Don't **Do:** diff --git a/src/bin.ts b/src/bin.ts index 691f92ff..44cfed84 100644 --- a/src/bin.ts +++ b/src/bin.ts @@ -2029,6 +2029,10 @@ yargs(rawArgs) return yargs.demandCommand(1, 'Please specify an org-domain subcommand').strict(); }) // --- Workflow Commands --- + // NOTE: Top-level `.command()` registrations with inline handlers MUST wrap + // the handler with `wrapCommandHandler()` for correct command telemetry. + // Subcommands registered via `registerSubcommand()` are auto-wrapped. + // See CLAUDE.md "Telemetry Wiring for New Commands". .command( 'seed', 'Seed WorkOS environment from a YAML config file', From d213fe17d1312e1646e9c52cff6cb206904e8455 Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Wed, 15 Apr 2026 09:39:26 -0500 Subject: [PATCH 08/10] feat: add user identification and unclaimed env support to telemetry - Add workos.user_id to command and crash events (from stored credentials or unclaimed environment clientId) so dashboards can count unique users - Add cli.version to command and crash events for release adoption tracking - Support claim-token auth path on the telemetry client, so unclaimed environments' telemetry reaches the API (guard accepts this path too) - Rename CrashEvent's installer.version to cli.version (crashes happen outside the installer too) - initForNonInstaller() now wires up user_id and claim-token auth --- src/utils/analytics.spec.ts | 8 +++++--- src/utils/analytics.ts | 33 ++++++++++++++++++++++++++++++--- src/utils/telemetry-client.ts | 16 ++++++++++++++++ src/utils/telemetry-types.ts | 5 ++++- 4 files changed, 55 insertions(+), 7 deletions(-) diff --git a/src/utils/analytics.spec.ts b/src/utils/analytics.spec.ts index c98531b8..f354e135 100644 --- a/src/utils/analytics.spec.ts +++ b/src/utils/analytics.spec.ts @@ -506,7 +506,7 @@ describe('Analytics', () => { 'crash.error_message': 'Unexpected failure', 'crash.stack': 'Error: Unexpected failure\n at foo.ts:1', 'crash.command': 'install', - 'installer.version': '1.0.0', + 'cli.version': '1.0.0', 'env.os': expect.any(String), 'env.node_version': expect.any(String), }), @@ -523,11 +523,13 @@ describe('Analytics', () => { expect(event.attributes['crash.stack'].length).toBe(4096); }); - it('defaults version to unknown when not provided', () => { + it('falls back to package version when not explicitly provided', () => { analytics.captureUnhandledCrash(new Error('test')); const event = mockQueueEvent.mock.calls.find((c) => c[0].type === 'crash')[0]; - expect(event.attributes['installer.version']).toBe('unknown'); + // Falls back to getVersion() which reads from package.json — any real version string + expect(event.attributes['cli.version']).toEqual(expect.any(String)); + expect(event.attributes['cli.version']).not.toBe(''); }); }); diff --git a/src/utils/analytics.ts b/src/utils/analytics.ts index 9253b6e6..14931df6 100644 --- a/src/utils/analytics.ts +++ b/src/utils/analytics.ts @@ -12,8 +12,9 @@ import type { CrashEvent, } from './telemetry-types.js'; import { WORKOS_TELEMETRY_ENABLED } from '../lib/constants.js'; -import { getLlmGatewayUrl } from '../lib/settings.js'; +import { getLlmGatewayUrl, getVersion } from '../lib/settings.js'; import { getCredentials } from '../lib/credentials.js'; +import { getActiveEnvironment, isUnclaimedEnvironment } from '../lib/config-store.js'; export class Analytics { private tags: Record = {}; @@ -47,7 +48,14 @@ export class Analytics { /** * Initialize telemetry for non-installer commands. - * Sets gatewayUrl from default config and loads stored JWT if available. + * Sets gatewayUrl from default config and loads auth credentials. + * + * Auth priority: + * 1. Stored JWT (Bearer) — logged-in users + * 2. Claim token — unclaimed environments + * 3. None — API-key-only users (telemetry silently dropped by API guard) + * + * Also captures the user/environment identifier for per-user analytics. * The installer flow sets these itself in run-with-core.ts; this covers * management commands like `org list`, `auth login`, etc. */ @@ -61,6 +69,22 @@ export class Analytics { if (creds?.accessToken) { telemetryClient.setAccessToken(creds.accessToken); } + if (creds?.userId) { + this.distinctId = creds.userId; + } + + // Check for unclaimed environment — fall back to claim-token auth + // so unclaimed users' telemetry still reaches the backend. + try { + const env = getActiveEnvironment(); + if (env && isUnclaimedEnvironment(env)) { + telemetryClient.setClaimTokenAuth(env.clientId, env.claimToken); + // Tag distinctId so unclaimed sessions are identifiable in analytics + this.distinctId = this.distinctId ?? `unclaimed:${env.clientId}`; + } + } catch { + // Config-store failure is non-fatal for telemetry + } } setTag(key: string, value: string | boolean | number | null | undefined) { @@ -215,6 +239,8 @@ export class Analytics { 'command.name': name, 'command.duration_ms': durationMs, 'command.success': success, + 'cli.version': getVersion(), + ...(this.distinctId ? { 'workos.user_id': this.distinctId } : {}), ...(options?.error ? { 'command.error_type': options.error.name, @@ -264,7 +290,8 @@ export class Analytics { 'crash.error_message': error.message, 'crash.stack': truncatedStack, ...(options?.command ? { 'crash.command': options.command } : {}), - 'installer.version': options?.version ?? 'unknown', + 'cli.version': options?.version ?? getVersion(), + ...(this.distinctId ? { 'workos.user_id': this.distinctId } : {}), ...this.getEnvFingerprint(), }, }; diff --git a/src/utils/telemetry-client.ts b/src/utils/telemetry-client.ts index 5d20cf3d..3df750c3 100644 --- a/src/utils/telemetry-client.ts +++ b/src/utils/telemetry-client.ts @@ -11,6 +11,8 @@ import { getCredentials } from '../lib/credentials.js'; export class TelemetryClient { private events: TelemetryEvent[] = []; private accessToken: string | null = null; + private claimToken: string | null = null; + private clientId: string | null = null; private gatewayUrl: string | null = null; setGatewayUrl(url: string) { @@ -21,6 +23,16 @@ export class TelemetryClient { this.accessToken = token; } + /** + * Set claim-token auth for unclaimed environments. + * The API's LlmGatewayGuard accepts either a JWT (Bearer) or claim token + * (x-workos-claim-token + x-workos-client-id headers). + */ + setClaimTokenAuth(clientId: string, claimToken: string) { + this.clientId = clientId; + this.claimToken = claimToken; + } + queueEvent(event: TelemetryEvent) { this.events.push(event); } @@ -69,6 +81,10 @@ export class TelemetryClient { const token = freshCreds?.accessToken ?? this.accessToken; if (token) { headers['Authorization'] = `Bearer ${token}`; + } else if (this.claimToken && this.clientId) { + // Unclaimed environment auth path — guard accepts this instead of JWT + headers['x-workos-claim-token'] = this.claimToken; + headers['x-workos-client-id'] = this.clientId; } const controller = new AbortController(); diff --git a/src/utils/telemetry-types.ts b/src/utils/telemetry-types.ts index 93faa117..7d84eb99 100644 --- a/src/utils/telemetry-types.ts +++ b/src/utils/telemetry-types.ts @@ -70,6 +70,8 @@ export interface CommandEvent extends TelemetryEvent { 'command.error_type'?: string; 'command.error_message'?: string; 'command.flags'?: string; + 'cli.version': string; + 'workos.user_id'?: string; 'env.os': string; 'env.os_version': string; 'env.node_version': string; @@ -86,7 +88,8 @@ export interface CrashEvent extends TelemetryEvent { 'crash.error_message': string; 'crash.stack': string; 'crash.command'?: string; - 'installer.version': string; + 'cli.version': string; + 'workos.user_id'?: string; 'env.os': string; 'env.os_version': string; 'env.node_version': string; From 7103115740fdb8b9e984bf0779b161bd0a20516b Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Wed, 15 Apr 2026 14:03:37 -0500 Subject: [PATCH 09/10] fix: sanitize error.message and error.stack in telemetry events Closes security-audit finding #1 on PR #122 (telemetry message sanitization). `error.message` was flowing into 4 capture sites unsanitized, leaking homedir paths (and rarely, credentials) to the WorkOS gateway. - Add `sanitizeMessage()` in crash-reporter.ts: homedir strip + Bearer/ sk_*/JWT redaction + 1KB truncation. - Factor secret redaction into shared `redactSecrets()` used by both `sanitizeMessage` and `sanitizeStack` (Node echoes `.message` into the leading `Error.stack` line, so message-only sanitization was insufficient). - Add private `extractErrorFields()` chokepoint on `Analytics`; route all 4 capture sites through it (`captureException`, `stepCompleted`, `commandExecuted`, `captureUnhandledCrash`). `replaceLastCommandEvent` inherits sanitization via its delegation to `commandExecuted`. - `captureUnhandledCrash` now uses `sanitizeStack` instead of inline truncation, providing defense-in-depth for callers that bypass the crash-reporter wrapper. - Add regression guard test (`telemetry-sanitize.spec.ts`): poisons every capture method with homedir + Bearer + sk_live_ + JWT, asserts no marker reaches the serialized queue. Reviewed: ideation:reviewer cycle 1 PASS (0 critical, 0 high). --- src/utils/analytics.spec.ts | 6 +- src/utils/analytics.ts | 37 +++-- src/utils/crash-reporter.ts | 36 +++- src/utils/telemetry-sanitize.spec.ts | 235 +++++++++++++++++++++++++++ 4 files changed, 299 insertions(+), 15 deletions(-) create mode 100644 src/utils/telemetry-sanitize.spec.ts diff --git a/src/utils/analytics.spec.ts b/src/utils/analytics.spec.ts index f354e135..24bf8632 100644 --- a/src/utils/analytics.spec.ts +++ b/src/utils/analytics.spec.ts @@ -514,13 +514,15 @@ describe('Analytics', () => { ); }); - it('truncates stack traces to 4KB', () => { + it('truncates stack traces to 4KB with a truncation marker', () => { const error = new Error('Big stack'); error.stack = 'x'.repeat(5000); analytics.captureUnhandledCrash(error); const event = mockQueueEvent.mock.calls.find((c) => c[0].type === 'crash')[0]; - expect(event.attributes['crash.stack'].length).toBe(4096); + // sanitizeStack truncates at 4096 and appends '\n...[truncated]' + expect(event.attributes['crash.stack']).toMatch(/\n\.\.\.\[truncated\]$/); + expect(event.attributes['crash.stack'].startsWith('x'.repeat(4096))).toBe(true); }); it('falls back to package version when not explicitly provided', () => { diff --git a/src/utils/analytics.ts b/src/utils/analytics.ts index 14931df6..52a525ad 100644 --- a/src/utils/analytics.ts +++ b/src/utils/analytics.ts @@ -15,6 +15,7 @@ import { WORKOS_TELEMETRY_ENABLED } from '../lib/constants.js'; import { getLlmGatewayUrl, getVersion } from '../lib/settings.js'; import { getCredentials } from '../lib/credentials.js'; import { getActiveEnvironment, isUnclaimedEnvironment } from '../lib/config-store.js'; +import { sanitizeMessage, sanitizeStack } from './crash-reporter.js'; export class Analytics { private tags: Record = {}; @@ -110,8 +111,9 @@ export class Analytics { if (!WORKOS_TELEMETRY_ENABLED) return; debug('[Analytics] captureException:', error.message, properties); - this.tags['error.type'] = error.name; - this.tags['error.message'] = error.message; + const { type, message } = this.extractErrorFields(error); + this.tags['error.type'] = type; + this.tags['error.message'] = message; } async getFeatureFlag(_flagKey: string): Promise { @@ -119,6 +121,18 @@ export class Analytics { return undefined; } + /** + * Single chokepoint for converting an Error into telemetry-safe fields. + * All capture methods that record error details MUST go through this. + * Sanitizes the message via sanitizeMessage (homedir + secret patterns + truncation). + */ + private extractErrorFields(error: Error): { type: string; message: string } { + return { + type: error.name, + message: sanitizeMessage(error.message), + }; + } + private detectCiProvider(): string | undefined { if (process.env.GITHUB_ACTIONS) return 'github-actions'; if (process.env.BUILDKITE) return 'buildkite'; @@ -179,7 +193,7 @@ export class Analytics { startTimestamp: new Date(Date.now() - durationMs).toISOString(), durationMs, success, - error: error ? { type: error.name, message: error.message } : undefined, + error: error ? this.extractErrorFields(error) : undefined, }; telemetryClient.queueEvent(event); @@ -231,6 +245,8 @@ export class Analytics { ) { if (!WORKOS_TELEMETRY_ENABLED) return; + const errorFields = options?.error ? this.extractErrorFields(options.error) : undefined; + const event: CommandEvent = { type: 'command', sessionId: this.sessionId, @@ -241,10 +257,10 @@ export class Analytics { 'command.success': success, 'cli.version': getVersion(), ...(this.distinctId ? { 'workos.user_id': this.distinctId } : {}), - ...(options?.error + ...(errorFields ? { - 'command.error_type': options.error.name, - 'command.error_message': options.error.message, + 'command.error_type': errorFields.type, + 'command.error_message': errorFields.message, } : {}), ...(options?.flags?.length @@ -278,17 +294,16 @@ export class Analytics { captureUnhandledCrash(error: Error, options?: { command?: string; version?: string }) { if (!WORKOS_TELEMETRY_ENABLED) return; - const stack = error.stack ?? ''; - const truncatedStack = stack.length > 4096 ? stack.slice(0, 4096) : stack; + const { type, message } = this.extractErrorFields(error); const event: CrashEvent = { type: 'crash', sessionId: this.sessionId, timestamp: new Date().toISOString(), attributes: { - 'crash.error_type': error.name, - 'crash.error_message': error.message, - 'crash.stack': truncatedStack, + 'crash.error_type': type, + 'crash.error_message': message, + 'crash.stack': sanitizeStack(error.stack), ...(options?.command ? { 'crash.command': options.command } : {}), 'cli.version': options?.version ?? getVersion(), ...(this.distinctId ? { 'workos.user_id': this.distinctId } : {}), diff --git a/src/utils/crash-reporter.ts b/src/utils/crash-reporter.ts index 14207793..299b56c1 100644 --- a/src/utils/crash-reporter.ts +++ b/src/utils/crash-reporter.ts @@ -2,11 +2,26 @@ import { analytics } from './analytics.js'; import { homedir } from 'node:os'; const MAX_STACK_LENGTH = 4096; +const MAX_MESSAGE_LENGTH = 1024; let isCrashing = false; /** - * Sanitize stack trace: strip absolute paths to relative, remove home dir. - * Prevents leaking file system layout in telemetry events. + * Redact known credential patterns from a string: + * Bearer tokens, sk_test_/sk_live_ keys, raw JWTs. + * Used by both sanitizeStack and sanitizeMessage so secrets that appear + * in error messages (which Node echoes into stack first lines) are removed + * regardless of which path captures them. + */ +function redactSecrets(s: string): string { + return s + .replace(/Bearer\s+[A-Za-z0-9._-]+/g, 'Bearer ') + .replace(/\bsk_(test|live)_[A-Za-z0-9]+/g, 'sk_') + .replace(/\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+/g, ''); +} + +/** + * Sanitize stack trace: strip absolute paths to relative, remove home dir, + * redact credentials that may appear via the leading "Error: " line. */ export function sanitizeStack(stack: string | undefined): string { if (!stack) return ''; @@ -14,11 +29,28 @@ export function sanitizeStack(stack: string | undefined): string { let sanitized = stack; sanitized = sanitized.replaceAll(home, '~'); sanitized = sanitized.replace(/\/[^\s:]+\/(node_modules|dist|src)\//g, '$1/'); + sanitized = redactSecrets(sanitized); return sanitized.length > MAX_STACK_LENGTH ? sanitized.slice(0, MAX_STACK_LENGTH) + '\n...[truncated]' : sanitized; } +/** + * Sanitize an error message before sending to telemetry. + * Strips home directory and redacts known credential patterns + * (Bearer tokens, sk_test_/sk_live_ keys, raw JWTs). + * Truncates to MAX_MESSAGE_LENGTH chars. + */ +export function sanitizeMessage(msg: string | undefined): string { + if (!msg) return ''; + const home = homedir(); + let sanitized = msg.replaceAll(home, '~'); + sanitized = redactSecrets(sanitized); + return sanitized.length > MAX_MESSAGE_LENGTH + ? sanitized.slice(0, MAX_MESSAGE_LENGTH) + '...[truncated]' + : sanitized; +} + /** * Register global handlers for uncaughtException and unhandledRejection * that capture crash details before the process exits. diff --git a/src/utils/telemetry-sanitize.spec.ts b/src/utils/telemetry-sanitize.spec.ts new file mode 100644 index 00000000..9724626e --- /dev/null +++ b/src/utils/telemetry-sanitize.spec.ts @@ -0,0 +1,235 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; +import { homedir } from 'node:os'; +import { sanitizeMessage } from './crash-reporter.js'; + +// Mock telemetry client so we can inspect queued events without HTTP. +// Use vi.hoisted so these are available when the hoisted vi.mock factory runs +// (importing sanitizeMessage transitively loads analytics.ts which loads telemetry-client.ts). +const { + mockSetGatewayUrl, + mockSetAccessToken, + mockQueueEvent, + mockFlush, + mockReplaceLastEventOfType, +} = vi.hoisted(() => ({ + mockSetGatewayUrl: vi.fn(), + mockSetAccessToken: vi.fn(), + mockQueueEvent: vi.fn(), + mockFlush: vi.fn().mockResolvedValue(undefined), + mockReplaceLastEventOfType: vi.fn(), +})); + +vi.mock('./telemetry-client.js', () => ({ + telemetryClient: { + setGatewayUrl: mockSetGatewayUrl, + setAccessToken: mockSetAccessToken, + queueEvent: mockQueueEvent, + flush: mockFlush, + replaceLastEventOfType: (...args: unknown[]) => mockReplaceLastEventOfType(...args), + }, +})); + +vi.mock('./debug.js', () => ({ + debug: vi.fn(), +})); + +vi.mock('uuid', () => ({ + v4: () => 'test-session-id-sanitize', +})); + +vi.mock('../lib/settings.js', () => ({ + getLlmGatewayUrl: () => 'https://api.workos.com/llm-gateway', + getConfig: () => ({ + nodeVersion: '>=18', + logging: { debugMode: false }, + telemetry: { enabled: true, eventName: 'installer_interaction' }, + documentation: { + workosDocsUrl: 'https://workos.com/docs', + dashboardUrl: 'https://dashboard.workos.com', + issuesUrl: 'https://github.com', + }, + legacy: { oauthPort: 3000 }, + }), + getVersion: () => '0.0.0-test', +})); + +vi.mock('../lib/credentials.js', () => ({ + getCredentials: vi.fn(), +})); + +describe('sanitizeMessage', () => { + it('strips the home directory', () => { + const home = homedir(); + const input = `ENOENT: no such file or directory, open '${home}/.workos/credentials.json'`; + const out = sanitizeMessage(input); + expect(out).not.toContain(home); + expect(out).toContain('~/.workos/credentials.json'); + }); + + it('redacts Bearer tokens', () => { + const out = sanitizeMessage('401 Unauthorized: Bearer abc123.def456.ghi789 invalid'); + expect(out).not.toContain('abc123.def456.ghi789'); + expect(out).toContain('Bearer '); + }); + + it('redacts sk_live_ keys', () => { + const out = sanitizeMessage('Authentication failed for sk_live_xyzABC123'); + expect(out).not.toContain('sk_live_xyzABC123'); + expect(out).toContain('sk_'); + }); + + it('redacts sk_test_ keys', () => { + const out = sanitizeMessage('Bad key sk_test_qrsTUV456 in request'); + expect(out).not.toContain('sk_test_qrsTUV456'); + expect(out).toContain('sk_'); + }); + + it('redacts raw JWTs', () => { + const jwt = 'eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjMifQ.signature_value'; + const out = sanitizeMessage(`Token ${jwt} expired`); + expect(out).not.toContain(jwt); + expect(out).toContain(''); + }); + + it('truncates messages longer than 1024 chars with marker', () => { + const long = 'a'.repeat(2000); + const out = sanitizeMessage(long); + expect(out.length).toBe(1024 + '...[truncated]'.length); + expect(out.endsWith('...[truncated]')).toBe(true); + }); + + it('redacts before truncating so secrets near the boundary are not partially preserved', () => { + // Place a JWT at position 1010 so its tail would fall past the 1024 cap. + const padding = 'x'.repeat(1010); + const jwt = 'eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjMifQ.signature_value'; + const out = sanitizeMessage(padding + jwt); + expect(out).not.toContain('signature_value'); + expect(out).not.toContain('eyJhbGciOiJIUzI1NiJ9'); + }); + + it('returns empty string for undefined or empty input', () => { + expect(sanitizeMessage(undefined)).toBe(''); + expect(sanitizeMessage('')).toBe(''); + }); + + it('redacts all marker types in a single string', () => { + const home = homedir(); + const input = `${home}/x Bearer abc.def.ghi sk_live_ABC123 eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjMifQ.signature_value`; + const out = sanitizeMessage(input); + expect(out).not.toContain(home); + expect(out).not.toContain('abc.def.ghi'); + expect(out).not.toContain('sk_live_ABC123'); + expect(out).not.toContain('eyJhbGciOiJIUzI1NiJ9'); + expect(out).toContain('~/x'); + expect(out).toContain('Bearer '); + expect(out).toContain('sk_'); + expect(out).toContain(''); + }); +}); + +describe('Analytics: no PII or secrets in queued events', () => { + const home = homedir(); + const POISON_BEARER = 'abc123.def456.ghi789'; + const POISON_SK = 'sk_live_xyzABC123'; + const POISON_JWT = 'eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjMifQ.signature_value'; + const POISON_PATH = `${home}/.workos/credentials.json`; + const POISON_MESSAGE = `ENOENT at ${POISON_PATH} Bearer ${POISON_BEARER} key=${POISON_SK} jwt=${POISON_JWT}`; + + let Analytics: typeof import('./analytics.js').Analytics; + let analytics: InstanceType; + const originalEnv = process.env.WORKOS_TELEMETRY; + + beforeEach(async () => { + vi.clearAllMocks(); + delete process.env.WORKOS_TELEMETRY; + vi.resetModules(); + vi.doMock('./telemetry-client.js', () => ({ + telemetryClient: { + setGatewayUrl: mockSetGatewayUrl, + setAccessToken: mockSetAccessToken, + queueEvent: mockQueueEvent, + flush: mockFlush, + replaceLastEventOfType: (...args: unknown[]) => mockReplaceLastEventOfType(...args), + }, + })); + vi.doMock('../lib/settings.js', () => ({ + getLlmGatewayUrl: () => 'https://api.workos.com/llm-gateway', + getConfig: () => ({ + nodeVersion: '>=18', + logging: { debugMode: false }, + telemetry: { enabled: true, eventName: 'installer_interaction' }, + documentation: { + workosDocsUrl: 'https://workos.com/docs', + dashboardUrl: 'https://dashboard.workos.com', + issuesUrl: 'https://github.com', + }, + legacy: { oauthPort: 3000 }, + }), + getVersion: () => '0.0.0-test', + })); + vi.doMock('../lib/credentials.js', () => ({ + getCredentials: vi.fn(), + })); + const module = await import('./analytics.js'); + Analytics = module.Analytics; + analytics = new Analytics(); + }); + + afterEach(() => { + if (originalEnv !== undefined) { + process.env.WORKOS_TELEMETRY = originalEnv; + } else { + delete process.env.WORKOS_TELEMETRY; + } + }); + + function assertCleanQueue() { + const serialized = JSON.stringify(mockQueueEvent.mock.calls); + expect(serialized).not.toContain(home); + expect(serialized).not.toContain(POISON_BEARER); + expect(serialized).not.toContain(POISON_SK); + expect(serialized).not.toContain(POISON_JWT); + } + + it('stepCompleted: poisoned error.message does not leak markers', () => { + const err = new Error(POISON_MESSAGE); + analytics.stepCompleted('test-step', 100, false, err); + expect(mockQueueEvent).toHaveBeenCalled(); + assertCleanQueue(); + }); + + it('commandExecuted: poisoned error.message does not leak markers', () => { + const err = new Error(POISON_MESSAGE); + analytics.commandExecuted('test-command', 100, false, { error: err }); + expect(mockQueueEvent).toHaveBeenCalled(); + assertCleanQueue(); + }); + + it('captureUnhandledCrash: poisoned error.message does not leak markers', () => { + const err = new Error(POISON_MESSAGE); + analytics.captureUnhandledCrash(err); + expect(mockQueueEvent).toHaveBeenCalled(); + assertCleanQueue(); + }); + + it('captureException: poisoned error.message does not leak via session.end tags', async () => { + const err = new Error(POISON_MESSAGE); + analytics.captureException(err); + // captureException stores into tags; tags flow into session.end at shutdown. + await analytics.shutdown('error'); + expect(mockQueueEvent).toHaveBeenCalled(); + assertCleanQueue(); + }); + + it('replaceLastCommandEvent: inherits sanitization via commandExecuted', () => { + const err = new Error(POISON_MESSAGE); + // First queue a provisional event (would normally happen in middleware). + analytics.commandExecuted('test-command', 0, true); + mockQueueEvent.mockClear(); + // Then replace it with the real one carrying the poisoned error. + analytics.replaceLastCommandEvent('test-command', 100, false, { error: err }); + expect(mockReplaceLastEventOfType).toHaveBeenCalledWith('command'); + expect(mockQueueEvent).toHaveBeenCalled(); + assertCleanQueue(); + }); +}); From f1a391a532396935a7ca540ac94e29e63d25351d Mon Sep 17 00:00:00 2001 From: Nick Nisi Date: Wed, 15 Apr 2026 15:38:05 -0500 Subject: [PATCH 10/10] chore: de-slop --- src/utils/analytics.ts | 6 +----- src/utils/crash-reporter.ts | 37 +++++++++++-------------------------- 2 files changed, 12 insertions(+), 31 deletions(-) diff --git a/src/utils/analytics.ts b/src/utils/analytics.ts index 52a525ad..a66a61e0 100644 --- a/src/utils/analytics.ts +++ b/src/utils/analytics.ts @@ -121,11 +121,7 @@ export class Analytics { return undefined; } - /** - * Single chokepoint for converting an Error into telemetry-safe fields. - * All capture methods that record error details MUST go through this. - * Sanitizes the message via sanitizeMessage (homedir + secret patterns + truncation). - */ + /** All capture methods that record error details MUST go through this. */ private extractErrorFields(error: Error): { type: string; message: string } { return { type: error.name, diff --git a/src/utils/crash-reporter.ts b/src/utils/crash-reporter.ts index 299b56c1..36381fd7 100644 --- a/src/utils/crash-reporter.ts +++ b/src/utils/crash-reporter.ts @@ -3,14 +3,14 @@ import { homedir } from 'node:os'; const MAX_STACK_LENGTH = 4096; const MAX_MESSAGE_LENGTH = 1024; +const HOME = homedir(); let isCrashing = false; /** - * Redact known credential patterns from a string: - * Bearer tokens, sk_test_/sk_live_ keys, raw JWTs. - * Used by both sanitizeStack and sanitizeMessage so secrets that appear - * in error messages (which Node echoes into stack first lines) are removed - * regardless of which path captures them. + * Redact known credential patterns (Bearer tokens, sk_test_/sk_live_ keys, + * raw JWTs). Shared by sanitizeStack and sanitizeMessage because Node echoes + * `.message` into the leading `Error.stack` line, so secrets in messages also + * surface in stacks. */ function redactSecrets(s: string): string { return s @@ -19,15 +19,10 @@ function redactSecrets(s: string): string { .replace(/\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+/g, ''); } -/** - * Sanitize stack trace: strip absolute paths to relative, remove home dir, - * redact credentials that may appear via the leading "Error: " line. - */ +/** Sanitize stack trace for telemetry: homedir, absolute-path collapse, secrets, truncation. */ export function sanitizeStack(stack: string | undefined): string { if (!stack) return ''; - const home = homedir(); - let sanitized = stack; - sanitized = sanitized.replaceAll(home, '~'); + let sanitized = stack.replaceAll(HOME, '~'); sanitized = sanitized.replace(/\/[^\s:]+\/(node_modules|dist|src)\//g, '$1/'); sanitized = redactSecrets(sanitized); return sanitized.length > MAX_STACK_LENGTH @@ -35,17 +30,10 @@ export function sanitizeStack(stack: string | undefined): string { : sanitized; } -/** - * Sanitize an error message before sending to telemetry. - * Strips home directory and redacts known credential patterns - * (Bearer tokens, sk_test_/sk_live_ keys, raw JWTs). - * Truncates to MAX_MESSAGE_LENGTH chars. - */ +/** Sanitize an error message for telemetry (homedir, secrets, truncation). */ export function sanitizeMessage(msg: string | undefined): string { if (!msg) return ''; - const home = homedir(); - let sanitized = msg.replaceAll(home, '~'); - sanitized = redactSecrets(sanitized); + const sanitized = redactSecrets(msg.replaceAll(HOME, '~')); return sanitized.length > MAX_MESSAGE_LENGTH ? sanitized.slice(0, MAX_MESSAGE_LENGTH) + '...[truncated]' : sanitized; @@ -76,11 +64,8 @@ function reportCrashSync(error: Error): void { if (isCrashing) return; isCrashing = true; try { - // Sanitize the stack before passing to analytics - const sanitized = new Error(error.message); - sanitized.name = error.name; - sanitized.stack = sanitizeStack(error.stack); - analytics.captureUnhandledCrash(sanitized); + // captureUnhandledCrash sanitizes both message and stack at the analytics boundary. + analytics.captureUnhandledCrash(error); } catch { // Telemetry must never prevent exit }