From 9f9cc7aa98be1373db0267b39cb7dd67d82d5dd6 Mon Sep 17 00:00:00 2001 From: Ashish Kumar Singh Date: Mon, 24 Nov 2025 22:54:59 -0500 Subject: [PATCH 1/5] feat: upgraded to sandbox sdk version 0.5.2 --- SandboxDockerfile | 2 +- bun.lock | 4 ++-- package.json | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/SandboxDockerfile b/SandboxDockerfile index 10daa45e..56a90597 100644 --- a/SandboxDockerfile +++ b/SandboxDockerfile @@ -1,5 +1,5 @@ # FROM docker.io/cloudflare/sandbox:0.1.3 -FROM docker.io/cloudflare/sandbox:0.4.14 +FROM docker.io/cloudflare/sandbox:0.5.2 ARG TARGETARCH RUN apt-get update && \ diff --git a/bun.lock b/bun.lock index 51ed7a61..9ab84be8 100644 --- a/bun.lock +++ b/bun.lock @@ -6,7 +6,7 @@ "dependencies": { "@ashishkumar472/cf-git": "1.0.5", "@cloudflare/containers": "^0.0.28", - "@cloudflare/sandbox": "0.4.14", + "@cloudflare/sandbox": "^0.5.2", "@noble/ciphers": "^1.3.0", "@octokit/rest": "^22.0.1", "@radix-ui/react-accordion": "^1.2.12", @@ -230,7 +230,7 @@ "@cloudflare/kv-asset-handler": ["@cloudflare/kv-asset-handler@0.4.0", "", { "dependencies": { "mime": "^3.0.0" } }, "sha512-+tv3z+SPp+gqTIcImN9o0hqE9xyfQjI1XD9pL6NuKjua9B1y7mNYv0S9cP+QEbA4ppVgGZEmKOvHX5G5Ei1CVA=="], - "@cloudflare/sandbox": ["@cloudflare/sandbox@0.4.14", "", { "dependencies": { "@cloudflare/containers": "^0.0.30" } }, "sha512-sI8yEqUd/vukhwSHGi93xau7xEtbP+OD/8xnjGbXr2iIGnAV+05VhLVIfJmwuU3VE8R96dnBJtEucEE5ZtXhcQ=="], + "@cloudflare/sandbox": ["@cloudflare/sandbox@0.5.2", "", { "dependencies": { "@cloudflare/containers": "^0.0.30" }, "peerDependencies": { "@openai/agents": "^0.3.2" }, "optionalPeers": ["@openai/agents"] }, "sha512-YXKeFr+FkPQTdKzmQv0khRV9JcK+W9ZpfxqZ4xu9TRzt0BHXLbjACSoIWkVtkuUPK1BTQdwqI5Ku1fYpM1pqGw=="], "@cloudflare/unenv-preset": ["@cloudflare/unenv-preset@2.7.8", "", { "peerDependencies": { "unenv": "2.0.0-rc.21", "workerd": "^1.20250927.0" }, "optionalPeers": ["workerd"] }, 
"sha512-Ky929MfHh+qPhwCapYrRPwPVHtA2Ioex/DbGZyskGyNRDe9Ru3WThYZivyNVaPy5ergQSgMs9OKrM9Ajtz9F6w=="], diff --git a/package.json b/package.json index c1719b0d..5e2b057f 100644 --- a/package.json +++ b/package.json @@ -36,7 +36,7 @@ "dependencies": { "@ashishkumar472/cf-git": "1.0.5", "@cloudflare/containers": "^0.0.28", - "@cloudflare/sandbox": "0.4.14", + "@cloudflare/sandbox": "0.5.2", "@noble/ciphers": "^1.3.0", "@octokit/rest": "^22.0.1", "@radix-ui/react-accordion": "^1.2.12", From bf200353137c19d822e154c1eb1cda7cf11d81f7 Mon Sep 17 00:00:00 2001 From: Ashish Kumar Singh Date: Mon, 24 Nov 2025 22:58:02 -0500 Subject: [PATCH 2/5] refactor: improve error handling, prompt clarity, and template filtering Add null checks after inference calls in blueprint, template selection, phase generation, user conversation processing, project setup, and code fixer operations. Switch from REACT_RENDER_LOOP_PREVENTION to REACT_RENDER_LOOP_PREVENTION_LITE in code debugger and phase generation. Add explicit warning against installing cloudflare:workers/durable-objects dependencies in multiple prompts. 
Update phase generation to prioritize runtime --- worker/agents/assistants/codeDebugger.ts | 2 +- worker/agents/assistants/projectsetup.ts | 8 ++++ worker/agents/core/simpleGeneratorAgent.ts | 18 +++----- worker/agents/operations/PhaseGeneration.ts | 11 ++++- .../agents/operations/PostPhaseCodeFixer.ts | 5 +++ .../operations/UserConversationProcessor.ts | 10 +++++ worker/agents/planning/blueprint.ts | 16 ++++--- worker/agents/planning/templateSelector.ts | 7 ++- worker/agents/prompts.ts | 6 ++- worker/services/sandbox/BaseSandboxService.ts | 2 + worker/services/sandbox/sandboxTypes.ts | 45 ++++++++++--------- 11 files changed, 86 insertions(+), 44 deletions(-) diff --git a/worker/agents/assistants/codeDebugger.ts b/worker/agents/assistants/codeDebugger.ts index 64bcbbcb..09752066 100644 --- a/worker/agents/assistants/codeDebugger.ts +++ b/worker/agents/assistants/codeDebugger.ts @@ -459,7 +459,7 @@ The goal is working code, verified through evidence. Think internally, act decis The most important class of errors is the "Maximum update depth exceeded" error which you definitely need to identify and fix. Here are some important guidelines for identifying such issues and preventing them: -${PROMPT_UTILS.REACT_RENDER_LOOP_PREVENTION} +${PROMPT_UTILS.REACT_RENDER_LOOP_PREVENTION_LITE} ${PROMPT_UTILS.COMMON_DEP_DOCUMENTATION} diff --git a/worker/agents/assistants/projectsetup.ts b/worker/agents/assistants/projectsetup.ts index ef9c8f2e..2b22af1c 100644 --- a/worker/agents/assistants/projectsetup.ts +++ b/worker/agents/assistants/projectsetup.ts @@ -60,6 +60,8 @@ Output: - Focus on blueprint requirements only - cloudflare:workers is not needed, it's already installed +**Do not recommend installing \`cloudflare:workers\` or \`cloudflare:durable-objects\` as dependencies, these are already installed in the project always.** + ${PROMPT_UTILS.COMMANDS} @@ -128,6 +130,12 @@ ${error}`); context: this.inferenceContext, modelName: error? 
AIModels.GEMINI_2_5_FLASH : undefined, }); + + if (!results || !results.string) { + this.logger.error('Project setup returned no result after all retries'); + throw new Error('Failed to generate setup commands: inference returned null'); + } + this.logger.info(`Generated setup commands: ${results.string}`); this.save([createAssistantMessage(results.string)]); diff --git a/worker/agents/core/simpleGeneratorAgent.ts b/worker/agents/core/simpleGeneratorAgent.ts index badd16db..b68f95e7 100644 --- a/worker/agents/core/simpleGeneratorAgent.ts +++ b/worker/agents/core/simpleGeneratorAgent.ts @@ -30,7 +30,7 @@ import { ScreenshotAnalysisOperation } from '../operations/ScreenshotAnalysis'; // Database schema imports removed - using zero-storage OAuth flow import { BaseSandboxService } from '../../services/sandbox/BaseSandboxService'; import { WebSocketMessageData, WebSocketMessageType } from '../../api/websocketTypes'; -import { InferenceContext, ModelConfig } from '../inferutils/config.types'; +import { InferenceContext } from '../inferutils/config.types'; import { ModelConfigService } from '../../database/services/ModelConfigService'; import { fixProjectIssues } from '../../services/code-fixer'; import { GitVersionControl } from '../git'; @@ -355,22 +355,14 @@ export class SimpleCodeGeneratorAgent extends Agent { // Load the latest user configs const modelConfigService = new ModelConfigService(this.env); const userConfigsRecord = await modelConfigService.getUserModelConfigs(this.state.inferenceContext.userId); - - const userModelConfigs: Record = {}; - for (const [actionKey, mergedConfig] of Object.entries(userConfigsRecord)) { - if (mergedConfig.isUserOverride) { - const { isUserOverride, userConfigId, ...modelConfig } = mergedConfig; - userModelConfigs[actionKey] = modelConfig; - } - } this.setState({ ...this.state, inferenceContext: { ...this.state.inferenceContext, - userModelConfigs, + userModelConfigs: userConfigsRecord, }, }); - this.logger().info(`Agent 
${this.getAgentId()} session: ${this.state.sessionId} onStart: User configs loaded successfully`, {userModelConfigs}); + this.logger().info(`Agent ${this.getAgentId()} session: ${this.state.sessionId} onStart: User configs loaded successfully`, {userConfigsRecord}); } private async gitInit() { @@ -1702,6 +1694,8 @@ export class SimpleCodeGeneratorAgent extends Agent { } const regenerated = await this.regenerateFile({ filePath: path, fileContents, filePurpose }, issues, 0); + // Invalidate cache + this.staticAnalysisCache = null; // Persist to sandbox instance await this.getSandboxServiceClient().writeFiles(sandboxInstanceId, [{ filePath: regenerated.filePath, fileContents: regenerated.fileContents }], `Deep debugger fix: ${path}`); return { path, diff: regenerated.lastDiff }; @@ -1766,6 +1760,8 @@ export class SimpleCodeGeneratorAgent extends Agent { fileCount: result.files.length }); + await this.deployToSandbox(savedFiles, false); + return { files: savedFiles.map(f => { return { path: f.filePath, diff --git a/worker/agents/operations/PhaseGeneration.ts b/worker/agents/operations/PhaseGeneration.ts index 1a824e4c..ad728af7 100644 --- a/worker/agents/operations/PhaseGeneration.ts +++ b/worker/agents/operations/PhaseGeneration.ts @@ -24,7 +24,7 @@ const SYSTEM_PROMPT = ` You are given the blueprint (PRD) and the client query. You will be provided with all previously implemented project phases, the current latest snapshot of the codebase, and any current runtime issues or static analysis reports. - **Your primary task:** Design the next phase of the project as a deployable milestone leading to project completion or to address any user feedbacks or reported bugs. + **Your primary task:** Design the next phase of the project as a deployable milestone leading to project completion or to address any user feedbacks or reported bugs (runtime error fixing is the highest priority). **Phase Planning Process:** 1. 
**ANALYZE** current codebase state and identify what's implemented vs. what remains @@ -36,6 +36,8 @@ const SYSTEM_PROMPT = ` - **Accessibility**: Proper semantic HTML, ARIA labels, keyboard navigation - **Supreme software development practices**: Follow the best coding principles and practices, and lay out the codebase in a way that is easy to maintain, extend and debug. 4. **VALIDATE** that the phase will be deployable with all views/pages working beautifully across devices + + Plan the phase name and description appropriately. They don't have to strictly adhere to the blueprint's roadmap as unforeseen issues may occur. The project needs to be fully ready to ship in a reasonable amount of time. Plan accordingly. If no more phases are needed, conclude by putting blank fields in the response. @@ -230,7 +232,7 @@ const issuesPromptFormatterWithGuidelines = (issues: IssueReport): string => { serialized = ` ${PROMPT_UTILS.COMMON_PITFALLS} -${issues.runtimeErrors.some((error) => error.message.includes('infinite loop') || error.message.includes('re-renders')) ? PROMPT_UTILS.REACT_RENDER_LOOP_PREVENTION: ''} +${issues.runtimeErrors.some((error) => error.message.includes('infinite loop') || error.message.includes('re-renders')) ? PROMPT_UTILS.REACT_RENDER_LOOP_PREVENTION_LITE: ''} ${serialized}`; } @@ -293,6 +295,11 @@ export class PhaseGenerationOperation extends AgentOperation You are a meticulous and forward-thinking Senior Software Architect and Product Manager at Cloudflare with extensive expertise in modern UI/UX design and visual excellence. - Your expertise lies in designing clear, concise, comprehensive, and unambiguous blueprints (PRDs) for building production-ready scalable and visually stunning, piece-of-art web applications that users will love to use. 
+ Your expertise lies in designing clear, concise, comprehensive, and unambiguous blueprints (PRDs) for building production-ready scalable and visually stunning, piece-of-art web applications that users will love to use, using Cloudflare workers and durable objects. @@ -24,7 +24,8 @@ const SYSTEM_PROMPT = ` Focus on a clear and comprehensive design that prioritizes STUNNING VISUAL DESIGN, polish and depth, be to the point, explicit and detailed in your response, and adhere to our development process. Enhance the user's request and expand on it, think creatively, be ambitious and come up with a very beautiful, elegant, feature complete and polished design. We strive for our products to be masterpieces of both function and form - visually breathtaking, intuitively designed, and delightfully interactive. - **REMEMBER: This is not a toy or demo project. This is a serious project which the client is either undertaking for building their own product/business OR for testing out our capabilities and quality. We do not just expect an MVP, We expect a production-ready, polished, and exceptional solution** + **REMEMBER: This is not a toy or educational project. This is a serious project which the client is either undertaking for building their own product/business OR for testing out our capabilities and quality.** + **Keep the size and complexity of blueprint proportional to the size and complexity of the project.** eg, No need to overengineer a 'todo' app. @@ -35,7 +36,6 @@ const SYSTEM_PROMPT = ` **VISUAL DESIGN EXCELLENCE**: Design the application frontend with exceptional attention to visual details - specify exact components, navigation patterns, headers, footers, color schemes, typography scales, spacing systems, micro-interactions, animations, hover states, loading states, and responsive behaviors. 
**USER EXPERIENCE FOCUS**: Plan intuitive user flows, clear information hierarchy, accessible design patterns, and delightful interactions that make users want to use the application. Build upon the provided template. Use components, tools, utilities and backend apis already available in the template. - Think and **BREAKDOWN** The project into multiple incremental phases that build upon each other to create a complete, polished product following our . @@ -100,6 +100,7 @@ const SYSTEM_PROMPT = ` ✅ Icon libraries: lucide-react, heroicons (specify in frameworks) ❌ Never: .png, .jpg, .svg, .gif files in phase files list Binary files cannot be generated. Always use the approaches above for visual content. + Do not recommend installing \`cloudflare:workers\` or \`cloudflare:durable-objects\` as dependencies, these are already installed in the project always. @@ -239,11 +240,14 @@ export async function generateBlueprint({ env, inferenceContext, query, language stream: stream, }); - if (results) { - // Filter and remove any pdf files - results.initialPhase.files = results.initialPhase.files.filter(f => !f.path.endsWith('.pdf')); + if (!results) { + logger.error('Blueprint generation returned no result after all retries'); + throw new Error('Failed to generate blueprint: inference returned null'); } + // Filter and remove any pdf files + results.initialPhase.files = results.initialPhase.files.filter(f => !f.path.endsWith('.pdf')); + // // A hack // if (results?.initialPhase) { // results.initialPhase.lastPhase = false; diff --git a/worker/agents/planning/templateSelector.ts b/worker/agents/planning/templateSelector.ts index 857f4bd0..9b7e68db 100644 --- a/worker/agents/planning/templateSelector.ts +++ b/worker/agents/planning/templateSelector.ts @@ -35,7 +35,7 @@ export async function selectTemplate({ env, query, availableTemplates, inference templateCount: availableTemplates.length }); - availableTemplates = availableTemplates.filter(t => t.projectType !== 'presentation'); 
+ availableTemplates = availableTemplates.filter(t => t.projectType !== 'presentation' && !t.disabled && !t.name.includes('minimal')); const validTemplateNames = availableTemplates.map(t => t.name); const templateDescriptions = availableTemplates.map((t, index) => @@ -128,6 +128,11 @@ ENTROPY SEED: ${generateSecureToken(64)} - for unique results`; maxTokens: 2000, }); + if (!selection) { + logger.error('Template selection returned no result after all retries'); + throw new Error('Failed to select template: inference returned null'); + } + logger.info(`AI template selection result: ${selection.selectedTemplateName || 'None'}, Reasoning: ${selection.reasoning}`); return selection; diff --git a/worker/agents/prompts.ts b/worker/agents/prompts.ts index 42dd8d6a..83389c45 100644 --- a/worker/agents/prompts.ts +++ b/worker/agents/prompts.ts @@ -70,8 +70,9 @@ Template Usage Instructions: ${template.description.usage} -These files are forbidden to be modified. Do not touch them under any circumstances. +These files are forbidden to be modified. Do not touch them under any circumstances. Doing so will break the application. ${(template.dontTouchFiles ?? []).join('\n')} +worker/core-utils.ts @@ -888,6 +889,7 @@ COMMON_PITFALLS: ` Applying this rule to your situation will fix both the type-check errors and the browser's runtime error. # Never write image files! Never write jpeg, png, svg, etc files yourself! Always use some image url from the web. + **Do not recommend installing \`cloudflare:workers\` or \`cloudflare:durable-objects\` as dependencies, these are already installed in the project always.** `, COMMON_DEP_DOCUMENTATION: ` @@ -1368,7 +1370,7 @@ export function issuesPromptFormatter(issues: IssueReport): string { ### 1. 
CRITICAL RUNTIME ERRORS (Fix First - Deployment Blockers) **Error Count:** ${issues.runtimeErrors?.length || 0} runtime errors detected -**Contains Render Loops:** ${runtimeErrorsText.includes('Maximum update depth') || runtimeErrorsText.includes('Too many re-renders') ? 'YES - HIGHEST PRIORITY' : 'No'} +**Contains Render Loops:** ${runtimeErrorsText.includes('Maximum update depth') || runtimeErrorsText.includes('Too many re-renders') || runtimeErrorsText.includes('infinite loop') ? 'YES - HIGHEST PRIORITY' : 'No'} ${runtimeErrorsText || 'No runtime errors detected'} diff --git a/worker/services/sandbox/BaseSandboxService.ts b/worker/services/sandbox/BaseSandboxService.ts index 152f5cd7..81e4f18e 100644 --- a/worker/services/sandbox/BaseSandboxService.ts +++ b/worker/services/sandbox/BaseSandboxService.ts @@ -92,6 +92,7 @@ export abstract class BaseSandboxService { language: t.language, frameworks: t.frameworks || [], description: t.description, + disabled: t.disabled ?? false, projectType: t.projectType || 'app', renderMode: t.renderMode, slideDirectory: t.slideDirectory, @@ -177,6 +178,7 @@ export abstract class BaseSandboxService { selection: catalogInfo?.description.selection || '', usage: catalogInfo?.description.usage || '' }, + disabled: catalogInfo?.disabled ?? 
false, fileTree, allFiles: filesMap, language: catalogInfo?.language, diff --git a/worker/services/sandbox/sandboxTypes.ts b/worker/services/sandbox/sandboxTypes.ts index 648e9b9a..68a8c001 100644 --- a/worker/services/sandbox/sandboxTypes.ts +++ b/worker/services/sandbox/sandboxTypes.ts @@ -21,27 +21,6 @@ export const TemplateFileSchema = z.object({ }) export type TemplateFile = z.infer -// --- Template Details --- - -export const TemplateDetailsSchema = z.object({ - name: z.string(), - description: z.object({ - selection: z.string(), - usage: z.string(), - }), - fileTree: FileTreeNodeSchema, - allFiles: z.record(z.string(), z.string()), - language: z.string().optional(), - deps: z.record(z.string(), z.string()), - frameworks: z.array(z.string()).optional(), - importantFiles: z.array(z.string()), - dontTouchFiles: z.array(z.string()), - redactedFiles: z.array(z.string()), - renderMode: z.enum(['sandbox', 'browser']).optional(), - slideDirectory: z.string().optional(), -}) -export type TemplateDetails = z.infer - // ========================================== // RUNTIME ERROR SCHEMAS // ========================================== @@ -98,6 +77,29 @@ export type CommandExecutionResult = z.infer + // /templates (GET) export const TemplateInfoSchema = z.object({ @@ -111,6 +113,7 @@ export const TemplateInfoSchema = z.object({ }), renderMode: z.enum(['sandbox', 'browser']).optional(), slideDirectory: z.string().optional(), + disabled: z.boolean(), }) export type TemplateInfo = z.infer From fff7f0d118b79a3626e2658ead5f9f2511f6a213 Mon Sep 17 00:00:00 2001 From: Ashish Kumar Singh Date: Mon, 24 Nov 2025 22:58:02 -0500 Subject: [PATCH 3/5] refactor: improve error handling, prompt clarity, and template filtering Add null checks after inference calls in blueprint, template selection, phase generation, user conversation processing, project setup, and code fixer operations. 
Switch from REACT_RENDER_LOOP_PREVENTION to REACT_RENDER_LOOP_PREVENTION_LITE in code debugger and phase generation. Add explicit warning against installing cloudflare:workers/durable-objects dependencies in multiple prompts. Update phase generation to prioritize runtime --- worker/agents/assistants/codeDebugger.ts | 2 +- worker/agents/assistants/projectsetup.ts | 8 ++++ worker/agents/core/simpleGeneratorAgent.ts | 7 ++- worker/agents/operations/PhaseGeneration.ts | 11 ++++- .../agents/operations/PostPhaseCodeFixer.ts | 5 +++ .../operations/UserConversationProcessor.ts | 10 +++++ worker/agents/planning/blueprint.ts | 16 ++++--- worker/agents/planning/templateSelector.ts | 7 ++- worker/agents/prompts.ts | 6 ++- worker/services/sandbox/BaseSandboxService.ts | 2 + worker/services/sandbox/sandboxTypes.ts | 45 ++++++++++--------- 11 files changed, 84 insertions(+), 35 deletions(-) diff --git a/worker/agents/assistants/codeDebugger.ts b/worker/agents/assistants/codeDebugger.ts index 64bcbbcb..09752066 100644 --- a/worker/agents/assistants/codeDebugger.ts +++ b/worker/agents/assistants/codeDebugger.ts @@ -459,7 +459,7 @@ The goal is working code, verified through evidence. Think internally, act decis The most important class of errors is the "Maximum update depth exceeded" error which you definitely need to identify and fix. 
Here are some important guidelines for identifying such issues and preventing them: -${PROMPT_UTILS.REACT_RENDER_LOOP_PREVENTION} +${PROMPT_UTILS.REACT_RENDER_LOOP_PREVENTION_LITE} ${PROMPT_UTILS.COMMON_DEP_DOCUMENTATION} diff --git a/worker/agents/assistants/projectsetup.ts b/worker/agents/assistants/projectsetup.ts index ef9c8f2e..2b22af1c 100644 --- a/worker/agents/assistants/projectsetup.ts +++ b/worker/agents/assistants/projectsetup.ts @@ -60,6 +60,8 @@ Output: - Focus on blueprint requirements only - cloudflare:workers is not needed, it's already installed +**Do not recommend installing \`cloudflare:workers\` or \`cloudflare:durable-objects\` as dependencies, these are already installed in the project always.** + ${PROMPT_UTILS.COMMANDS} @@ -128,6 +130,12 @@ ${error}`); context: this.inferenceContext, modelName: error? AIModels.GEMINI_2_5_FLASH : undefined, }); + + if (!results || !results.string) { + this.logger.error('Project setup returned no result after all retries'); + throw new Error('Failed to generate setup commands: inference returned null'); + } + this.logger.info(`Generated setup commands: ${results.string}`); this.save([createAssistantMessage(results.string)]); diff --git a/worker/agents/core/simpleGeneratorAgent.ts b/worker/agents/core/simpleGeneratorAgent.ts index badd16db..cf9476e3 100644 --- a/worker/agents/core/simpleGeneratorAgent.ts +++ b/worker/agents/core/simpleGeneratorAgent.ts @@ -355,7 +355,6 @@ export class SimpleCodeGeneratorAgent extends Agent { // Load the latest user configs const modelConfigService = new ModelConfigService(this.env); const userConfigsRecord = await modelConfigService.getUserModelConfigs(this.state.inferenceContext.userId); - const userModelConfigs: Record = {}; for (const [actionKey, mergedConfig] of Object.entries(userConfigsRecord)) { if (mergedConfig.isUserOverride) { @@ -370,7 +369,7 @@ export class SimpleCodeGeneratorAgent extends Agent { userModelConfigs, }, }); - this.logger().info(`Agent 
${this.getAgentId()} session: ${this.state.sessionId} onStart: User configs loaded successfully`, {userModelConfigs}); + this.logger().info(`Agent ${this.getAgentId()} session: ${this.state.sessionId} onStart: User configs loaded successfully`, {userConfigsRecord}); } private async gitInit() { @@ -1702,6 +1701,8 @@ export class SimpleCodeGeneratorAgent extends Agent { } const regenerated = await this.regenerateFile({ filePath: path, fileContents, filePurpose }, issues, 0); + // Invalidate cache + this.staticAnalysisCache = null; // Persist to sandbox instance await this.getSandboxServiceClient().writeFiles(sandboxInstanceId, [{ filePath: regenerated.filePath, fileContents: regenerated.fileContents }], `Deep debugger fix: ${path}`); return { path, diff: regenerated.lastDiff }; @@ -1766,6 +1767,8 @@ export class SimpleCodeGeneratorAgent extends Agent { fileCount: result.files.length }); + await this.deployToSandbox(savedFiles, false); + return { files: savedFiles.map(f => { return { path: f.filePath, diff --git a/worker/agents/operations/PhaseGeneration.ts b/worker/agents/operations/PhaseGeneration.ts index 1a824e4c..ad728af7 100644 --- a/worker/agents/operations/PhaseGeneration.ts +++ b/worker/agents/operations/PhaseGeneration.ts @@ -24,7 +24,7 @@ const SYSTEM_PROMPT = ` You are given the blueprint (PRD) and the client query. You will be provided with all previously implemented project phases, the current latest snapshot of the codebase, and any current runtime issues or static analysis reports. - **Your primary task:** Design the next phase of the project as a deployable milestone leading to project completion or to address any user feedbacks or reported bugs. + **Your primary task:** Design the next phase of the project as a deployable milestone leading to project completion or to address any user feedbacks or reported bugs (runtime error fixing is the highest priority). **Phase Planning Process:** 1. 
**ANALYZE** current codebase state and identify what's implemented vs. what remains @@ -36,6 +36,8 @@ const SYSTEM_PROMPT = ` - **Accessibility**: Proper semantic HTML, ARIA labels, keyboard navigation - **Supreme software development practices**: Follow the best coding principles and practices, and lay out the codebase in a way that is easy to maintain, extend and debug. 4. **VALIDATE** that the phase will be deployable with all views/pages working beautifully across devices + + Plan the phase name and description appropriately. They don't have to strictly adhere to the blueprint's roadmap as unforeseen issues may occur. The project needs to be fully ready to ship in a reasonable amount of time. Plan accordingly. If no more phases are needed, conclude by putting blank fields in the response. @@ -230,7 +232,7 @@ const issuesPromptFormatterWithGuidelines = (issues: IssueReport): string => { serialized = ` ${PROMPT_UTILS.COMMON_PITFALLS} -${issues.runtimeErrors.some((error) => error.message.includes('infinite loop') || error.message.includes('re-renders')) ? PROMPT_UTILS.REACT_RENDER_LOOP_PREVENTION: ''} +${issues.runtimeErrors.some((error) => error.message.includes('infinite loop') || error.message.includes('re-renders')) ? PROMPT_UTILS.REACT_RENDER_LOOP_PREVENTION_LITE: ''} ${serialized}`; } @@ -293,6 +295,11 @@ export class PhaseGenerationOperation extends AgentOperation You are a meticulous and forward-thinking Senior Software Architect and Product Manager at Cloudflare with extensive expertise in modern UI/UX design and visual excellence. - Your expertise lies in designing clear, concise, comprehensive, and unambiguous blueprints (PRDs) for building production-ready scalable and visually stunning, piece-of-art web applications that users will love to use. 
+ Your expertise lies in designing clear, concise, comprehensive, and unambiguous blueprints (PRDs) for building production-ready scalable and visually stunning, piece-of-art web applications that users will love to use, using Cloudflare workers and durable objects. @@ -24,7 +24,8 @@ const SYSTEM_PROMPT = ` Focus on a clear and comprehensive design that prioritizes STUNNING VISUAL DESIGN, polish and depth, be to the point, explicit and detailed in your response, and adhere to our development process. Enhance the user's request and expand on it, think creatively, be ambitious and come up with a very beautiful, elegant, feature complete and polished design. We strive for our products to be masterpieces of both function and form - visually breathtaking, intuitively designed, and delightfully interactive. - **REMEMBER: This is not a toy or demo project. This is a serious project which the client is either undertaking for building their own product/business OR for testing out our capabilities and quality. We do not just expect an MVP, We expect a production-ready, polished, and exceptional solution** + **REMEMBER: This is not a toy or educational project. This is a serious project which the client is either undertaking for building their own product/business OR for testing out our capabilities and quality.** + **Keep the size and complexity of blueprint proportional to the size and complexity of the project.** eg, No need to overengineer a 'todo' app. @@ -35,7 +36,6 @@ const SYSTEM_PROMPT = ` **VISUAL DESIGN EXCELLENCE**: Design the application frontend with exceptional attention to visual details - specify exact components, navigation patterns, headers, footers, color schemes, typography scales, spacing systems, micro-interactions, animations, hover states, loading states, and responsive behaviors. 
**USER EXPERIENCE FOCUS**: Plan intuitive user flows, clear information hierarchy, accessible design patterns, and delightful interactions that make users want to use the application. Build upon the provided template. Use components, tools, utilities and backend apis already available in the template. - Think and **BREAKDOWN** The project into multiple incremental phases that build upon each other to create a complete, polished product following our . @@ -100,6 +100,7 @@ const SYSTEM_PROMPT = ` ✅ Icon libraries: lucide-react, heroicons (specify in frameworks) ❌ Never: .png, .jpg, .svg, .gif files in phase files list Binary files cannot be generated. Always use the approaches above for visual content. + Do not recommend installing \`cloudflare:workers\` or \`cloudflare:durable-objects\` as dependencies, these are already installed in the project always. @@ -239,11 +240,14 @@ export async function generateBlueprint({ env, inferenceContext, query, language stream: stream, }); - if (results) { - // Filter and remove any pdf files - results.initialPhase.files = results.initialPhase.files.filter(f => !f.path.endsWith('.pdf')); + if (!results) { + logger.error('Blueprint generation returned no result after all retries'); + throw new Error('Failed to generate blueprint: inference returned null'); } + // Filter and remove any pdf files + results.initialPhase.files = results.initialPhase.files.filter(f => !f.path.endsWith('.pdf')); + // // A hack // if (results?.initialPhase) { // results.initialPhase.lastPhase = false; diff --git a/worker/agents/planning/templateSelector.ts b/worker/agents/planning/templateSelector.ts index 857f4bd0..9b7e68db 100644 --- a/worker/agents/planning/templateSelector.ts +++ b/worker/agents/planning/templateSelector.ts @@ -35,7 +35,7 @@ export async function selectTemplate({ env, query, availableTemplates, inference templateCount: availableTemplates.length }); - availableTemplates = availableTemplates.filter(t => t.projectType !== 'presentation'); 
+ availableTemplates = availableTemplates.filter(t => t.projectType !== 'presentation' && !t.disabled && !t.name.includes('minimal')); const validTemplateNames = availableTemplates.map(t => t.name); const templateDescriptions = availableTemplates.map((t, index) => @@ -128,6 +128,11 @@ ENTROPY SEED: ${generateSecureToken(64)} - for unique results`; maxTokens: 2000, }); + if (!selection) { + logger.error('Template selection returned no result after all retries'); + throw new Error('Failed to select template: inference returned null'); + } + logger.info(`AI template selection result: ${selection.selectedTemplateName || 'None'}, Reasoning: ${selection.reasoning}`); return selection; diff --git a/worker/agents/prompts.ts b/worker/agents/prompts.ts index 42dd8d6a..83389c45 100644 --- a/worker/agents/prompts.ts +++ b/worker/agents/prompts.ts @@ -70,8 +70,9 @@ Template Usage Instructions: ${template.description.usage} -These files are forbidden to be modified. Do not touch them under any circumstances. +These files are forbidden to be modified. Do not touch them under any circumstances. Doing so will break the application. ${(template.dontTouchFiles ?? []).join('\n')} +worker/core-utils.ts @@ -888,6 +889,7 @@ COMMON_PITFALLS: ` Applying this rule to your situation will fix both the type-check errors and the browser's runtime error. # Never write image files! Never write jpeg, png, svg, etc files yourself! Always use some image url from the web. + **Do not recommend installing \`cloudflare:workers\` or \`cloudflare:durable-objects\` as dependencies, these are already installed in the project always.** `, COMMON_DEP_DOCUMENTATION: ` @@ -1368,7 +1370,7 @@ export function issuesPromptFormatter(issues: IssueReport): string { ### 1. 
CRITICAL RUNTIME ERRORS (Fix First - Deployment Blockers) **Error Count:** ${issues.runtimeErrors?.length || 0} runtime errors detected -**Contains Render Loops:** ${runtimeErrorsText.includes('Maximum update depth') || runtimeErrorsText.includes('Too many re-renders') ? 'YES - HIGHEST PRIORITY' : 'No'} +**Contains Render Loops:** ${runtimeErrorsText.includes('Maximum update depth') || runtimeErrorsText.includes('Too many re-renders') || runtimeErrorsText.includes('infinite loop') ? 'YES - HIGHEST PRIORITY' : 'No'} ${runtimeErrorsText || 'No runtime errors detected'} diff --git a/worker/services/sandbox/BaseSandboxService.ts b/worker/services/sandbox/BaseSandboxService.ts index 152f5cd7..81e4f18e 100644 --- a/worker/services/sandbox/BaseSandboxService.ts +++ b/worker/services/sandbox/BaseSandboxService.ts @@ -92,6 +92,7 @@ export abstract class BaseSandboxService { language: t.language, frameworks: t.frameworks || [], description: t.description, + disabled: t.disabled ?? false, projectType: t.projectType || 'app', renderMode: t.renderMode, slideDirectory: t.slideDirectory, @@ -177,6 +178,7 @@ export abstract class BaseSandboxService { selection: catalogInfo?.description.selection || '', usage: catalogInfo?.description.usage || '' }, + disabled: catalogInfo?.disabled ?? 
false, fileTree, allFiles: filesMap, language: catalogInfo?.language, diff --git a/worker/services/sandbox/sandboxTypes.ts b/worker/services/sandbox/sandboxTypes.ts index 648e9b9a..68a8c001 100644 --- a/worker/services/sandbox/sandboxTypes.ts +++ b/worker/services/sandbox/sandboxTypes.ts @@ -21,27 +21,6 @@ export const TemplateFileSchema = z.object({ }) export type TemplateFile = z.infer -// --- Template Details --- - -export const TemplateDetailsSchema = z.object({ - name: z.string(), - description: z.object({ - selection: z.string(), - usage: z.string(), - }), - fileTree: FileTreeNodeSchema, - allFiles: z.record(z.string(), z.string()), - language: z.string().optional(), - deps: z.record(z.string(), z.string()), - frameworks: z.array(z.string()).optional(), - importantFiles: z.array(z.string()), - dontTouchFiles: z.array(z.string()), - redactedFiles: z.array(z.string()), - renderMode: z.enum(['sandbox', 'browser']).optional(), - slideDirectory: z.string().optional(), -}) -export type TemplateDetails = z.infer - // ========================================== // RUNTIME ERROR SCHEMAS // ========================================== @@ -98,6 +77,29 @@ export type CommandExecutionResult = z.infer + // /templates (GET) export const TemplateInfoSchema = z.object({ @@ -111,6 +113,7 @@ export const TemplateInfoSchema = z.object({ }), renderMode: z.enum(['sandbox', 'browser']).optional(), slideDirectory: z.string().optional(), + disabled: z.boolean(), }) export type TemplateInfo = z.infer From b0db701e82fad2d00820b7bd127b412017301cce Mon Sep 17 00:00:00 2001 From: Ashish Kumar Singh Date: Thu, 27 Nov 2025 23:52:37 -0500 Subject: [PATCH 4/5] feat: rewrite process monitoring and error handling Upgrade sandbox Docker image from 0.5.2 to 0.5.6. Add instance ID validation with pattern matching and length limits to prevent path traversal. Extract integer argument parsing to dedicated helper function. 
Improve Result type narrowing with explicit 'error' field checks throughout error and log commands. Convert async storage cleanup to synchronous close() calls. Enhance file lock mechanism in log reset with stale lock detection and retry logic matching --- SandboxDockerfile | 2 +- bun.lock | 4 +- container/cli-tools.ts | 223 +- container/monitor-cli.test.ts | 2645 ++++++++++++++++++++ container/process-monitor.ts | 919 +++++-- container/storage.ts | 62 +- container/types.ts | 18 +- package.json | 2 +- worker/agents/core/simpleGeneratorAgent.ts | 2 +- 9 files changed, 3598 insertions(+), 279 deletions(-) create mode 100644 container/monitor-cli.test.ts diff --git a/SandboxDockerfile b/SandboxDockerfile index 56a90597..9ad3e882 100644 --- a/SandboxDockerfile +++ b/SandboxDockerfile @@ -1,5 +1,5 @@ # FROM docker.io/cloudflare/sandbox:0.1.3 -FROM docker.io/cloudflare/sandbox:0.5.2 +FROM docker.io/cloudflare/sandbox:0.5.6 ARG TARGETARCH RUN apt-get update && \ diff --git a/bun.lock b/bun.lock index 9ab84be8..9994d001 100644 --- a/bun.lock +++ b/bun.lock @@ -6,7 +6,7 @@ "dependencies": { "@ashishkumar472/cf-git": "1.0.5", "@cloudflare/containers": "^0.0.28", - "@cloudflare/sandbox": "^0.5.2", + "@cloudflare/sandbox": "^0.5.6", "@noble/ciphers": "^1.3.0", "@octokit/rest": "^22.0.1", "@radix-ui/react-accordion": "^1.2.12", @@ -230,7 +230,7 @@ "@cloudflare/kv-asset-handler": ["@cloudflare/kv-asset-handler@0.4.0", "", { "dependencies": { "mime": "^3.0.0" } }, "sha512-+tv3z+SPp+gqTIcImN9o0hqE9xyfQjI1XD9pL6NuKjua9B1y7mNYv0S9cP+QEbA4ppVgGZEmKOvHX5G5Ei1CVA=="], - "@cloudflare/sandbox": ["@cloudflare/sandbox@0.5.2", "", { "dependencies": { "@cloudflare/containers": "^0.0.30" }, "peerDependencies": { "@openai/agents": "^0.3.2" }, "optionalPeers": ["@openai/agents"] }, "sha512-YXKeFr+FkPQTdKzmQv0khRV9JcK+W9ZpfxqZ4xu9TRzt0BHXLbjACSoIWkVtkuUPK1BTQdwqI5Ku1fYpM1pqGw=="], + "@cloudflare/sandbox": ["@cloudflare/sandbox@0.5.6", "", { "dependencies": { "@cloudflare/containers": "^0.0.30" }, 
"peerDependencies": { "@openai/agents": "^0.3.3" }, "optionalPeers": ["@openai/agents"] }, "sha512-N4mdUY5lVWQHQcAZnTecJne95uSyf7KVPbJGMr3G6jS7oyphriw6Hxa5I/HlDUdMWjaTBgeOuDEb4m3wo2IyOg=="], "@cloudflare/unenv-preset": ["@cloudflare/unenv-preset@2.7.8", "", { "peerDependencies": { "unenv": "2.0.0-rc.21", "workerd": "^1.20250927.0" }, "optionalPeers": ["workerd"] }, "sha512-Ky929MfHh+qPhwCapYrRPwPVHtA2Ioex/DbGZyskGyNRDe9Ru3WThYZivyNVaPy5ergQSgMs9OKrM9Ajtz9F6w=="], diff --git a/container/cli-tools.ts b/container/cli-tools.ts index 9eb35abb..216d5214 100755 --- a/container/cli-tools.ts +++ b/container/cli-tools.ts @@ -23,6 +23,40 @@ import { getLogDbPath } from './types.js'; +// Instance ID validation pattern - alphanumeric with dashes and underscores +const INSTANCE_ID_PATTERN = /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/; +const MAX_INSTANCE_ID_LENGTH = 64; + +/** + * Validate instance ID format to prevent path traversal and other issues + */ +function validateInstanceId(id: string): void { + if (!id || id.length === 0) { + throw new Error('Instance ID is required'); + } + if (id.length > MAX_INSTANCE_ID_LENGTH) { + throw new Error(`Instance ID must be ${MAX_INSTANCE_ID_LENGTH} characters or less`); + } + if (!INSTANCE_ID_PATTERN.test(id)) { + throw new Error('Instance ID must start with alphanumeric and contain only alphanumeric, dash, or underscore'); + } +} + +/** + * Safely parse integer argument with validation + */ +function parseIntArg(args: Record, key: string): number | undefined { + const value = args[key]; + if (value === undefined || value === null) { + return undefined; + } + const parsed = parseInt(String(value), 10); + if (isNaN(parsed)) { + throw new Error(`Invalid integer value for --${key}: ${value}`); + } + return parsed; +} + class SafeJSON { static stringify(data: unknown, space?: number): string { try { @@ -37,7 +71,7 @@ class SafeJSON { return value; }, space); return json; - } catch (error) { + } catch { // Fallback for any stringify errors return 
JSON.stringify({ error: 'Failed to serialize data', @@ -50,7 +84,7 @@ class SafeJSON { static parse(text: string): unknown { try { return JSON.parse(text); - } catch (error) { + } catch { // Return error object for failed parsing return { success: false, @@ -62,16 +96,15 @@ class SafeJSON { } class SafeCleanup { - static async closeStorage(storage: StorageManager | null, timeoutMs: number = 2000): Promise { + /** + * Safely close storage manager. + * Storage.close() is synchronous, so we just wrap it in try-catch. + */ + static closeStorage(storage: StorageManager | null): void { if (!storage) return; - + try { - const timeout = setTimeout(() => { - console.warn('Storage close timeout, forcing cleanup'); - }, timeoutMs); - storage.close(); - clearTimeout(timeout); } catch (error) { console.warn('Storage close error:', error); // Don't throw - we're in cleanup @@ -239,7 +272,7 @@ class ProcessCommands { }); const startResult = await runner.start(); - if (!startResult.success) { + if (!startResult.success && 'error' in startResult) { OutputFormatter.formatError(`Failed to start process: ${startResult.error.message}`); process.exit(1); } @@ -309,7 +342,7 @@ class ProcessCommands { if (runner) { const stopResult = await runner.stop(options.force); - if (!stopResult.success) { + if (!stopResult.success && 'error' in stopResult) { OutputFormatter.formatError(`Failed to stop process: ${stopResult.error.message}`); process.exit(1); } @@ -422,8 +455,10 @@ class ProcessRunner { // Convert Result to Result if (stopResult.success) { return { success: true, data: true }; - } else { + } else if ('error' in stopResult) { return { success: false, error: stopResult.error }; + } else { + return { success: false, error: new Error('Unknown error stopping process') }; } } catch (error) { return { @@ -503,7 +538,7 @@ class ErrorCommands { storage = new StorageManager(options.dbPath); const result = storage.getErrors(options.instanceId); - if (!result.success) { + if (!result.success && 
'error' in result) { throw result.error; } @@ -559,7 +594,10 @@ class ErrorCommands { const clearResult = storage.clearErrors(options.instanceId); if (!clearResult.success) { - throw clearResult.error; + if ('error' in clearResult) { + throw clearResult.error; + } + throw new Error('Failed to clear errors'); } resetInfo = { clearedCount: clearResult.data.clearedCount }; @@ -588,18 +626,10 @@ class ErrorCommands { // Fallback if formatting fails console.error(SafeJSON.stringify({ success: false, error: String(error) })); } - - // Ensure cleanup before exit - if (storage) { - try { - storage.close(); - } catch (closeError) { - // Ignore close errors - } - } process.exit(1); } finally { - await SafeCleanup.closeStorage(storage); + // Single cleanup point - close storage only once + SafeCleanup.closeStorage(storage); } } @@ -608,7 +638,7 @@ class ErrorCommands { try { const result = storage.getErrorSummary(options.instanceId); - if (!result.success) { + if (!result.success && 'error' in result) { throw result.error; } @@ -630,7 +660,7 @@ class ErrorCommands { ); process.exit(1); } finally { - await SafeCleanup.closeStorage(storage); + SafeCleanup.closeStorage(storage); } } @@ -644,7 +674,7 @@ class ErrorCommands { try { const result = storage.clearErrors(options.instanceId); - if (!result.success) { + if (!result.success && 'error' in result) { throw result.error; } @@ -666,7 +696,7 @@ class ErrorCommands { ); process.exit(1); } finally { - await SafeCleanup.closeStorage(storage); + SafeCleanup.closeStorage(storage); } } @@ -707,7 +737,7 @@ class LogCommands { }; const result = storage.getLogs(filter); - if (!result.success) { + if (!result.success && 'error' in result) { throw result.error; } @@ -731,7 +761,7 @@ class LogCommands { ); process.exit(1); } finally { - await SafeCleanup.closeStorage(storage); + SafeCleanup.closeStorage(storage); } } @@ -746,66 +776,86 @@ class LogCommands { try { const { promises: fs } = require('fs'); const { join } = require('path'); - 
+ const { randomUUID } = require('crypto'); + const logFilePath = join(getDataDirectory(), `${options.instanceId}-process.log`); const lockFilePath = `${logFilePath}.lock`; - const tempPath = `${logFilePath}.tmp.${Date.now()}.${process.pid}`; - + const tempPath = `${logFilePath}.tmp.${randomUUID()}.${process.pid}`; + let logs = ''; - - // Simple file-based locking to prevent concurrent access + + const LOCK_STALE_MS = 30000; // Must match FileLock in process-monitor.ts + const MAX_RETRIES = 10; + const RETRY_DELAY_MS = 50; + + // File-based locking coordinated with SimpleLogManager const acquireLock = async (): Promise => { - try { - await fs.writeFile(lockFilePath, process.pid.toString(), { flag: 'wx' }); - return true; - } catch (error) { - return false; + for (let attempt = 0; attempt < MAX_RETRIES; attempt++) { + try { + await fs.writeFile(lockFilePath, `${process.pid}:${Date.now()}`, { flag: 'wx' }); + return true; + } catch (error: unknown) { + const fsError = error as { code?: string }; + if (fsError?.code === 'EEXIST') { + // Lock exists - check if stale + try { + const content = await fs.readFile(lockFilePath, 'utf8'); + const [, timestamp] = content.split(':'); + const lockTime = parseInt(timestamp, 10); + if (Date.now() - lockTime > LOCK_STALE_MS) { + // Stale lock - remove and retry + await fs.unlink(lockFilePath).catch(() => {}); + continue; + } + } catch { + // Can't read lock file - try again + } + // Wait and retry + await new Promise(resolve => setTimeout(resolve, RETRY_DELAY_MS + Math.random() * RETRY_DELAY_MS)); + } else { + return false; + } + } } + return false; }; - + const releaseLock = async (): Promise => { try { await fs.unlink(lockFilePath); - } catch (error) { + } catch { // Ignore lock release errors } }; - - // Try to acquire lock with retry - let lockAcquired = false; - for (let attempt = 0; attempt < 5; attempt++) { - lockAcquired = await acquireLock(); - if (lockAcquired) break; - - // Wait briefly before retry - await new 
Promise(resolve => setTimeout(resolve, 50 + Math.random() * 50)); - } - + + const lockAcquired = await acquireLock(); + if (!lockAcquired) { - throw new Error('Could not acquire file lock for log reset operation'); + throw new Error('Could not acquire file lock for log operation'); } - + try { if (options.reset) { // Reset mode: Atomic operation to read and clear the file try { await fs.rename(logFilePath, tempPath); - + // Create new empty log file immediately await fs.writeFile(logFilePath, '', 'utf8').catch(() => {}); - + // Read from temp file and clean up try { logs = await fs.readFile(tempPath, 'utf8'); await fs.unlink(tempPath).catch(() => {}); // Clean up temp file - } catch (error) { + } catch { // If we can't read temp file, at least clean it up await fs.unlink(tempPath).catch(() => {}); logs = ''; } - } catch (error) { + } catch (error: unknown) { // File doesn't exist yet, return empty - if ((error as any).code === 'ENOENT') { + const fsError = error as { code?: string }; + if (fsError?.code === 'ENOENT') { logs = ''; } else { throw error; @@ -815,9 +865,10 @@ class LogCommands { // Read-only mode: Just read the file without resetting try { logs = await fs.readFile(logFilePath, 'utf8'); - } catch (error) { + } catch (error: unknown) { // File doesn't exist yet, return empty - if ((error as any).code === 'ENOENT') { + const fsError = error as { code?: string }; + if (fsError?.code === 'ENOENT') { logs = ''; } else { throw error; @@ -894,7 +945,7 @@ class LogCommands { try { const result = storage.getLogStats(options.instanceId); - if (!result.success) { + if (!result.success && 'error' in result) { throw result.error; } @@ -916,7 +967,7 @@ class LogCommands { ); process.exit(1); } finally { - await SafeCleanup.closeStorage(storage); + SafeCleanup.closeStorage(storage); } } @@ -930,7 +981,7 @@ class LogCommands { try { const result = storage.clearLogs(options.instanceId); - if (!result.success) { + if (!result.success && 'error' in result) { throw 
result.error; } @@ -952,7 +1003,7 @@ class LogCommands { ); process.exit(1); } finally { - await SafeCleanup.closeStorage(storage); + SafeCleanup.closeStorage(storage); } } } @@ -1185,20 +1236,21 @@ async function handleProcessCommand(subcommand: string, args: Record console.log(JSON.stringify({ + level, msg, time: Date.now(), ...extra + })); + pinoLog(30, 'VITE v5.4.0 ready in 234 ms'); + pinoLog(30, 'Local: http://localhost:5173/'); + pinoLog(30, 'Network: use --host to expose'); + // Keep alive + setInterval(() => { + pinoLog(30, 'HMR update detected'); + }, 500); + `, + + // Emit error then continue running + error: ` + const pinoLog = (level, msg, extra = {}) => console.log(JSON.stringify({ + level, msg, time: Date.now(), ...extra + })); + pinoLog(30, 'VITE v5.4.0 ready in 234 ms'); + setTimeout(() => { + pinoLog(50, 'React error: Cannot read property of undefined'); + }, 100); + setInterval(() => {}, 1000); + `, + + // Emit fatal error and crash + crash: ` + const pinoLog = (level, msg, extra = {}) => console.log(JSON.stringify({ + level, msg, time: Date.now(), ...extra + })); + pinoLog(30, 'VITE v5.4.0 ready in 150 ms'); + setTimeout(() => { + pinoLog(60, 'fatal error: Maximum call stack size exceeded'); + process.exit(1); + }, 200); + `, + + // Clean exit + cleanExit: ` + const pinoLog = (level, msg, extra = {}) => console.log(JSON.stringify({ + level, msg, time: Date.now(), ...extra + })); + pinoLog(30, 'VITE v5.4.0 ready in 100 ms'); + setTimeout(() => { + pinoLog(30, 'Shutting down...'); + process.exit(0); + }, 300); + `, + + // Hang (no output) + hang: ` + // Start but produce no output - simulates hung process + setInterval(() => {}, 1000); + `, + + // High-frequency log output + flood: ` + const pinoLog = (level, msg, extra = {}) => console.log(JSON.stringify({ + level, msg, time: Date.now(), ...extra + })); + pinoLog(30, 'VITE v5.4.0 ready in 50 ms'); + let counter = 0; + const interval = setInterval(() => { + for (let i = 0; i < 100; i++) { + 
pinoLog(30, 'HMR update ' + (counter++)); + } + if (counter >= 1000) { + clearInterval(interval); + pinoLog(30, 'Flood complete'); + } + }, 10); + `, + + // EADDRINUSE error + portConflict: ` + const pinoLog = (level, msg, extra = {}) => console.log(JSON.stringify({ + level, msg, time: Date.now(), ...extra + })); + pinoLog(60, 'Error: listen EADDRINUSE: address already in use :::5173'); + process.exit(1); + `, + + // Module not found error + moduleNotFound: ` + const pinoLog = (level, msg, extra = {}) => console.log(JSON.stringify({ + level, msg, time: Date.now(), ...extra + })); + pinoLog(60, 'Error: Cannot find module \\'./missing-component\\''); + process.exit(1); + `, + + // Compilation error + compilationError: ` + const pinoLog = (level, msg, extra = {}) => console.log(JSON.stringify({ + level, msg, time: Date.now(), ...extra + })); + pinoLog(30, 'VITE v5.4.0 ready in 200 ms'); + setTimeout(() => { + pinoLog(50, 'Failed to compile: SyntaxError: Unexpected token in App.tsx:42:15'); + pinoLog(50, 'Error in ./src/App.tsx'); + }, 150); + setInterval(() => {}, 1000); + `, + + // React error boundary crash + reactCrash: ` + const pinoLog = (level, msg, extra = {}) => console.log(JSON.stringify({ + level, msg, time: Date.now(), ...extra + })); + pinoLog(30, 'VITE v5.4.0 ready in 180 ms'); + setTimeout(() => { + pinoLog(50, 'Uncaught TypeError: Cannot read properties of null (reading \\'map\\')'); + pinoLog(50, 'The above error occurred in the component'); + pinoLog(50, 'React will try to recreate this component tree from scratch'); + }, 200); + setInterval(() => {}, 1000); + `, + + // Delayed startup (for health check tests) + slowStartup: ` + const pinoLog = (level, msg, extra = {}) => console.log(JSON.stringify({ + level, msg, time: Date.now(), ...extra + })); + // Wait 2 seconds before ready message + setTimeout(() => { + pinoLog(30, 'VITE v5.4.0 ready in 2000 ms'); + }, 2000); + setInterval(() => {}, 1000); + `, + + // Multiple crashes for restart testing + 
crashAfterRun: ` + const pinoLog = (level, msg, extra = {}) => console.log(JSON.stringify({ + level, msg, time: Date.now(), ...extra + })); + const runId = process.env.RUN_ID || Date.now(); + pinoLog(30, 'Starting run ' + runId); + pinoLog(30, 'VITE v5.4.0 ready in 100 ms'); + setTimeout(() => { + pinoLog(60, 'Crash ' + runId); + process.exit(1); + }, 500); + `, + + // Echo mode - echoes stdin lines (for testing input) + echo: ` + const readline = require('readline'); + const rl = readline.createInterface({ input: process.stdin }); + console.log(JSON.stringify({ level: 30, msg: 'Echo server ready', time: Date.now() })); + rl.on('line', (line) => { + console.log(JSON.stringify({ level: 30, msg: 'Echo: ' + line, time: Date.now() })); + }); + `, + + // Plain text output (non-JSON) + plainText: ` + console.log('Plain text log line 1'); + console.log('Plain text log line 2'); + console.error('Error output line'); + setTimeout(() => { + console.log('Done'); + process.exit(0); + }, 200); + `, + + // Mixed JSON and plain text + mixedOutput: ` + const pinoLog = (level, msg) => console.log(JSON.stringify({ level, msg, time: Date.now() })); + pinoLog(30, 'JSON message 1'); + console.log('Plain text message'); + pinoLog(50, 'JSON error message'); + console.error('Plain stderr message'); + pinoLog(30, 'JSON message 2'); + setTimeout(() => process.exit(0), 300); + ` +}; + +// ============================================================================ +// TEST HELPERS +// ============================================================================ + +/** + * Generate unique instance ID for test isolation + */ +function getTestInstanceId(): string { + return `test-${Date.now()}-${++testInstanceCounter}`; +} + +/** + * Create a temporary data directory for tests + */ +async function createTempDataDir(): Promise { + const dir = join(process.cwd(), '.test-data', randomUUID()); + await fs.mkdir(dir, { recursive: true }); + return dir; +} + +/** + * Clean up temporary directory + */ 
+async function cleanupTempDir(dir: string): Promise { + try { + await fs.rm(dir, { recursive: true, force: true }); + } catch { + // Ignore cleanup errors + } +} + +/** + * Wait for ProcessMonitor to reach a specific state + */ +function waitForState( + monitor: ProcessMonitor, + targetState: ProcessState, + timeout: number = TEST_TIMEOUT +): Promise { + return new Promise((resolve, reject) => { + const timeoutId = setTimeout(() => { + reject(new Error(`Timeout waiting for state: ${targetState} (current: ${monitor.getState()})`)); + }, timeout); + + const checkState = () => { + if (monitor.getState() === targetState) { + clearTimeout(timeoutId); + resolve(); + return true; + } + return false; + }; + + if (checkState()) return; + + const handler = (event: MonitoringEvent) => { + if (event.type === 'state_changed' && event.newState === targetState) { + clearTimeout(timeoutId); + monitor.removeListener('state_changed', handler); + resolve(); + } + }; + + monitor.on('state_changed', handler); + }); +} + +/** + * Wait for a specific event from ProcessMonitor + */ +function waitForEvent( + monitor: ProcessMonitor, + eventType: MonitoringEvent['type'], + timeout: number = TEST_TIMEOUT +): Promise { + return new Promise((resolve, reject) => { + const timeoutId = setTimeout(() => { + reject(new Error(`Timeout waiting for event: ${eventType}`)); + }, timeout); + + const handler = (event: MonitoringEvent) => { + if (event.type === eventType) { + clearTimeout(timeoutId); + monitor.removeListener(eventType, handler); + resolve(event); + } + }; + + monitor.on(eventType, handler); + }); +} + +/** + * Spawn a mock process with the given script + */ +function spawnMockProcess(scriptKey: keyof typeof MOCK_SCRIPTS, env?: Record): ChildProcess { + const script = MOCK_SCRIPTS[scriptKey]; + return spawn('bun', ['-e', script], { + env: { ...process.env, ...env }, + stdio: ['pipe', 'pipe', 'pipe'] + }); +} + +/** + * Create ProcessInfo for testing + */ +function 
createTestProcessInfo(instanceId: string, scriptKey: keyof typeof MOCK_SCRIPTS = 'startup'): ProcessInfo { + return { + id: `proc-${instanceId}-${Date.now()}`, + instanceId, + command: 'bun', + args: ['-e', MOCK_SCRIPTS[scriptKey]], + cwd: process.cwd(), + restartCount: 0 + }; +} + +/** + * Create ProcessMonitor with test defaults + */ +function createTestMonitor( + storage: StorageManager, + instanceId: string, + scriptKey: keyof typeof MOCK_SCRIPTS = 'startup', + options: MonitoringOptions = {} +): ProcessMonitor { + const processInfo = createTestProcessInfo(instanceId, scriptKey); + return new ProcessMonitor(processInfo, storage, { + autoRestart: false, + maxRestarts: 3, + restartDelay: 100, + healthCheckInterval: 5000, + errorBufferSize: 100, + ...options + }); +} + +/** + * Sleep helper + */ +function sleep(ms: number): Promise { + return new Promise(resolve => setTimeout(resolve, ms)); +} + +// ============================================================================ +// CIRCULAR BUFFER TESTS +// ============================================================================ + +describe('Unit: CircularBuffer', () => { + // Since CircularBuffer is private, we test its behavior through ProcessMonitor's getRecentLogs + + it('should handle empty buffer', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'startup'); + const logs = monitor.getRecentLogs(50); + expect(logs).toEqual([]); + } finally { + storage.close(); + } + }); + + it('should store and retrieve logs in order', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'startup'); + const result = 
await monitor.start(); + expect(result.success).toBe(true); + + // Wait for some logs to accumulate + await sleep(500); + + const logs = monitor.getRecentLogs(50); + expect(logs.length).toBeGreaterThan(0); + + // Verify logs are in chronological order + for (let i = 1; i < logs.length; i++) { + expect(logs[i].timestamp.getTime()).toBeGreaterThanOrEqual(logs[i - 1].timestamp.getTime()); + } + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should respect buffer capacity', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + const bufferSize = 50; + try { + const monitor = createTestMonitor(storage, instanceId, 'flood', { + errorBufferSize: bufferSize + }); + const result = await monitor.start(); + expect(result.success).toBe(true); + + // Wait for flood to complete + await sleep(2000); + + const logs = monitor.getRecentLogs(1000); + // Buffer should not exceed configured size + expect(logs.length).toBeLessThanOrEqual(bufferSize); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should handle slice operations correctly', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'startup'); + const result = await monitor.start(); + expect(result.success).toBe(true); + + await sleep(1000); + + const allLogs = monitor.getRecentLogs(100); + const lastFive = monitor.getRecentLogs(5); + + expect(lastFive.length).toBeLessThanOrEqual(5); + if (allLogs.length >= 5) { + expect(lastFive).toEqual(allLogs.slice(-5)); + } + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); +}); + +// 
============================================================================ +// FILE LOCK TESTS +// ============================================================================ + +describe('Unit: FileLock', () => { + it('should create lock file on acquire', async () => { + const instanceId = getTestInstanceId(); + const logFilePath = join(testDataDir, `${instanceId}-test.log`); + const lockFilePath = `${logFilePath}.lock`; + + // Write initial file + await fs.writeFile(logFilePath, 'test content'); + + // Acquire lock by simulating what FileLock does + await fs.writeFile(lockFilePath, `${process.pid}:${Date.now()}`, { flag: 'wx' }); + + // Verify lock file exists + const lockExists = await fs.access(lockFilePath).then(() => true).catch(() => false); + expect(lockExists).toBe(true); + + // Clean up + await fs.unlink(lockFilePath).catch(() => {}); + await fs.unlink(logFilePath).catch(() => {}); + }); + + it('should detect stale locks', async () => { + const instanceId = getTestInstanceId(); + const logFilePath = join(testDataDir, `${instanceId}-stale.log`); + const lockFilePath = `${logFilePath}.lock`; + + // Create stale lock (timestamp from 60 seconds ago) + const staleTime = Date.now() - 60000; + await fs.writeFile(lockFilePath, `99999:${staleTime}`); + + // Verify we can read the stale timestamp + const content = await fs.readFile(lockFilePath, 'utf8'); + const [, timestamp] = content.split(':'); + const lockTime = parseInt(timestamp, 10); + + // Lock is older than 30 seconds, should be considered stale + expect(Date.now() - lockTime).toBeGreaterThan(30000); + + // Clean up + await fs.unlink(lockFilePath).catch(() => {}); + }); + + it('should block concurrent lock attempts', async () => { + const instanceId = getTestInstanceId(); + const logFilePath = join(testDataDir, `${instanceId}-concurrent.log`); + const lockFilePath = `${logFilePath}.lock`; + + // First lock + await fs.writeFile(lockFilePath, `${process.pid}:${Date.now()}`, { flag: 'wx' }); + + // Second 
lock should fail with EEXIST + let error: Error | null = null; + try { + await fs.writeFile(lockFilePath, `${process.pid}:${Date.now()}`, { flag: 'wx' }); + } catch (e) { + error = e as Error; + } + + expect(error).not.toBeNull(); + expect((error as NodeJS.ErrnoException).code).toBe('EEXIST'); + + // Clean up + await fs.unlink(lockFilePath).catch(() => {}); + }); +}); + +// ============================================================================ +// SIMPLE LOG MANAGER TESTS (via ProcessMonitor) +// ============================================================================ + +describe('Unit: SimpleLogManager', () => { + it('should append and retrieve logs', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'startup'); + const result = await monitor.start(); + expect(result.success).toBe(true); + + await sleep(500); + + const logs = await monitor.getAllLogsAndReset(); + expect(logs.length).toBeGreaterThan(0); + expect(logs).toContain('VITE'); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should reset log file after retrieval', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'startup'); + const result = await monitor.start(); + expect(result.success).toBe(true); + + await sleep(500); + + // First retrieval should have content + const logs1 = await monitor.getAllLogsAndReset(); + expect(logs1.length).toBeGreaterThan(0); + + // Immediately after reset, logs should be empty or minimal + const logs2 = await monitor.getAllLogsAndReset(); + expect(logs2.length).toBeLessThan(logs1.length); + + await 
monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should handle concurrent read/write operations', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'flood'); + const result = await monitor.start(); + expect(result.success).toBe(true); + + // Concurrent reads while flooding + const reads = await Promise.all([ + sleep(100).then(() => monitor.getAllLogsAndReset()), + sleep(200).then(() => monitor.getAllLogsAndReset()), + sleep(300).then(() => monitor.getAllLogsAndReset()) + ]); + + // All reads should complete without error + for (const logs of reads) { + expect(typeof logs).toBe('string'); + } + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); +}); + +// ============================================================================ +// PROCESS MONITOR STATE MACHINE TESTS +// ============================================================================ + +describe('Unit: ProcessMonitor State Machine', () => { + it('should start in stopped state', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'startup'); + expect(monitor.getState()).toBe('stopped'); + } finally { + storage.close(); + } + }); + + it('should transition stopped -> starting -> running', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'startup'); + const states: ProcessState[] = []; + + monitor.on('state_changed', 
(event: MonitoringEvent) => { + if (event.type === 'state_changed') { + states.push(event.newState); + } + }); + + const result = await monitor.start(); + expect(result.success).toBe(true); + expect(monitor.getState()).toBe('running'); + expect(states).toContain('starting'); + expect(states).toContain('running'); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should transition running -> stopping -> stopped on graceful stop', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'startup'); + await monitor.start(); + expect(monitor.getState()).toBe('running'); + + const states: ProcessState[] = []; + monitor.on('state_changed', (event: MonitoringEvent) => { + if (event.type === 'state_changed') { + states.push(event.newState); + } + }); + + await monitor.stop(); + expect(monitor.getState()).toBe('stopped'); + expect(states).toContain('stopping'); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should transition running -> crashed on unexpected exit', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'crash', { + autoRestart: false + }); + + const crashPromise = waitForEvent(monitor, 'process_crashed'); + await monitor.start(); + await crashPromise; + + expect(monitor.getState()).toBe('crashed'); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should prevent starting when already running', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, 
`${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'startup'); + await monitor.start(); + expect(monitor.getState()).toBe('running'); + + const result = await monitor.start(); + expect(result.success).toBe(false); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should prevent stopping when already stopped', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'startup'); + expect(monitor.getState()).toBe('stopped'); + + const result = await monitor.stop(); + expect(result.success).toBe(true); // Idempotent + + await monitor.cleanup(); + } finally { + storage.close(); + } + }); +}); + +// ============================================================================ +// PROCESS MONITOR STREAM PROCESSING TESTS +// ============================================================================ + +describe('Unit: ProcessMonitor Stream Processing', () => { + it('should handle JSON log lines', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'startup'); + await monitor.start(); + await sleep(500); + + const logs = monitor.getRecentLogs(50); + expect(logs.some(l => l.content.includes('VITE'))).toBe(true); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should handle plain text output', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + 
const monitor = createTestMonitor(storage, instanceId, 'plainText'); + await monitor.start(); + await waitForState(monitor, 'stopped', SHORT_TIMEOUT); + + const allLogs = await monitor.getAllLogsAndReset(); + expect(allLogs).toContain('Plain text'); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should handle mixed JSON and plain text', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'mixedOutput'); + await monitor.start(); + await waitForState(monitor, 'stopped', SHORT_TIMEOUT); + + const allLogs = await monitor.getAllLogsAndReset(); + expect(allLogs).toContain('JSON message'); + expect(allLogs).toContain('Plain text'); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should separate stdout and stderr', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'plainText'); + await monitor.start(); + await waitForState(monitor, 'stopped', SHORT_TIMEOUT); + + const logs = monitor.getRecentLogs(50); + const stdoutLogs = logs.filter(l => l.stream === 'stdout'); + const stderrLogs = logs.filter(l => l.stream === 'stderr'); + + expect(stdoutLogs.length).toBeGreaterThan(0); + expect(stderrLogs.length).toBeGreaterThan(0); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should handle high-frequency output', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = 
createTestMonitor(storage, instanceId, 'flood', { + errorBufferSize: 200 + }); + await monitor.start(); + + // Wait for flood to complete + await sleep(3000); + + const logs = monitor.getRecentLogs(200); + expect(logs.length).toBeGreaterThan(50); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, STRESS_TIMEOUT); +}); + +// ============================================================================ +// PROCESS MONITOR ERROR DETECTION TESTS +// ============================================================================ + +describe('Unit: ProcessMonitor Error Detection', () => { + it('should detect level 50 errors', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'error'); + const errorPromise = waitForEvent(monitor, 'error_detected'); + await monitor.start(); + + const event = await errorPromise; + expect(event.type).toBe('error_detected'); + if (event.type === 'error_detected') { + expect(event.error.level).toBe(50); + } + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should detect level 60 fatal errors', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'crash', { + autoRestart: false + }); + const errorPromise = waitForEvent(monitor, 'error_detected'); + await monitor.start(); + + const event = await errorPromise; + expect(event.type).toBe('error_detected'); + if (event.type === 'error_detected') { + expect(event.error.level).toBe(60); + } + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should store errors in storage', async () => { + 
const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'error'); + // Set up event listener BEFORE starting to avoid race condition + const errorPromise = waitForEvent(monitor, 'error_detected'); + await monitor.start(); + await errorPromise; + await sleep(200); + + const result = storage.getErrors(instanceId); + expect(result.success).toBe(true); + if (result.success) { + expect(result.data.length).toBeGreaterThan(0); + expect(result.data[0].level).toBe(50); + } + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should detect EADDRINUSE as fatal', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'portConflict', { + autoRestart: false + }); + const errorPromise = waitForEvent(monitor, 'error_detected'); + await monitor.start(); + + const event = await errorPromise; + if (event.type === 'error_detected') { + expect(event.error.message).toContain('EADDRINUSE'); + } + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should detect module not found as fatal', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'moduleNotFound', { + autoRestart: false + }); + const errorPromise = waitForEvent(monitor, 'error_detected'); + await monitor.start(); + + const event = await errorPromise; + if (event.type === 'error_detected') { + expect(event.error.message).toContain('Cannot find module'); + } + + 
await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should capture compilation errors', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'compilationError'); + const errorPromise = waitForEvent(monitor, 'error_detected'); + await monitor.start(); + + const event = await errorPromise; + if (event.type === 'error_detected') { + expect(event.error.message).toContain('SyntaxError'); + } + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should capture React errors', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'reactCrash'); + const errorPromise = waitForEvent(monitor, 'error_detected'); + await monitor.start(); + + const event = await errorPromise; + if (event.type === 'error_detected') { + expect(event.error.message).toContain('TypeError'); + } + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); +}); + +// ============================================================================ +// PROCESS MONITOR RESTART LOGIC TESTS +// ============================================================================ + +describe('Unit: ProcessMonitor Restart Logic', () => { + it('should auto-restart on crash when enabled', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'crash', { + autoRestart: true, + maxRestarts: 2, + restartDelay: 100 + }); + + 
let startCount = 0; + monitor.on('process_started', () => { + startCount++; + }); + + await monitor.start(); + + // Wait for restart attempts + await sleep(2000); + + // Should have started multiple times (initial + restarts) + expect(startCount).toBeGreaterThan(1); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should respect maxRestarts limit', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const maxRestarts = 2; + const monitor = createTestMonitor(storage, instanceId, 'crash', { + autoRestart: true, + maxRestarts, + restartDelay: 100 + }); + + let startCount = 0; + monitor.on('process_started', () => { + startCount++; + }); + + await monitor.start(); + + // Wait for all restart attempts + await sleep(3000); + + // Initial start + maxRestarts + expect(startCount).toBeLessThanOrEqual(maxRestarts + 1); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should not restart on clean exit', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'cleanExit', { + autoRestart: true, + maxRestarts: 3, + restartDelay: 100 + }); + + let startCount = 0; + monitor.on('process_started', () => { + startCount++; + }); + + await monitor.start(); + await waitForState(monitor, 'stopped', SHORT_TIMEOUT); + + // Wait to ensure no restart + await sleep(500); + + // Should only start once (clean exit = no restart) + expect(startCount).toBe(1); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should respect restartDelay', async () => { + const instanceId = getTestInstanceId(); + const storage = new 
StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const restartDelay = 500; + const monitor = createTestMonitor(storage, instanceId, 'crash', { + autoRestart: true, + maxRestarts: 1, + restartDelay + }); + + const startTimes: number[] = []; + monitor.on('process_started', () => { + startTimes.push(Date.now()); + }); + + await monitor.start(); + + // Wait for restart + await sleep(2000); + + if (startTimes.length >= 2) { + const delay = startTimes[1] - startTimes[0]; + // Delay should be at least restartDelay (with some tolerance for crash time) + expect(delay).toBeGreaterThanOrEqual(restartDelay - 100); + } + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); +}); + +// ============================================================================ +// STORAGE MANAGER TESTS +// ============================================================================ + +describe('Unit: StorageManager', () => { + it('should initialize databases', async () => { + const instanceId = getTestInstanceId(); + const errorDbPath = join(testDataDir, `${instanceId}-errors.db`); + const logDbPath = join(testDataDir, `${instanceId}-logs.db`); + + const storage = new StorageManager(errorDbPath, logDbPath); + + try { + const errorDbExists = await fs.access(errorDbPath).then(() => true).catch(() => false); + const logDbExists = await fs.access(logDbPath).then(() => true).catch(() => false); + + expect(errorDbExists).toBe(true); + expect(logDbExists).toBe(true); + } finally { + storage.close(); + } + }); + + it('should store and retrieve errors', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const error: SimpleError = { + timestamp: new Date().toISOString(), + level: 50, + message: 'Test error message', + rawOutput: 
'{"level":50,"msg":"Test error message"}' + }; + + const storeResult = storage.storeError(instanceId, 'proc-1', error); + expect(storeResult.success).toBe(true); + + const getResult = storage.getErrors(instanceId); + expect(getResult.success).toBe(true); + if (getResult.success) { + expect(getResult.data.length).toBe(1); + expect(getResult.data[0].message).toBe('Test error message'); + } + } finally { + storage.close(); + } + }); + + it('should deduplicate errors by hash', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const error: SimpleError = { + timestamp: new Date().toISOString(), + level: 50, + message: 'Duplicate error', + rawOutput: '{"level":50,"msg":"Duplicate error"}' + }; + + // Store same error multiple times + storage.storeError(instanceId, 'proc-1', error); + storage.storeError(instanceId, 'proc-1', { ...error, timestamp: new Date().toISOString() }); + storage.storeError(instanceId, 'proc-1', { ...error, timestamp: new Date().toISOString() }); + + const getResult = storage.getErrors(instanceId); + expect(getResult.success).toBe(true); + if (getResult.success) { + // Should be deduplicated + expect(getResult.data.length).toBe(1); + // Occurrence count should be incremented + expect(getResult.data[0].occurrenceCount).toBe(3); + } + } finally { + storage.close(); + } + }); + + it('should clear errors', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const error: SimpleError = { + timestamp: new Date().toISOString(), + level: 50, + message: 'Error to clear', + rawOutput: '{}' + }; + + storage.storeError(instanceId, 'proc-1', error); + + const clearResult = storage.clearErrors(instanceId); + expect(clearResult.success).toBe(true); + if 
(clearResult.success) { + expect(clearResult.data.clearedCount).toBe(1); + } + + const getResult = storage.getErrors(instanceId); + if (getResult.success) { + expect(getResult.data.length).toBe(0); + } + } finally { + storage.close(); + } + }); + + it('should get error summary', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + storage.storeError(instanceId, 'proc-1', { + timestamp: new Date().toISOString(), + level: 50, + message: 'Error 1', + rawOutput: '{}' + }); + storage.storeError(instanceId, 'proc-1', { + timestamp: new Date().toISOString(), + level: 60, + message: 'Fatal 1', + rawOutput: '{}' + }); + + const summaryResult = storage.getErrorSummary(instanceId); + expect(summaryResult.success).toBe(true); + if (summaryResult.success) { + expect(summaryResult.data.totalErrors).toBe(2); + expect(summaryResult.data.uniqueErrors).toBe(2); + expect(summaryResult.data.errorsByLevel[50]).toBe(1); + expect(summaryResult.data.errorsByLevel[60]).toBe(1); + } + } finally { + storage.close(); + } + }); + + it('should store and retrieve logs', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const storeResult = storage.storeLogs([ + { instanceId, processId: 'proc-1', level: 'info', message: 'Log 1', stream: 'stdout' }, + { instanceId, processId: 'proc-1', level: 'error', message: 'Log 2', stream: 'stderr' } + ]); + expect(storeResult.success).toBe(true); + + const getResult = storage.getLogs({ instanceId }); + expect(getResult.success).toBe(true); + if (getResult.success) { + expect(getResult.data.logs.length).toBe(2); + } + } finally { + storage.close(); + } + }); + + it('should clear logs', async () => { + const instanceId = getTestInstanceId(); + const storage = new 
StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + storage.storeLogs([ + { instanceId, processId: 'proc-1', level: 'info', message: 'Log', stream: 'stdout' } + ]); + + const clearResult = storage.clearLogs(instanceId); + expect(clearResult.success).toBe(true); + + const getResult = storage.getLogs({ instanceId }); + if (getResult.success) { + expect(getResult.data.logs.length).toBe(0); + } + } finally { + storage.close(); + } + }); + + it('should handle transaction correctly', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + storage.transaction(() => { + storage.storeError(instanceId, 'proc-1', { + timestamp: new Date().toISOString(), + level: 50, + message: 'Transaction error', + rawOutput: '{}' + }); + }); + + const getResult = storage.getErrors(instanceId); + expect(getResult.success).toBe(true); + if (getResult.success) { + expect(getResult.data.length).toBe(1); + } + } finally { + storage.close(); + } + }); +}); + +// ============================================================================ +// CLI VALIDATION TESTS +// ============================================================================ + +describe('Unit: CLI Validation', () => { + // Import validation functions (we test via process spawn to avoid import issues) + + it('should accept valid instance IDs', async () => { + const validIds = [ + 'my-app', + 'myApp123', + 'test_instance', + 'a', + 'A123-test_app' + ]; + + for (const id of validIds) { + // Validate pattern directly + const pattern = /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/; + expect(pattern.test(id)).toBe(true); + } + }); + + it('should reject invalid instance IDs', async () => { + const invalidIds = [ + '', + '-starts-with-dash', + '_starts_with_underscore', + '../path/traversal', + 'has spaces', + 'has@special', + 
'a'.repeat(65) // Too long + ]; + + const pattern = /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/; + const maxLen = 64; + + for (const id of invalidIds) { + const isValid = id.length > 0 && id.length <= maxLen && pattern.test(id); + expect(isValid).toBe(false); + } + }); + + it('should parse integer arguments correctly', () => { + const parseIntArg = (value: unknown): number | undefined => { + if (value === undefined || value === null) return undefined; + const parsed = parseInt(String(value), 10); + return isNaN(parsed) ? undefined : parsed; + }; + + expect(parseIntArg('123')).toBe(123); + expect(parseIntArg(456)).toBe(456); + expect(parseIntArg('0')).toBe(0); + expect(parseIntArg(undefined)).toBe(undefined); + expect(parseIntArg('invalid')).toBe(undefined); + }); + + it('should handle SafeJSON with circular references', () => { + // Simulate SafeJSON.stringify + const safeStringify = (data: unknown): string => { + try { + const seen = new WeakSet(); + return JSON.stringify(data, (_key, value) => { + if (typeof value === 'object' && value !== null) { + if (seen.has(value)) { + return '[Circular Reference]'; + } + seen.add(value); + } + return value; + }); + } catch { + return '{"error":"Failed to serialize"}'; + } + }; + + // Normal object + expect(safeStringify({ a: 1 })).toBe('{"a":1}'); + + // Circular reference + const circular: Record<string, unknown> = { a: 1 }; + circular.self = circular; + const result = safeStringify(circular); + expect(result).toContain('[Circular Reference]'); + }); +}); + +// ============================================================================ +// INTEGRATION TESTS +// ============================================================================ + +describe('Integration: Full Process Lifecycle', () => { + it('should complete start -> monitor -> stop cycle', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + 
const monitor = createTestMonitor(storage, instanceId, 'startup'); + + // Start + const startResult = await monitor.start(); + expect(startResult.success).toBe(true); + expect(monitor.getState()).toBe('running'); + + // Monitor for a bit + await sleep(500); + + // Verify logs are being collected + const logs = monitor.getRecentLogs(50); + expect(logs.length).toBeGreaterThan(0); + + // Stop + const stopResult = await monitor.stop(); + expect(stopResult.success).toBe(true); + expect(monitor.getState()).toBe('stopped'); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should capture errors during lifecycle', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'error'); + + await monitor.start(); + await sleep(500); + + // Verify errors were captured + const errorResult = storage.getErrors(instanceId); + expect(errorResult.success).toBe(true); + if (errorResult.success) { + expect(errorResult.data.length).toBeGreaterThan(0); + } + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should handle process info correctly', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'startup'); + + await monitor.start(); + + const info = monitor.getProcessInfo(); + expect(info.instanceId).toBe(instanceId); + expect(info.command).toBe('bun'); + expect(info.pid).toBeGreaterThan(0); + expect(info.status).toBe('running'); + expect(info.startTime).toBeInstanceOf(Date); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should emit all expected events', 
async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'error'); + const events: string[] = []; + + monitor.on('process_started', () => events.push('process_started')); + monitor.on('error_detected', () => events.push('error_detected')); + monitor.on('process_stopped', () => events.push('process_stopped')); + monitor.on('state_changed', () => events.push('state_changed')); + + await monitor.start(); + await sleep(500); + await monitor.stop(); + + expect(events).toContain('process_started'); + expect(events).toContain('error_detected'); + expect(events).toContain('process_stopped'); + expect(events).toContain('state_changed'); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); +}); + +describe('Integration: Error Management', () => { + it('should accumulate multiple unique errors', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + // Store different errors + storage.storeError(instanceId, 'proc-1', { + timestamp: new Date().toISOString(), + level: 50, + message: 'Error type A', + rawOutput: '{}' + }); + storage.storeError(instanceId, 'proc-1', { + timestamp: new Date().toISOString(), + level: 50, + message: 'Error type B', + rawOutput: '{}' + }); + storage.storeError(instanceId, 'proc-1', { + timestamp: new Date().toISOString(), + level: 60, + message: 'Error type C', + rawOutput: '{}' + }); + + const result = storage.getErrors(instanceId); + expect(result.success).toBe(true); + if (result.success) { + expect(result.data.length).toBe(3); + } + } finally { + storage.close(); + } + }); + + it('should track error occurrence counts', async () => { + const instanceId = 
getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const error = { + timestamp: new Date().toISOString(), + level: 50, + message: 'Repeated error', + rawOutput: '{}' + }; + + for (let i = 0; i < 5; i++) { + storage.storeError(instanceId, 'proc-1', { + ...error, + timestamp: new Date().toISOString() + }); + } + + const result = storage.getErrors(instanceId); + expect(result.success).toBe(true); + if (result.success) { + expect(result.data.length).toBe(1); // Deduplicated + expect(result.data[0].occurrenceCount).toBe(5); + } + } finally { + storage.close(); + } + }); +}); + +describe('Integration: Log Management', () => { + it('should retrieve logs with pagination', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + // Store many logs + const logs = Array.from({ length: 50 }, (_, i) => ({ + instanceId, + processId: 'proc-1', + level: 'info' as const, + message: `Log ${i}`, + stream: 'stdout' as const + })); + storage.storeLogs(logs); + + // Get first page + const page1 = storage.getLogs({ instanceId, limit: 10, offset: 0 }); + expect(page1.success).toBe(true); + if (page1.success) { + expect(page1.data.logs.length).toBe(10); + expect(page1.data.hasMore).toBe(true); + } + + // Get second page + const page2 = storage.getLogs({ instanceId, limit: 10, offset: 10 }); + expect(page2.success).toBe(true); + if (page2.success) { + expect(page2.data.logs.length).toBe(10); + } + } finally { + storage.close(); + } + }); + + it('should return log statistics', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + storage.storeLogs([ + { instanceId, processId: 
'proc-1', level: 'info', message: 'Info 1', stream: 'stdout' }, + { instanceId, processId: 'proc-1', level: 'info', message: 'Info 2', stream: 'stdout' }, + { instanceId, processId: 'proc-1', level: 'error', message: 'Error 1', stream: 'stderr' } + ]); + + const stats = storage.getLogStats(instanceId); + expect(stats.success).toBe(true); + if (stats.success) { + expect(stats.data.totalLogs).toBe(3); + expect(stats.data.logsByLevel.info).toBe(2); + expect(stats.data.logsByLevel.error).toBe(1); + expect(stats.data.logsByStream.stdout).toBe(2); + expect(stats.data.logsByStream.stderr).toBe(1); + } + } finally { + storage.close(); + } + }); +}); + +// ============================================================================ +// STRESS TESTS +// ============================================================================ + +describe('Stress: High Frequency Logs', () => { + it('should handle 1000+ log lines without data loss', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'flood', { + errorBufferSize: 500 + }); + + await monitor.start(); + + // Wait for flood to complete + await sleep(4000); + + // Get all logs + const logs = await monitor.getAllLogsAndReset(); + const lineCount = logs.split('\n').filter(l => l.trim()).length; + + // Should have captured many logs (some may be trimmed) + expect(lineCount).toBeGreaterThan(100); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, STRESS_TIMEOUT); + + it('should maintain responsiveness under load', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'flood'); + await monitor.start(); + + // 
Measure response time during flood + const startTime = Date.now(); + const state = monitor.getState(); + const elapsed = Date.now() - startTime; + + expect(state).toBe('running'); + expect(elapsed).toBeLessThan(100); // Should respond quickly + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); +}); + +describe('Stress: Rapid Restarts', () => { + it('should handle rapid restart cycles', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'crash', { + autoRestart: true, + maxRestarts: 5, + restartDelay: 50 + }); + + let restartCount = 0; + monitor.on('process_started', () => { + restartCount++; + }); + + await monitor.start(); + + // Wait for restart cycles + await sleep(3000); + + // Should have restarted multiple times + expect(restartCount).toBeGreaterThan(1); + expect(restartCount).toBeLessThanOrEqual(6); // Initial + 5 restarts + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, STRESS_TIMEOUT); + + it('should not leak resources on rapid start/stop', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + for (let i = 0; i < 5; i++) { + const monitor = createTestMonitor(storage, `${instanceId}-${i}`, 'startup'); + await monitor.start(); + await sleep(100); + await monitor.stop(); + await monitor.cleanup(); + } + + // If we got here without errors, resources are being cleaned up + expect(true).toBe(true); + } finally { + storage.close(); + } + }, STRESS_TIMEOUT); +}); + +describe('Stress: Concurrent Access', () => { + it('should handle concurrent log reads', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + 
join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'startup'); + await monitor.start(); + + // Concurrent reads + const reads = await Promise.all([ + monitor.getAllLogsAndReset(), + monitor.getAllLogsAndReset(), + monitor.getAllLogsAndReset(), + monitor.getRecentLogs(50), + monitor.getRecentLogs(50) + ]); + + // All reads should complete without error + for (const result of reads) { + expect(result).toBeDefined(); + } + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should handle concurrent error storage', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + // Concurrent error writes + const writes = await Promise.all( + Array.from({ length: 10 }, (_, i) => + Promise.resolve(storage.storeError(instanceId, 'proc-1', { + timestamp: new Date().toISOString(), + level: 50, + message: `Concurrent error ${i}`, + rawOutput: '{}' + })) + ) + ); + + // All writes should succeed + for (const result of writes) { + expect(result.success).toBe(true); + } + + // Verify all errors stored + const getResult = storage.getErrors(instanceId); + if (getResult.success) { + expect(getResult.data.length).toBe(10); + } + } finally { + storage.close(); + } + }); +}); + +describe('Stress: Process Killing', () => { + it('should handle SIGTERM during running state', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'startup'); + await monitor.start(); + + // Immediate stop + const stopResult = await monitor.stop(); + expect(stopResult.success).toBe(true); + 
expect(monitor.getState()).toBe('stopped'); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should handle stop during starting state gracefully', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'slowStartup'); + + // Start but don't wait + const startPromise = monitor.start(); + + // Small delay then stop + await sleep(50); + const stopResult = await monitor.stop(); + + // Should either succeed or return error (not crash) + expect(stopResult).toBeDefined(); + + await startPromise.catch(() => {}); // Handle any start errors + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); +}); + +// ============================================================================ +// REAL-WORLD SCENARIO TESTS +// ============================================================================ + +describe('Scenarios: Vite Dev Server', () => { + it('should detect successful startup', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'startup'); + await monitor.start(); + await sleep(500); + + const logs = await monitor.getAllLogsAndReset(); + expect(logs).toContain('VITE'); + expect(logs).toContain('ready'); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should capture compilation errors', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 
'compilationError'); + // Set up event listener BEFORE starting to avoid race condition + const errorPromise = waitForEvent(monitor, 'error_detected'); + await monitor.start(); + await errorPromise; + + const errors = storage.getErrors(instanceId); + if (errors.success && errors.data.length > 0) { + expect(errors.data.some(e => e.message.includes('SyntaxError'))).toBe(true); + } + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should handle EADDRINUSE gracefully', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'portConflict', { + autoRestart: false + }); + + const crashPromise = waitForEvent(monitor, 'process_crashed', SHORT_TIMEOUT); + await monitor.start(); + await crashPromise; + + expect(monitor.getState()).toBe('crashed'); + + const errors = storage.getErrors(instanceId); + if (errors.success && errors.data.length > 0) { + expect(errors.data.some(e => e.message.includes('EADDRINUSE'))).toBe(true); + } + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should restart on crash up to maxRestarts', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const maxRestarts = 2; + const monitor = createTestMonitor(storage, instanceId, 'crash', { + autoRestart: true, + maxRestarts, + restartDelay: 100 + }); + + let startCount = 0; + let crashCount = 0; + + monitor.on('process_started', () => startCount++); + monitor.on('process_crashed', () => crashCount++); + + await monitor.start(); + + // Wait for all restart attempts + await sleep(3000); + + // Should have exhausted all restart attempts + expect(crashCount).toBe(maxRestarts 
+ 1); + expect(startCount).toBe(maxRestarts + 1); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, STRESS_TIMEOUT); + + it('should emit health_check_failed for unresponsive process', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'hang', { + healthCheckInterval: 500, // Very short for testing + autoRestart: false + }); + + let healthFailed = false; + monitor.on('health_check_failed', () => { + healthFailed = true; + }); + + await monitor.start(); + + // Wait for health check to fail (need 2x interval with no activity) + await sleep(2000); + + expect(healthFailed).toBe(true); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should handle React error boundary crashes', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'reactCrash'); + + const errorPromise = waitForEvent(monitor, 'error_detected'); + await monitor.start(); + await errorPromise; + + const errors = storage.getErrors(instanceId); + if (errors.success) { + expect(errors.data.some(e => + e.message.includes('TypeError') || e.message.includes('React') + )).toBe(true); + } + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should distinguish between graceful and forced shutdown', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + // Test graceful shutdown (clean exit) + const monitor1 = createTestMonitor(storage, `${instanceId}-1`, 
'cleanExit', { + autoRestart: false + }); + await monitor1.start(); + await waitForState(monitor1, 'stopped', SHORT_TIMEOUT); + + // Should be stopped, not crashed + expect(monitor1.getState()).toBe('stopped'); + await monitor1.cleanup(); + + // Test forced shutdown (crash) + const monitor2 = createTestMonitor(storage, `${instanceId}-2`, 'crash', { + autoRestart: false + }); + await monitor2.start(); + await waitForState(monitor2, 'crashed', SHORT_TIMEOUT); + + // Should be crashed + expect(monitor2.getState()).toBe('crashed'); + await monitor2.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); +}); + +// ============================================================================ +// MONITOR LOG COHERENCE TESTS +// ============================================================================ + +describe('Monitor Logs: Coherent Log Streams', () => { + it('should include monitor logs for process lifecycle events', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'startup'); + await monitor.start(); + await sleep(500); + await monitor.stop(); + + const logs = await monitor.getAllLogsAndReset(); + + // Should have monitor logs for process creation and start + expect(logs).toContain('[MONITOR]'); + expect(logs).toContain('Starting process'); + expect(logs).toContain('Process started successfully'); + expect(logs).toContain('PID='); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should log crash events and restart attempts coherently', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'crash', { + 
autoRestart: true, + maxRestarts: 2, + restartDelay: 100 + }); + + await monitor.start(); + + // Wait for restart cycles to complete + await sleep(2500); + + const logs = await monitor.getAllLogsAndReset(); + + // Should have multiple lifecycle events documented + expect(logs).toContain('[MONITOR]'); + expect(logs).toContain('Process crashed'); + expect(logs).toContain('Scheduling restart'); + expect(logs).toContain('attempt=1'); + expect(logs).toContain('attempt=2'); + + // Should document max restarts reached + expect(logs).toContain('maxRestarts'); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, STRESS_TIMEOUT); + + it('should maintain coherent log stream across multiple restarts', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'crash', { + autoRestart: true, + maxRestarts: 3, + restartDelay: 50 + }); + + let startCount = 0; + monitor.on('process_started', () => startCount++); + + await monitor.start(); + + // Wait for all restart cycles + await sleep(3000); + + const logs = await monitor.getAllLogsAndReset(); + const logLines = logs.split('\n'); + + // Find all monitor log lines + const monitorLogs = logLines.filter(line => line.includes('[MONITOR]')); + + // Should have at least: start, crash, restart schedule, start, crash... 
for each cycle + expect(monitorLogs.length).toBeGreaterThanOrEqual(8); // At least 2 events per restart cycle + + // Verify chronological order - each "Starting process" should be followed by "Process started" or error + const startingLogs = monitorLogs.filter(l => l.includes('Starting process')); + const startedLogs = monitorLogs.filter(l => l.includes('Process started successfully')); + const crashedLogs = monitorLogs.filter(l => l.includes('Process crashed')); + + // Should have starts + crashes = total runs + expect(startingLogs.length).toBe(startCount); + expect(crashedLogs.length).toBeGreaterThanOrEqual(startCount - 1); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, STRESS_TIMEOUT); + + it('should include runtime duration in exit logs', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'cleanExit', { + autoRestart: false + }); + + await monitor.start(); + await waitForState(monitor, 'stopped', SHORT_TIMEOUT); + + const logs = await monitor.getAllLogsAndReset(); + + // Should log runtime information on exit + expect(logs).toContain('[MONITOR]'); + expect(logs).toContain('runtime='); + expect(logs).toContain('exitCode=0'); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should log stop request and graceful shutdown', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = createTestMonitor(storage, instanceId, 'startup'); + await monitor.start(); + await sleep(300); + await monitor.stop(); + + const logs = await monitor.getAllLogsAndReset(); + + // Should log stop request and clean exit + expect(logs).toContain('[MONITOR]'); + 
expect(logs).toContain('Stop requested'); + // Process exits cleanly via SIGTERM (exit event comes first, marking it as clean exit) + expect(logs).toContain('exited cleanly'); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); +}); + +// ============================================================================ +// PORT HEALTH CHECK TESTS +// ============================================================================ + +describe('Health Check: Port Monitoring', () => { + // Mock script that binds to a port (using Bun's built-in server) + const createPortBindScript = (port: number) => ` + const pinoLog = (level, msg) => console.log(JSON.stringify({ + level, msg, time: Date.now() + })); + + pinoLog(30, 'Starting server...'); + + Bun.serve({ + port: ${port}, + fetch(req) { + return new Response('OK'); + } + }); + + pinoLog(30, 'Server listening on port ${port}'); + // Keep alive + setInterval(() => {}, 1000); + `; + + // Script that doesn't bind to any port (simulates misconfigured server that hangs) + const noPortBindScript = ` + const pinoLog = (level, msg) => console.log(JSON.stringify({ + level, msg, time: Date.now() + })); + + pinoLog(30, 'Starting but NOT binding to port...'); + + // Just stay alive without producing output (simulates a hung startup) + setInterval(() => {}, 1000); + `; + + it('should detect when expected port is bound', async () => { + const instanceId = getTestInstanceId(); + const testPort = 19000 + Math.floor(Math.random() * 1000); // Random port to avoid conflicts + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = new ProcessMonitor( + { + id: `proc-${instanceId}`, + instanceId, + command: 'bun', + args: ['-e', createPortBindScript(testPort)], + cwd: testDataDir, + restartCount: 0 + }, + storage, + { + ...DEFAULT_MONITORING_OPTIONS, + expectedPort: testPort, + healthCheckInterval: 500, + 
autoRestart: false + } + ); + + await monitor.start(); + + // Wait for port to bind and health check to run + await sleep(1500); + + const logs = await monitor.getAllLogsAndReset(); + + // Should log when port starts accepting connections + expect(logs).toContain('[MONITOR]'); + expect(logs).toContain(`Port ${testPort}`); + expect(logs).toContain('accepting connections'); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should report health issue when expected port is not bound', async () => { + const instanceId = getTestInstanceId(); + const testPort = 19000 + Math.floor(Math.random() * 1000); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + const monitor = new ProcessMonitor( + { + id: `proc-${instanceId}`, + instanceId, + command: 'bun', + args: ['-e', noPortBindScript], + cwd: testDataDir, + restartCount: 0 + }, + storage, + { + ...DEFAULT_MONITORING_OPTIONS, + expectedPort: testPort, + healthCheckInterval: 500, + autoRestart: false + } + ); + + let healthCheckFailed = false; + monitor.on('health_check_failed', () => { + healthCheckFailed = true; + }); + + await monitor.start(); + + // Wait for health check to detect missing port (needs 2x healthCheckInterval with no port) + await sleep(2000); + + // Should have detected the missing port + expect(healthCheckFailed).toBe(true); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should include expectedPort in startup log', async () => { + const instanceId = getTestInstanceId(); + const testPort = 19000 + Math.floor(Math.random() * 1000); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + // Create monitor with expectedPort option + const monitor = new ProcessMonitor( + { + id: `proc-${instanceId}`, + instanceId, + command: 'bun', + 
args: ['-e', MOCK_SCRIPTS.startup], + cwd: testDataDir, + restartCount: 0 + }, + storage, + { + ...DEFAULT_MONITORING_OPTIONS, + expectedPort: testPort, + healthCheckInterval: 1000, + autoRestart: false + } + ); + + await monitor.start(); + await sleep(500); + + const logs = await monitor.getAllLogsAndReset(); + + // Should include port info in startup log + expect(logs).toContain('[MONITOR]'); + expect(logs).toContain(`expectedPort=${testPort}`); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); + + it('should check PID liveness during health check', async () => { + const instanceId = getTestInstanceId(); + const storage = new StorageManager( + join(testDataDir, `${instanceId}-errors.db`), + join(testDataDir, `${instanceId}-logs.db`) + ); + + try { + // Use hang script which produces no output - this will trigger health check + const monitor = createTestMonitor(storage, instanceId, 'hang', { + healthCheckInterval: 300, + autoRestart: false + }); + + let healthCheckEvents: MonitoringEvent[] = []; + monitor.on('health_check_failed', (event) => { + healthCheckEvents.push(event); + }); + + await monitor.start(); + + // Wait for health checks to run + await sleep(1500); + + // Health check should have run (due to inactivity) but PID should still be alive + // So we shouldn't get "PID not responding" - just inactivity warnings + const hasInactivityWarning = healthCheckEvents.length > 0; + expect(hasInactivityWarning).toBe(true); + + await monitor.cleanup(); + } finally { + storage.close(); + } + }, TEST_TIMEOUT); +}); + +// ============================================================================ +// SETUP AND TEARDOWN +// ============================================================================ + +beforeAll(async () => { + testDataDir = await createTempDataDir(); + // Set environment for tests + process.env.CLI_DATA_DIR = testDataDir; +}); + +afterAll(async () => { + await cleanupTempDir(testDataDir); +}); + +afterEach(async () 
=> { + // Small delay between tests for cleanup + await sleep(100); +}); diff --git a/container/process-monitor.ts b/container/process-monitor.ts index 5109b11d..e650cf42 100644 --- a/container/process-monitor.ts +++ b/container/process-monitor.ts @@ -1,24 +1,175 @@ import { EventEmitter } from 'events'; -import { spawn, ChildProcess } from 'child_process'; +import { spawn, ChildProcess, execSync } from 'child_process'; import { StorageManager } from './storage.js'; import { promises as fs } from 'fs'; import { join } from 'path'; -import { - ProcessInfo, - ProcessState, +import { StringDecoder } from 'string_decoder'; +import { randomUUID } from 'crypto'; +import { + ProcessInfo, + ProcessState, MonitoringOptions, MonitoringEvent, LogLine, Result, SimpleError, - getDataDirectory + getDataDirectory, + DEFAULT_MONITORING_OPTIONS } from './types.js'; + +// Type for merged options (all required except expectedPort which is optional) +type ResolvedMonitoringOptions = Omit, 'expectedPort'> & { expectedPort?: number }; + +/** + * Simple mutex for preventing concurrent file operations. + * Uses a promise queue to ensure only one operation runs at a time. + */ +class SimpleMutex { + private locked = false; + private queue: Array<() => void> = []; + + async acquire(): Promise { + return new Promise((resolve) => { + if (!this.locked) { + this.locked = true; + resolve(); + } else { + this.queue.push(resolve); + } + }); + } + + release(): void { + const next = this.queue.shift(); + if (next) { + next(); + } else { + this.locked = false; + } + } + + async tryAcquire(): Promise { + if (!this.locked) { + this.locked = true; + return true; + } + return false; + } +} + +/** + * Circular buffer for O(1) push operations with fixed capacity. + * Replaces array + shift() pattern which is O(n). 
+ */ +class CircularBuffer { + private buffer: (T | undefined)[]; + private head = 0; + private tail = 0; + private size = 0; + + constructor(private capacity: number) { + this.buffer = new Array(capacity); + } + + push(item: T): void { + this.buffer[this.tail] = item; + this.tail = (this.tail + 1) % this.capacity; + if (this.size < this.capacity) { + this.size++; + } else { + this.head = (this.head + 1) % this.capacity; + } + } + + toArray(): T[] { + const result: T[] = []; + for (let i = 0; i < this.size; i++) { + result.push(this.buffer[(this.head + i) % this.capacity] as T); + } + return result; + } + + slice(start?: number): T[] { + const arr = this.toArray(); + return start !== undefined ? arr.slice(start) : arr; + } + + clear(): void { + this.head = this.tail = this.size = 0; + this.buffer = new Array(this.capacity); + } + + get length(): number { + return this.size; + } +} +/** + * File-based locking for cross-process coordination. + * Both SimpleLogManager and CLI commands use this for mutual exclusion. 
+ */ +class FileLock { + private lockFilePath: string; + private acquired = false; + private static readonly MAX_RETRIES = 10; + private static readonly RETRY_DELAY_MS = 50; + private static readonly LOCK_STALE_MS = 30000; // 30 seconds + + constructor(filePath: string) { + this.lockFilePath = `${filePath}.lock`; + } + + async acquire(): Promise { + for (let attempt = 0; attempt < FileLock.MAX_RETRIES; attempt++) { + try { + // Try to create lock file exclusively + await fs.writeFile(this.lockFilePath, `${process.pid}:${Date.now()}`, { flag: 'wx' }); + this.acquired = true; + return true; + } catch (error: unknown) { + const fsError = error as { code?: string }; + if (fsError?.code === 'EEXIST') { + // Lock exists - check if stale + try { + const content = await fs.readFile(this.lockFilePath, 'utf8'); + const [, timestamp] = content.split(':'); + const lockTime = parseInt(timestamp, 10); + if (Date.now() - lockTime > FileLock.LOCK_STALE_MS) { + // Stale lock - remove and retry + await fs.unlink(this.lockFilePath).catch(() => {}); + continue; + } + } catch { + // Can't read lock file - try again + } + // Wait and retry + await new Promise(resolve => setTimeout(resolve, FileLock.RETRY_DELAY_MS + Math.random() * FileLock.RETRY_DELAY_MS)); + } else { + throw error; + } + } + } + return false; + } + + async release(): Promise { + if (!this.acquired) return; + try { + await fs.unlink(this.lockFilePath); + } catch { + // Ignore - lock file may already be gone + } + this.acquired = false; + } +} + class SimpleLogManager { private logFilePath: string; private maxLines: number; private maxFileSize: number; // in bytes private appendCount = 0; private static readonly CHECK_INTERVAL = 100; // Check file size every 100 appends + private static readonly FILE_SIZE_CHECK_THRESHOLD = 50000; // bytes + private readonly trimMutex = new SimpleMutex(); constructor(instanceId: string, maxLines: number = 1000, maxFileSize: number = 1024 * 1024) { // 1MB default this.logFilePath = 
join(getDataDirectory(), `${instanceId}-process.log`); @@ -26,52 +177,92 @@ class SimpleLogManager { this.maxFileSize = maxFileSize; } - async appendLog(content: string, stream: 'stdout' | 'stderr'): Promise { + async appendLog(content: string, stream: 'stdout' | 'stderr' | 'monitor'): Promise { try { const timestamp = new Date().toISOString(); const logLine = `[${timestamp}] [${stream}] ${content}\n`; - + await fs.appendFile(this.logFilePath, logLine, 'utf8'); - + // Only check file size periodically to reduce I/O overhead - if (++this.appendCount % SimpleLogManager.CHECK_INTERVAL === 0) { - await this.trimLogIfNeeded(); + // Note: appendCount increment is not atomic, but that's acceptable for + // periodic trimming - worst case we trim slightly more/less often + const currentCount = ++this.appendCount; + if (currentCount % SimpleLogManager.CHECK_INTERVAL === 0) { + // Fire and forget - don't await to avoid blocking appends + this.trimLogIfNeeded().catch((err) => { + console.warn('Background trim failed:', err); + }); } } catch (error) { console.warn('Failed to append to log file:', error); } } + /** + * Append a monitor-internal log entry. + * These are distinguished from child process output by the [monitor] stream tag. 
+ */ + async appendMonitorLog(message: string): Promise { + return this.appendLog(`[MONITOR] ${message}`, 'monitor'); + } + async getAllLogsAndReset(): Promise { + const fileLock = new FileLock(this.logFilePath); + const lockAcquired = await fileLock.acquire(); + + if (!lockAcquired) { + console.warn('Could not acquire file lock for log reset'); + return ''; + } + try { - const tempPath = `${this.logFilePath}.tmp.${Date.now()}`; - + // Use UUID + PID + timestamp for guaranteed unique temp file + const tempPath = `${this.logFilePath}.tmp.${randomUUID()}.${process.pid}`; + try { await fs.rename(this.logFilePath, tempPath); - } catch (error: any) { - if (error?.code === 'ENOENT') { + } catch (error: unknown) { + const fsError = error as { code?: string }; + if (fsError?.code === 'ENOENT') { return ''; } throw error; } - - await fs.writeFile(this.logFilePath, '', 'utf8').catch(() => {}); - + + // Create new empty log file + // Use 'wx' flag to fail if file exists (created by concurrent append) + try { + await fs.writeFile(this.logFilePath, '', { flag: 'wx' }); + } catch { + // File was created by concurrent append between rename and writeFile + // That's fine - the new file already has fresh data + } + try { const logs = await fs.readFile(tempPath, 'utf8'); await fs.unlink(tempPath).catch(() => {}); return logs; } catch (error) { + // Attempt cleanup of temp file even if read failed await fs.unlink(tempPath).catch(() => {}); + console.warn('Failed to read temp log file:', error); return ''; } } catch (error) { console.warn('Failed to read/reset log file:', error); return ''; + } finally { + await fileLock.release(); } } private async trimLogIfNeeded(): Promise { + // Only allow one trim operation at a time + if (!await this.trimMutex.tryAcquire()) { + return; // Another trim in progress, skip + } + try { const stats = await fs.stat(this.logFilePath).catch(() => null); if (!stats) return; @@ -81,57 +272,61 @@ class SimpleLogManager { return; } - if (stats.size > 50000) 
{ + if (stats.size > SimpleLogManager.FILE_SIZE_CHECK_THRESHOLD) { const content = await fs.readFile(this.logFilePath, 'utf8'); const lines = content.split('\n'); - + if (lines.length > this.maxLines) { await this.trimLogFile(); } } } catch (error) { console.warn('Failed to check/trim log file:', error); + } finally { + this.trimMutex.release(); } } private async trimLogFile(): Promise { + const tempPath = `${this.logFilePath}.trim.${randomUUID()}`; + try { const content = await fs.readFile(this.logFilePath, 'utf8'); const lines = content.split('\n'); - + const keepLines = Math.floor(this.maxLines * 0.7); const trimmedContent = lines.slice(-keepLines).join('\n'); - - await fs.writeFile(this.logFilePath, trimmedContent, 'utf8'); + + // Write to temp file first, then atomic rename + await fs.writeFile(tempPath, trimmedContent, 'utf8'); + await fs.rename(tempPath, this.logFilePath); } catch (error) { console.warn('Failed to trim log file:', error); + // Clean up temp file if it exists + await fs.unlink(tempPath).catch(() => {}); } } async cleanup(): Promise { + const fileLock = new FileLock(this.logFilePath); + const lockAcquired = await fileLock.acquire(); + try { await fs.unlink(this.logFilePath).catch(() => {}); } catch (error) { console.warn('Failed to cleanup log file:', error); + } finally { + if (lockAcquired) { + await fileLock.release(); + } } } } -const DEFAULT_MONITORING_OPTIONS: Required = { - autoRestart: true, - maxRestarts: 3, - restartDelay: 1000, - healthCheckInterval: 30000, - errorBufferSize: 100, - enableMetrics: false, - env: {}, - killTimeout: 10000 -}; - export class ProcessMonitor extends EventEmitter { private processInfo: ProcessInfo; private childProcess?: ChildProcess; - private options: Required; + private options: ResolvedMonitoringOptions; private storage: StorageManager; private simpleLogManager: SimpleLogManager; private state: ProcessState = 'stopped'; @@ -139,7 +334,22 @@ export class ProcessMonitor extends EventEmitter { private 
restartTimer?: NodeJS.Timeout; private healthCheckTimer?: NodeJS.Timeout; private lastActivity = new Date(); - private logBuffer: LogLine[] = []; + private logBuffer!: CircularBuffer; + + // Stream buffering for incomplete lines (Fix H1) + private stdoutBuffer = ''; + private stderrBuffer = ''; + private stdoutDecoder = new StringDecoder('utf8'); + private stderrDecoder = new StringDecoder('utf8'); + + // Stability tracking for restart counter reset (Fix C7) + private lastSuccessfulStart?: Date; + private static readonly STABILITY_THRESHOLD_MS = 5 * 60 * 1000; // 5 minutes of stable run resets counter + private static readonly INACTIVITY_THRESHOLD_MS = 30000; // 30 seconds of inactivity suggests process died + + // Port health check tracking + private portBindConfirmed = false; + private lastPortCheckTime?: Date; constructor( processInfo: ProcessInfo, @@ -147,23 +357,65 @@ export class ProcessMonitor extends EventEmitter { options: MonitoringOptions = {} ) { super(); - + + // Validate required fields (Fix M5) + if (!processInfo.command?.trim()) { + throw new Error('ProcessInfo.command is required'); + } + if (!processInfo.instanceId?.trim()) { + throw new Error('ProcessInfo.instanceId is required'); + } + this.processInfo = { ...processInfo }; - this.options = { ...DEFAULT_MONITORING_OPTIONS, ...options } as Required; + this.options = { ...DEFAULT_MONITORING_OPTIONS, ...options } as ResolvedMonitoringOptions; this.storage = storage; this.simpleLogManager = new SimpleLogManager(this.processInfo.instanceId); + this.logBuffer = new CircularBuffer(this.options.errorBufferSize); - this.startHealthMonitoring(); + // Note: Health monitoring starts when process starts (Fix H2) } public async start(): Promise> { try { - if (this.state === 'running') { - return { success: false, error: new Error('Process is already running') }; + // Fix H5: Check for transition states, not just running + if (this.state !== 'stopped' && this.state !== 'crashed') { + return { + success: false, 
+ error: new Error(`Cannot start: process is in '${this.state}' state`) + }; } this.setState('starting'); + // Log monitor event: Process creation attempt + const fullCommand = `${this.processInfo.command} ${this.processInfo.args?.join(' ') || ''}`.trim(); + await this.simpleLogManager.appendMonitorLog( + `Starting process: command="${fullCommand}", cwd="${this.processInfo.cwd}", instanceId="${this.processInfo.instanceId}", restartCount=${this.restartCount}` + ).catch(() => {}); + + // Fix H3: Clear buffers on restart + this.logBuffer.clear(); + this.stdoutBuffer = ''; + this.stderrBuffer = ''; + this.stdoutDecoder = new StringDecoder('utf8'); + this.stderrDecoder = new StringDecoder('utf8'); + + // Reset port confirmation on new start + this.portBindConfirmed = false; + this.lastPortCheckTime = undefined; + + // Fix C7: Reset restart count if previous run was stable + if (this.lastSuccessfulStart && this.processInfo.endTime) { + const runDuration = this.processInfo.endTime.getTime() - this.lastSuccessfulStart.getTime(); + if (runDuration > ProcessMonitor.STABILITY_THRESHOLD_MS) { + console.log(`Previous run was stable (${Math.round(runDuration / 1000)}s), resetting restart count`); + await this.simpleLogManager.appendMonitorLog( + `Previous run was stable (${Math.round(runDuration / 1000)}s), resetting restart counter from ${this.restartCount} to 0` + ).catch(() => {}); + this.restartCount = 0; + } + } + this.childProcess = spawn(this.processInfo.command, this.processInfo.args || [], { cwd: this.processInfo.cwd, env: { ...process.env, ...this.options.env }, @@ -172,12 +424,11 @@ export class ProcessMonitor extends EventEmitter { shell: false // Don't use shell to avoid escaping issues }); - this.processInfo.pid = this.childProcess.pid; - if (!this.childProcess.pid) { + await this.simpleLogManager.appendMonitorLog(`Process spawn failed: no PID assigned`).catch(() => {}); throw new Error('Failed to start process - no PID assigned'); } - + // Update process info 
this.processInfo = { ...this.processInfo, @@ -188,13 +439,24 @@ export class ProcessMonitor extends EventEmitter { status: 'running' }; + // Fix C2: Clean up old listeners before setting up new ones + this.cleanupChildProcessListeners(); + this.setupProcessMonitoring(); this.setupStreamMonitoring(); + // Fix H2: Start health monitoring when process starts + this.startHealthMonitoring(); + this.setState('running'); this.lastActivity = new Date(); + this.lastSuccessfulStart = new Date(); - await this.simpleLogManager.appendLog(`Process started: ${this.processInfo.command} ${this.processInfo.args?.join(' ') || ''}`, 'stdout').catch(() => {}); + // Log monitor event: Process started successfully + const portInfo = this.options.expectedPort ? `, expectedPort=${this.options.expectedPort}` : ''; + await this.simpleLogManager.appendMonitorLog( + `Process started successfully: PID=${this.processInfo.pid}${portInfo}` + ).catch(() => {}); this.emit('process_started', { type: 'process_started', @@ -206,48 +468,104 @@ export class ProcessMonitor extends EventEmitter { console.log(`Process started: ${this.processInfo.command}`); - return { - success: true, - data: this.processInfo + return { + success: true, + data: this.processInfo }; } catch (error) { this.setState('stopped'); const errorMessage = error instanceof Error ? 
error.message : 'Failed to start process'; console.error(`Failed to start process: ${errorMessage}`); - - return { - success: false, - error: new Error(errorMessage) + + // Log monitor event: Process start failed + await this.simpleLogManager.appendMonitorLog( + `Process start failed: ${errorMessage}` + ).catch(() => {}); + + return { + success: false, + error: new Error(errorMessage) }; } } + /** + * Fix C2: Remove all event listeners from child process to prevent accumulation + */ + private cleanupChildProcessListeners(): void { + if (!this.childProcess) return; + + this.childProcess.removeAllListeners('exit'); + this.childProcess.removeAllListeners('error'); + this.childProcess.removeAllListeners('spawn'); + this.childProcess.stdout?.removeAllListeners('data'); + this.childProcess.stderr?.removeAllListeners('data'); + } + public async stop(): Promise> { try { if (this.state === 'stopped') { return { success: true, data: undefined }; } + // Fix H5: Prevent stop during start + if (this.state === 'starting') { + return { + success: false, + error: new Error('Cannot stop: process is still starting') + }; + } + + // Log monitor event: Stop requested + await this.simpleLogManager.appendMonitorLog( + `Stop requested: PID=${this.processInfo.pid}, state=${this.state}` + ).catch(() => {}); + this.setState('stopping'); + // Clear restart timer if pending if (this.restartTimer) { clearTimeout(this.restartTimer); this.restartTimer = undefined; + await this.simpleLogManager.appendMonitorLog(`Cancelled pending restart`).catch(() => {}); } - if (this.childProcess && !this.childProcess.killed) { + // Fix H2: Clear health timer on stop + if (this.healthCheckTimer) { + clearInterval(this.healthCheckTimer); + this.healthCheckTimer = undefined; + } + + // Flush any remaining stream buffers + this.flushStreamBuffers(); + + if (this.childProcess) { await this.killProcess(false); } - this.setState('stopped'); + // Fix C6: Do NOT emit process_stopped here - the exit handler will emit it 
+ // This prevents double emission when killing a running process + // If the process was already dead, killProcess returns immediately and + // the exit handler has already fired - // Emit stop event - this.emit('process_stopped', { - type: 'process_stopped', - processId: this.processInfo.id, - instanceId: this.processInfo.instanceId, - timestamp: new Date() - } as MonitoringEvent); + // Only set state if exit handler didn't already do it + if (this.state === 'stopping') { + this.setState('stopped'); + + // Log monitor event: Process stopped successfully + await this.simpleLogManager.appendMonitorLog( + `Process stopped gracefully` + ).catch(() => {}); + + // Emit event only if we're the ones transitioning to stopped + // (process was already dead before we called killProcess) + this.emit('process_stopped', { + type: 'process_stopped', + processId: this.processInfo.id, + instanceId: this.processInfo.instanceId, + timestamp: new Date() + } as MonitoringEvent); + } console.log(`Process stopped: ${this.processInfo.command}`); @@ -255,85 +573,156 @@ export class ProcessMonitor extends EventEmitter { } catch (error) { const errorMessage = error instanceof Error ? 
error.message : 'Failed to stop process'; console.error(`Failed to stop process: ${errorMessage}`); - - return { - success: false, - error: new Error(errorMessage) + + // Log monitor event: Stop failed + await this.simpleLogManager.appendMonitorLog( + `Stop failed: ${errorMessage}` + ).catch(() => {}); + + return { + success: false, + error: new Error(errorMessage) }; } } + /** + * Flush any incomplete lines remaining in stream buffers + */ + private flushStreamBuffers(): void { + if (this.stdoutBuffer.trim()) { + this.simpleLogManager.appendLog(this.stdoutBuffer.trim(), 'stdout').catch(() => {}); + const logLine: LogLine = { + content: this.stdoutBuffer.trim(), + timestamp: new Date(), + stream: 'stdout', + processId: this.processInfo.id + }; + this.logBuffer.push(logLine); + } + + if (this.stderrBuffer.trim()) { + this.simpleLogManager.appendLog(this.stderrBuffer.trim(), 'stderr').catch(() => {}); + const logLine: LogLine = { + content: this.stderrBuffer.trim(), + timestamp: new Date(), + stream: 'stderr', + processId: this.processInfo.id + }; + this.logBuffer.push(logLine); + } + + this.stdoutBuffer = ''; + this.stderrBuffer = ''; + } + private setupProcessMonitoring(): void { if (!this.childProcess) return; this.childProcess.on('exit', (code, signal) => { + // Capture state BEFORE any transitions + const wasRunning = this.state === 'running'; + const wasStopping = this.state === 'stopping'; + const runDuration = this.processInfo.startTime + ? Math.round((Date.now() - this.processInfo.startTime.getTime()) / 1000) + : 0; + // Update process info this.processInfo = { ...this.processInfo, exitCode: code ?? undefined, endTime: new Date() }; - - const wasUnexpected = this.state === 'running'; - const wasStopping = this.state === 'stopping'; - - this.setState('stopped'); - this.simpleLogManager.appendLog(`Process exited with code ${code}${signal ? 
` (signal: ${signal})` : ''}`, 'stdout').catch(() => {}); + // Log monitor event: Process exited + this.simpleLogManager.appendMonitorLog( + `Process exited: code=${code}, signal=${signal ?? 'none'}, runtime=${runDuration}s, wasRunning=${wasRunning}, wasStopping=${wasStopping}` + ).catch(() => {}); - // Emit stop event - this.emit('process_stopped', { - type: 'process_stopped', - processId: this.processInfo.id, - instanceId: this.processInfo.instanceId, - exitCode: code, - reason: signal ? `Signal: ${signal}` : `Exit code: ${code}`, - timestamp: new Date() - } as MonitoringEvent); + // Fix C3: Determine final state BEFORE transitioning (no stopped -> crashed) + const shouldRestart = wasRunning && this.options.autoRestart && + this.shouldRestartAfterExit(code, signal, wasStopping); + const isUnexpectedCrash = wasRunning && (code !== 0 || shouldRestart); - if (wasUnexpected) { + // Single state transition - either crashed or stopped, never both + if (isUnexpectedCrash) { console.log(`Process exited unexpectedly: code=${code}, signal=${signal}`); - - const shouldRestart = this.options.autoRestart && this.shouldRestartAfterExit(code, signal, wasStopping); - - if (code !== 0 || shouldRestart) { - this.setState('crashed'); - - this.emit('process_crashed', { - type: 'process_crashed', + + // Log monitor event: Process crashed + this.simpleLogManager.appendMonitorLog( + `Process crashed: exitCode=${code}, signal=${signal ?? 
'none'}, willRestart=${shouldRestart}, restartCount=${this.restartCount}/${this.options.maxRestarts}` + ).catch(() => {}); + + this.setState('crashed'); + + this.emit('process_crashed', { + type: 'process_crashed', + processId: this.processInfo.id, + instanceId: this.processInfo.instanceId, + exitCode: code, + signal: signal, + willRestart: shouldRestart, + timestamp: new Date() + } as MonitoringEvent); + + if (shouldRestart) { + this.scheduleRestart(); + } else { + // Log monitor event: Max restarts reached + this.simpleLogManager.appendMonitorLog( + `Process will not restart: maxRestarts (${this.options.maxRestarts}) reached` + ).catch(() => {}); + } + } else { + // Clean exit or intentional stop - only emit if not already stopped + // (stop() method may have already set state and emitted) + if (this.state !== 'stopped') { + // Log monitor event: Clean exit + this.simpleLogManager.appendMonitorLog( + `Process exited cleanly: exitCode=${code}, runtime=${runDuration}s` + ).catch(() => {}); + + this.setState('stopped'); + this.emit('process_stopped', { + type: 'process_stopped', processId: this.processInfo.id, instanceId: this.processInfo.instanceId, exitCode: code, - signal: signal, - willRestart: shouldRestart, + reason: signal ? 
`Signal: ${signal}` : `Exit code: ${code}`, timestamp: new Date() } as MonitoringEvent); - - if (shouldRestart) { - this.scheduleRestart(); - } } } }); this.childProcess.on('error', (error) => { console.error(`Process ${this.processInfo.id} error:`, error); - + this.processInfo = { ...this.processInfo, lastError: error.message }; - this.setState('crashed'); + + // Log monitor event: Process error + this.simpleLogManager.appendMonitorLog( + `Process spawn/runtime error: ${error.message}` + ).catch(() => {}); + + // Only transition if we're still in a running-ish state + // (error event usually precedes exit event) + if (this.state === 'running' || this.state === 'starting') { + this.setState('crashed'); + } this.simpleLogManager.appendLog(`Process error: ${error.message}`, 'stderr').catch(() => {}); - + const simpleError: SimpleError = { timestamp: new Date().toISOString(), level: 60, // fatal message: `Process error: ${error.message}`, rawOutput: error.stack || error.message }; - + this.storage.storeError( this.processInfo.instanceId, this.processInfo.id, @@ -354,12 +743,29 @@ export class ProcessMonitor extends EventEmitter { }); } + /** + * Fix H1: Process stream data with proper line buffering + * Handles incomplete lines that span multiple data chunks and + * multi-byte UTF-8 characters that might be split across chunks + */ private processStreamData(data: Buffer, stream: 'stdout' | 'stderr'): void { - const content = data.toString('utf8'); + // Use StringDecoder to properly handle multi-byte UTF-8 characters + const decoder = stream === 'stdout' ? this.stdoutDecoder : this.stderrDecoder; + const bufferKey = stream === 'stdout' ? 
'stdoutBuffer' : 'stderrBuffer'; + + // Decode buffer (handles partial UTF-8 sequences correctly) + const decoded = decoder.write(data); + + // Combine with any incomplete line from previous chunk + const content = this[bufferKey] + decoded; const lines = content.split('\n'); this.lastActivity = new Date(); + // The last element might be incomplete - save for next chunk + this[bufferKey] = lines.pop() || ''; + + // Process complete lines for (const line of lines) { const trimmedLine = line.trim(); if (!trimmedLine) continue; @@ -372,11 +778,8 @@ export class ProcessMonitor extends EventEmitter { stream, processId: this.processInfo.id }; + // CircularBuffer automatically handles capacity - O(1) push this.logBuffer.push(logLine); - - if (this.logBuffer.length > this.options.errorBufferSize) { - this.logBuffer.shift(); - } this.parseJsonLog(trimmedLine); } @@ -387,10 +790,10 @@ export class ProcessMonitor extends EventEmitter { if (!line.startsWith('{')) return; const logData = JSON.parse(line); - + if (logData.level && logData.level >= 50) { const message = logData.msg || 'Unknown error'; - + const simpleError: SimpleError = { timestamp: logData.time ? new Date(logData.time).toISOString() : new Date().toISOString(), level: logData.level, @@ -406,7 +809,7 @@ export class ProcessMonitor extends EventEmitter { if (storeResult.success) { console.log(`Error detected (level ${logData.level}): ${message.substring(0, 100)}...`); - + // Emit error event this.emit('error_detected', { type: 'error_detected', @@ -422,6 +825,11 @@ export class ProcessMonitor extends EventEmitter { } } } catch (e) { + // Non-JSON lines are common (plain text logs from Vite, etc.) 
+ // Only log if line looked like JSON but failed to parse + if (line.startsWith('{') && line.endsWith('}')) { + console.debug(`Failed to parse JSON log line: ${line.substring(0, 100)}...`); + } } } @@ -442,9 +850,19 @@ export class ProcessMonitor extends EventEmitter { return fatalPatterns.some(pattern => pattern.test(message)); } + /** + * Fix H4: Only kill process if we're in running state + * Prevents interference with graceful shutdown + */ private handleFatalError(error: SimpleError): void { console.error(`Fatal error detected: ${error.message}`); - + + // Don't interfere if we're already stopping or stopped + if (this.state !== 'running') { + console.log('Ignoring fatal error - process is not in running state'); + return; + } + if (this.childProcess && !this.childProcess.killed) { console.log('Killing process due to fatal error...'); this.childProcess.kill('SIGTERM'); @@ -469,12 +887,12 @@ export class ProcessMonitor extends EventEmitter { if (exitCode === 0) { const timeSinceLastActivity = Date.now() - this.lastActivity.getTime(); - - if (timeSinceLastActivity > 30000) { // More than 30 seconds + + if (timeSinceLastActivity > ProcessMonitor.INACTIVITY_THRESHOLD_MS) { console.log(`Process exited with code 0 but was unresponsive for ${Math.round(timeSinceLastActivity/1000)}s, assuming killed, will restart`); return true; } - + console.log('Process exited cleanly with code 0, not restarting'); return false; } @@ -487,83 +905,293 @@ export class ProcessMonitor extends EventEmitter { return false; } + /** + * Fix H6: Prevent timer overwrite race condition + */ private scheduleRestart(): void { + // Clear any existing restart timer to prevent duplicates + if (this.restartTimer) { + clearTimeout(this.restartTimer); + this.restartTimer = undefined; + } + this.restartCount++; - + console.log(`Scheduling restart ${this.restartCount}/${this.options.maxRestarts} in ${this.options.restartDelay}ms...`); - + + // Log monitor event: Scheduling restart + 
this.simpleLogManager.appendMonitorLog( + `Scheduling restart: attempt=${this.restartCount}/${this.options.maxRestarts}, delay=${this.options.restartDelay}ms` + ).catch(() => {}); + this.restartTimer = setTimeout(async () => { + this.restartTimer = undefined; + + // Check if stop() was called during the delay + if (this.state === 'stopped' || this.state === 'stopping') { + console.log('Restart cancelled - process was stopped during delay'); + // Log monitor event: Restart cancelled + this.simpleLogManager.appendMonitorLog( + `Restart cancelled: process was stopped during delay` + ).catch(() => {}); + return; + } + console.log(`Restarting process (attempt ${this.restartCount}/${this.options.maxRestarts})...`); - + + // Log monitor event: Restarting now + await this.simpleLogManager.appendMonitorLog( + `Restarting process now: attempt=${this.restartCount}/${this.options.maxRestarts}` + ).catch(() => {}); + const result = await this.start(); - if (!result.success) { - console.error(`Failed to restart process: ${result.error?.message}`); - + if (!result.success && 'error' in result) { + const errorMessage = result.error.message ?? 'Unknown error'; + console.error(`Failed to restart process: ${errorMessage}`); + + // Log monitor event: Restart failed + await this.simpleLogManager.appendMonitorLog( + `Restart failed: attempt=${this.restartCount}, error="${errorMessage}"` + ).catch(() => {}); + // Emit restart failed event this.emit('restart_failed', { type: 'restart_failed', processId: this.processInfo.id, instanceId: this.processInfo.instanceId, attempt: this.restartCount, - error: result.error?.message, + error: errorMessage, timestamp: new Date() } as MonitoringEvent); } }, this.options.restartDelay); } + /** + * Fix C1: Prevent deadlock when process already exited + * + * The original implementation could deadlock because: + * 1. Process exits naturally before stop() is called + * 2. childProcess.killed is false (we didn't kill it) + * 3. 
We register exit listener, but event already fired + * 4. We call kill() on dead process + * 5. Exit event never fires again - promise never resolves + * + * This fix checks process.exitCode/signalCode to detect already-exited processes. + */ private async killProcess(force: boolean = false): Promise { - if (!this.childProcess || this.childProcess.killed) return; + if (!this.childProcess) { + return; + } + + // Check if process already exited (exitCode or signalCode is set) + if (this.childProcess.exitCode !== null || this.childProcess.signalCode !== null) { + console.log('Process already exited, no kill needed'); + return; + } return new Promise((resolve) => { const killTimeout = this.options.killTimeout || 10000; - - if (force) { - this.childProcess!.kill('SIGKILL'); + let resolved = false; + + const cleanup = () => { + if (resolved) return; + resolved = true; + clearTimeout(timeout); + this.childProcess?.removeListener('exit', onExit); + }; + + const onExit = () => { + cleanup(); + resolve(); + }; + + // Register exit listener BEFORE checking state again + this.childProcess!.once('exit', onExit); + + // Double-check after registering listener (closes race window) + if (this.childProcess!.exitCode !== null || this.childProcess!.signalCode !== null) { + cleanup(); resolve(); return; } - + + // Set up timeout for graceful shutdown const timeout = setTimeout(() => { - if (this.childProcess && !this.childProcess.killed) { + if (resolved) return; + + // Check if process exited during timeout + if (this.childProcess && this.childProcess.exitCode === null) { console.log('Process did not exit gracefully, force killing...'); - this.childProcess.kill('SIGKILL'); + try { + this.childProcess.kill('SIGKILL'); + } catch (e) { + // Process may have exited between check and kill + console.log('SIGKILL failed - process likely already exited'); + } } - resolve(); - }, killTimeout); - this.childProcess!.once('exit', () => { - clearTimeout(timeout); + cleanup(); resolve(); - }); + 
}, force ? 0 : killTimeout); - this.childProcess!.kill('SIGTERM'); + // Send the kill signal + const signal = force ? 'SIGKILL' : 'SIGTERM'; + try { + const signalSent = this.childProcess!.kill(signal); + if (!signalSent) { + // kill() returns false if process already dead (no such process) + console.log(`${signal} signal not sent - process already dead`); + cleanup(); + resolve(); + } + } catch (e) { + // ESRCH = no such process + console.log(`Failed to send ${signal} - process likely already exited`); + cleanup(); + resolve(); + } }); } + /** + * Fix H2: Start health monitoring only when process starts, + * prevent duplicate intervals, and clear on stop + */ private startHealthMonitoring(): void { if (this.options.healthCheckInterval <= 0) return; + // Clear existing timer to prevent duplicates + if (this.healthCheckTimer) { + clearInterval(this.healthCheckTimer); + this.healthCheckTimer = undefined; + } + this.healthCheckTimer = setInterval(() => { if (this.state === 'running') { - const now = new Date(); - const timeSinceLastActivity = now.getTime() - this.lastActivity.getTime(); - - if (timeSinceLastActivity > this.options.healthCheckInterval * 2) { - console.warn(`Process appears unresponsive (no activity for ${timeSinceLastActivity}ms)`); - - this.emit('health_check_failed', { - type: 'health_check_failed', - processId: this.processInfo.id, - instanceId: this.processInfo.instanceId, - lastActivity: this.lastActivity, - timestamp: now - } as MonitoringEvent); - } + this.performHealthCheck(); } }, this.options.healthCheckInterval); } + /** + * Perform comprehensive health check: + * 1. Check if PID is still alive + * 2. Check if expected port is bound + * 3. Check for inactivity + */ + private async performHealthCheck(): Promise { + const now = new Date(); + const timeSinceLastActivity = now.getTime() - this.lastActivity.getTime(); + const healthIssues: string[] = []; + + // 1. 
Check if PID is alive + const pidAlive = this.checkPidAlive(); + if (!pidAlive) { + healthIssues.push('PID not responding'); + } + + // 2. Check port if configured + if (this.options.expectedPort) { + const portResult = await this.checkPortBound(this.options.expectedPort); + if (portResult.bound) { + if (!this.portBindConfirmed) { + this.portBindConfirmed = true; + await this.simpleLogManager.appendMonitorLog( + `Port ${this.options.expectedPort} is now accepting connections` + ).catch(() => {}); + } + } else if (this.portBindConfirmed) { + // Port was bound but is no longer + healthIssues.push(`Port ${this.options.expectedPort} no longer accepting connections`); + } else if (timeSinceLastActivity > this.options.healthCheckInterval) { + // Port never bound and process has been running for a while + healthIssues.push(`Port ${this.options.expectedPort} not yet bound after ${Math.round(timeSinceLastActivity / 1000)}s`); + } + this.lastPortCheckTime = now; + } + + // 3. Check for inactivity + if (timeSinceLastActivity > this.options.healthCheckInterval * 2) { + healthIssues.push(`No output for ${Math.round(timeSinceLastActivity / 1000)}s`); + } + + // Report health status + if (healthIssues.length > 0) { + const issueList = healthIssues.join(', '); + console.warn(`Health check warning: ${issueList}`); + + await this.simpleLogManager.appendMonitorLog( + `Health check warning: ${issueList}` + ).catch(() => {}); + + this.emit('health_check_failed', { + type: 'health_check_failed', + processId: this.processInfo.id, + instanceId: this.processInfo.instanceId, + lastActivity: this.lastActivity, + timestamp: now + } as MonitoringEvent); + } + } + + /** + * Check if the process PID is still alive + */ + private checkPidAlive(): boolean { + if (!this.processInfo.pid) return false; + + try { + // Sending signal 0 checks if process exists without actually sending a signal + process.kill(this.processInfo.pid, 0); + return true; + } catch { + // ESRCH means process doesn't exist, 
EPERM means it exists but we can't signal it + return false; + } + } + + /** + * Check if a port is bound using lsof. + * Returns the PID that owns the port if found. + */ + private checkPortBound(port: number): { bound: boolean; pid?: number; error?: string } { + try { + // Use lsof to check if the port is in use + // -i :port - filter by port + // -t - terse output (just PIDs) + // -sTCP:LISTEN - only show listening TCP sockets + const result = execSync(`lsof -i :${port} -t -sTCP:LISTEN 2>/dev/null`, { + encoding: 'utf8', + timeout: 2000 + }).trim(); + + if (result) { + const pids = result.split('\n').map(p => parseInt(p, 10)).filter(p => !isNaN(p)); + if (pids.length > 0) { + // Check if our process owns the port + const ourPid = this.processInfo.pid; + const ownerPid = pids[0]; + return { + bound: true, + pid: ownerPid + }; + } + } + return { bound: false }; + } catch (error) { + // lsof returns non-zero if no matching processes found + // This is expected when port is not bound + const exitError = error as { status?: number; message?: string }; + if (exitError.status === 1) { + // No process found - port not bound + return { bound: false }; + } + // Other error (timeout, lsof not found, etc.) 
+ return { bound: false, error: exitError.message || 'lsof failed' }; + } + } + private setState(newState: ProcessState): void { const oldState = this.state; this.state = newState; @@ -589,7 +1217,8 @@ export class ProcessMonitor extends EventEmitter { } public getRecentLogs(limit: number = 50): LogLine[] { - return this.logBuffer.slice(-limit); + const logs = this.logBuffer.toArray(); + return logs.slice(-limit); } public async getAllLogsAndReset(): Promise { diff --git a/container/storage.ts b/container/storage.ts index 6dc95c12..a66851fd 100644 --- a/container/storage.ts +++ b/container/storage.ts @@ -1,9 +1,10 @@ import { Database } from 'bun:sqlite'; import { createHash } from 'crypto'; -import { +import * as fs from 'fs'; +import * as path from 'path'; +import { SimpleError, StoredError, - ProcessInfo, StoredLog, LogLevel, ErrorSummary, @@ -68,17 +69,14 @@ export class StorageManager { } private ensureDataDirectory(dbPath: string): void { - const fs = require('fs'); - const path = require('path'); const dir = path.dirname(dbPath); - + if (!fs.existsSync(dir)) { fs.mkdirSync(dir, { recursive: true }); } } private initializeDatabase(dbPath: string): Database { - const fs = require('fs'); try { const dbExists = fs.existsSync(dbPath); @@ -103,10 +101,17 @@ export class StorageManager { } } + private maintenanceInterval?: ReturnType; + private setupMaintenanceTasks(): void { - setInterval(() => { - if (this.errorStorage) { - // Maintenance tasks if needed + // Run maintenance every hour - cleanup old data based on retention settings + this.maintenanceInterval = setInterval(() => { + try { + if (this.logStorage) { + this.logStorage.cleanupOldLogs(); + } + } catch (error) { + console.warn('Maintenance task failed:', error); } }, 60 * 60 * 1000); } @@ -138,14 +143,6 @@ export class StorageManager { } } - public storeProcessInfo(processInfo: ProcessInfo): Result { - try { - return { success: true, data: true }; - } catch (error) { - return { success: false, error: 
this.toError(error) }; - } - } - public storeError(instanceId: string, processId: string, error: SimpleError): Result { return this.wrapRetryOperation(() => this.errorStorage.storeError(instanceId, processId, error)); } @@ -195,9 +192,15 @@ export class StorageManager { */ public close(): void { try { + // Clear maintenance interval to prevent memory leaks + if (this.maintenanceInterval) { + clearInterval(this.maintenanceInterval); + this.maintenanceInterval = undefined; + } + this.errorStorage.close(); this.logStorage.close(); - + if (this.errorDb !== this.logDb) { this.logDb.close(); } @@ -211,10 +214,9 @@ export class StorageManager { class ErrorStorage { private db: Database; private options: Required; - + // Prepared statements private insertErrorStmt: ReturnType; - private updateErrorStmt: ReturnType; private selectErrorsStmt: ReturnType; private countErrorsStmt: ReturnType; private deleteErrorsStmt: ReturnType; @@ -269,13 +271,7 @@ class ErrorStorage { instance_id, process_id, error_hash, timestamp, level, message, raw_output ) VALUES (?, ?, ?, ?, ?, ?, ?) `); - - this.updateErrorStmt = this.db.query(` - UPDATE simple_errors - SET occurrence_count = occurrence_count + 1, timestamp = ? - WHERE error_hash = ? AND instance_id = ? - `); - + this.selectErrorsStmt = this.db.query(` WITH deduplicated AS ( SELECT @@ -455,11 +451,10 @@ class ErrorStorage { class LogStorage { private db: Database; private options: Required; - + // Prepared statements private insertLogStmt: ReturnType; private selectLogsStmt: ReturnType; - private selectLogsSinceStmt: ReturnType; private countLogsStmt: ReturnType; private deleteOldLogsStmt: ReturnType; private getLastSequenceStmt: ReturnType; @@ -508,19 +503,12 @@ class LogStorage { `); this.selectLogsStmt = this.db.query(` - SELECT * FROM process_logs + SELECT * FROM process_logs WHERE instance_id = ? ORDER BY sequence DESC LIMIT ? OFFSET ? 
`); - this.selectLogsSinceStmt = this.db.query(` - SELECT * FROM process_logs - WHERE instance_id = ? AND sequence > ? - ORDER BY sequence ASC - LIMIT ? - `); - this.countLogsStmt = this.db.query(` SELECT COUNT(*) as count FROM process_logs WHERE instance_id = ? `); diff --git a/container/types.ts b/container/types.ts index 619dbf26..f5172948 100644 --- a/container/types.ts +++ b/container/types.ts @@ -83,8 +83,7 @@ export const ProcessStateSchema = z.enum([ 'running', 'stopping', 'stopped', - 'crashed', - 'restarting' + 'crashed' ]); export type ProcessState = z.infer; @@ -110,9 +109,9 @@ export interface MonitoringOptions { readonly restartDelay?: number; readonly healthCheckInterval?: number; readonly errorBufferSize?: number; - readonly enableMetrics?: boolean; readonly env?: Record; readonly killTimeout?: number; + readonly expectedPort?: number; // Port the child process should bind to (for health checks) } // ========================================== @@ -294,13 +293,16 @@ export type Result = // CONSTANTS // ========================================== -export const DEFAULT_MONITORING_OPTIONS: MonitoringOptions = { +// Note: expectedPort is optional so we use Omit to exclude it from Required +export const DEFAULT_MONITORING_OPTIONS: Omit, 'expectedPort'> & { expectedPort?: number } = { autoRestart: true, - maxRestarts: 6, + maxRestarts: 3, restartDelay: 1000, - errorBufferSize: 300, - healthCheckInterval: 10000, - enableMetrics: false + errorBufferSize: 100, + healthCheckInterval: 30000, + env: {}, + killTimeout: 10000, + expectedPort: undefined } as const; export const DEFAULT_STORAGE_OPTIONS: ErrorStoreOptions = { diff --git a/package.json b/package.json index 5e2b057f..0c0d6965 100644 --- a/package.json +++ b/package.json @@ -36,7 +36,7 @@ "dependencies": { "@ashishkumar472/cf-git": "1.0.5", "@cloudflare/containers": "^0.0.28", - "@cloudflare/sandbox": "0.5.2", + "@cloudflare/sandbox": "0.5.6", "@noble/ciphers": "^1.3.0", "@octokit/rest": "^22.0.1", 
"@radix-ui/react-accordion": "^1.2.12", diff --git a/worker/agents/core/simpleGeneratorAgent.ts b/worker/agents/core/simpleGeneratorAgent.ts index cf9476e3..c2cc0047 100644 --- a/worker/agents/core/simpleGeneratorAgent.ts +++ b/worker/agents/core/simpleGeneratorAgent.ts @@ -1507,7 +1507,7 @@ export class SimpleCodeGeneratorAgent extends Agent { const match = issue.reason.match(/External package ["'](.+?)["']/); const name = match?.[1]; return (typeof name === 'string' && name.trim().length > 0 && !name.startsWith('@shared')) ? [name] : []; - }); + }).filter((name) => !name.includes('cloudflare:')); if (moduleNames.length > 0) { const installCommands = moduleNames.map(moduleName => `bun install ${moduleName}`); await this.executeCommands(installCommands, false); From c806f9d0db2b28e9ad40ee755084d2c6d508f719 Mon Sep 17 00:00:00 2001 From: Ashish Kumar Singh Date: Fri, 28 Nov 2025 00:41:20 -0500 Subject: [PATCH 5/5] refactor: improve error handling and fallback behavior in agent operations Switch blueprint agent fallback from Gemini 2.5 Flash to OpenAI 5 Mini. Comment out 429 rate limit error throwing in inference core to allow fallback retry logic. Remove null check exceptions in phase generation, phase implementation README generation, code fixer, and user conversation processing - return empty/default values instead of throwing errors. 
Update phase generation to handle null results with empty phase object --- worker/agents/inferutils/config.ts | 2 +- worker/agents/inferutils/core.ts | 7 ++++--- worker/agents/operations/PhaseGeneration.ts | 18 +++++++++++++----- .../agents/operations/PhaseImplementation.ts | 5 ----- worker/agents/operations/PostPhaseCodeFixer.ts | 2 +- .../operations/UserConversationProcessor.ts | 5 ----- 6 files changed, 19 insertions(+), 20 deletions(-) diff --git a/worker/agents/inferutils/config.ts b/worker/agents/inferutils/config.ts index 2e0a97f6..900dc8e5 100644 --- a/worker/agents/inferutils/config.ts +++ b/worker/agents/inferutils/config.ts @@ -76,7 +76,7 @@ const PLATFORM_AGENT_CONFIG: AgentConfig = { reasoning_effort: 'medium', max_tokens: 8000, temperature: 1, - fallbackModel: AIModels.GEMINI_2_5_FLASH, + fallbackModel: AIModels.OPENAI_5_MINI, }, firstPhaseImplementation: { name: AIModels.GEMINI_2_5_PRO, diff --git a/worker/agents/inferutils/core.ts b/worker/agents/inferutils/core.ts index 5fa9fd07..559b7b5b 100644 --- a/worker/agents/inferutils/core.ts +++ b/worker/agents/inferutils/core.ts @@ -614,9 +614,10 @@ export async function infer({ } console.error(`Failed to get inference response from OpenAI: ${error}`); - if ((error instanceof Error && error.message.includes('429')) || (typeof error === 'string' && error.includes('429'))) { - throw new RateLimitExceededError('Rate limit exceeded in LLM calls, Please try again later', RateLimitType.LLM_CALLS); - } + // if ((error instanceof Error && error.message.includes('429')) || (typeof error === 'string' && error.includes('429'))) { + + // throw new RateLimitExceededError('Rate limit exceeded in LLM calls, Please try again later', RateLimitType.LLM_CALLS); + // } throw error; } let toolCalls: ChatCompletionMessageFunctionToolCall[] = []; diff --git a/worker/agents/operations/PhaseGeneration.ts b/worker/agents/operations/PhaseGeneration.ts index ad728af7..e0b6723e 100644 --- a/worker/agents/operations/PhaseGeneration.ts 
+++ b/worker/agents/operations/PhaseGeneration.ts @@ -285,7 +285,7 @@ export class PhaseGenerationOperation extends AgentOperation