From 032e0f456f598139ada9abd8bdc91b1aee6f5ec0 Mon Sep 17 00:00:00 2001 From: Matt Morgan Date: Mon, 25 May 2026 16:23:51 -0700 Subject: [PATCH 1/2] feat(mcp): share one serve --mcp per project across MCP clients (#411) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multiple agents pointing at the same project (interactive Claude Code + worktree + background /loop, parallel sub-agents, etc.) used to spawn one codegraph serve --mcp per client. Each child independently registered its own recursive inotify watch set, held its own SQLite handle, paid its own cold-start indexing, and reconciled its own changes — so N agents on a large monorepo brought the kernel's per-user inotify budget into play (see #276) and caused SQLite lock contention (#238). A new shared-daemon mode collapses this to "pay once": - The first invocation per project root acquires .codegraph/daemon.pid (atomic O_EXCL create) and listens on .codegraph/daemon.sock (Unix socket; named pipe on Windows). - Subsequent invocations connect to the socket, verify a same-version hello line, and run as a near-transparent stdio↔socket proxy — no watcher, no DB open, no parser warm-up of their own. - The daemon refcounts connected clients (its stdio launcher plus any socket-attached proxies). When the last client disconnects it lingers for CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS (default 300s) so back-to-back sessions don't repay startup. - Stale lockfiles (dead pid / unbound socket) are detected and cleared; a version mismatch in the hello line makes the proxy fall back to direct mode rather than risk wire incompatibility. - CODEGRAPH_NO_DAEMON=1 opts out entirely (per-client behavior, useful for debugging). Implementation is layered to keep the protocol logic identical between direct and daemon modes: - transport.ts — JsonRpcTransport interface; StdioTransport (with optional onClose so the daemon's launcher session doesn't drag the daemon down) + new SocketTransport. 
- engine.ts — shared CodeGraph + watcher + ToolHandler (one per daemon, shared across all sessions). - session.ts — per-connection initialize / tools/list / tools/call state machine (was inline in MCPServer; same regression tests cover it). - daemon.ts — socket listener, lockfile, hello line, ref count + idle timer, graceful shutdown. - proxy.ts — stdio↔socket pipe + PPID watchdog (same parent-death detection as direct mode; the daemon's own lifecycle is driven by its idle timer instead). - index.ts — MCPServer now decides mode at start() and dispatches. Public API (new MCPServer(path).start()) is unchanged. Coverage: - The full existing suite (815 tests) still passes unmodified. - 5 new tests in __tests__/mcp-daemon.test.ts cover: second client attaches as proxy; CODEGRAPH_NO_DAEMON=1 opts out; stale pidfile takeover; idle-timeout exit; version-mismatch fallback to direct mode. Closes #411. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 20 ++ __tests__/mcp-daemon.test.ts | 322 +++++++++++++++++ src/mcp/daemon-paths.ts | 99 +++++ src/mcp/daemon.ts | 381 ++++++++++++++++++++ src/mcp/engine.ts | 232 ++++++++++++ src/mcp/index.ts | 677 ++++++++++++----------------------- src/mcp/proxy.ts | 243 +++++++++++++ src/mcp/session.ts | 270 ++++++++++++++ src/mcp/transport.ts | 354 ++++++++++++++---- src/mcp/version.ts | 36 ++ 10 files changed, 2100 insertions(+), 534 deletions(-) create mode 100644 __tests__/mcp-daemon.test.ts create mode 100644 src/mcp/daemon-paths.ts create mode 100644 src/mcp/daemon.ts create mode 100644 src/mcp/engine.ts create mode 100644 src/mcp/proxy.ts create mode 100644 src/mcp/session.ts create mode 100644 src/mcp/version.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 173aaf93b..2cd6f67fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,26 @@ a [GitHub Release](https://github.com/colbymchenry/codegraph/releases) tagged This project follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and adheres to [Semantic 
Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Added +- **Shared MCP daemon — running multiple AI agents in the same project no + longer multiplies the inotify, SQLite, and indexing cost.** The first + `codegraph serve --mcp` per project becomes a per-project daemon listening + on `.codegraph/daemon.sock` (named pipe on Windows). Subsequent invocations + for the same project attach as thin stdio↔socket proxies — one file + watcher, one SQLite connection, one tree-sitter warm-up no matter how many + Claude Code / Cursor / Codex / opencode sessions you point at the repo. + Two concurrent sessions on a large monorepo used to consume ~880k of the + Linux 1,048,576 per-user inotify budget; they now share ~440k. The daemon + lingers for `CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS` (default 300s) after the + last client disconnects so back-to-back sessions don't repay startup cost. + Resolves issue #411. +- **`CODEGRAPH_NO_DAEMON=1` — opt out of the shared daemon.** Restores the + pre-issue-#411 behavior of one independent server process per client. + Useful for debugging or for environments that don't permit local + IPC sockets. + ## [0.9.5] - 2026-05-25 ### Fixed diff --git a/__tests__/mcp-daemon.test.ts b/__tests__/mcp-daemon.test.ts new file mode 100644 index 000000000..c8019b786 --- /dev/null +++ b/__tests__/mcp-daemon.test.ts @@ -0,0 +1,322 @@ +/** + * Shared MCP daemon — issue #411. + * + * Validates the contract added in `src/mcp/{daemon,proxy,session}.ts`: + * - Two `serve --mcp` invocations against the same project share *one* + * daemon process; the second invocation attaches as a proxy. + * - A stale lockfile (PID gone, no socket) gets cleared so the next + * invocation can become the new daemon. + * - `CODEGRAPH_NO_DAEMON=1` opts out — both processes run independently. + * - The proxy refuses to attach across a version mismatch. 
+ * + * These tests intentionally spawn real `node dist/bin/codegraph.js` processes + * over real sockets — the same surface a Claude Code / Cursor / Codex install + * would exercise. Idle timeouts are forced short via + * `CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS` to keep the suite fast. + */ + +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { ChildProcessWithoutNullStreams, spawn } from 'child_process'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { CodeGraph } from '../src'; + +const BIN = path.resolve(__dirname, '../dist/bin/codegraph.js'); + +interface SpawnedServer { + child: ChildProcessWithoutNullStreams; + stdout: string[]; + stderr: string[]; + // Resolves once the child has emitted at least one stderr line — gives us a + // stable signal that the process is past the `relaunchWithWasmRuntimeFlagsIfNeeded` + // re-exec dance. + spawnedSettled: Promise; +} + +function spawnServer(cwd: string, env: NodeJS.ProcessEnv = {}): SpawnedServer { + const child = spawn(process.execPath, [BIN, 'serve', '--mcp'], { + cwd, + stdio: ['pipe', 'pipe', 'pipe'], + env: { ...process.env, ...env }, + }) as ChildProcessWithoutNullStreams; + const stdout: string[] = []; + const stderr: string[] = []; + let stdoutBuf = ''; + let stderrBuf = ''; + let firstStderrResolve!: () => void; + const spawnedSettled = new Promise((resolve) => { firstStderrResolve = resolve; }); + child.stdout.on('data', (chunk: Buffer) => { + stdoutBuf += chunk.toString('utf8'); + let idx: number; + while ((idx = stdoutBuf.indexOf('\n')) !== -1) { + stdout.push(stdoutBuf.slice(0, idx)); + stdoutBuf = stdoutBuf.slice(idx + 1); + } + }); + child.stderr.on('data', (chunk: Buffer) => { + stderrBuf += chunk.toString('utf8'); + let idx: number; + while ((idx = stderrBuf.indexOf('\n')) !== -1) { + stderr.push(stderrBuf.slice(0, idx)); + stderrBuf = stderrBuf.slice(idx + 1); + } + firstStderrResolve(); + }); + return { child, stdout, stderr, 
spawnedSettled }; +} + +function sendInitialize(child: ChildProcessWithoutNullStreams, rootUri: string, id: number = 0) { + const msg = JSON.stringify({ + jsonrpc: '2.0', + id, + method: 'initialize', + params: { + protocolVersion: '2025-11-25', + capabilities: {}, + clientInfo: { name: 'test', version: '0.0.0' }, + rootUri, + }, + }); + child.stdin.write(msg + '\n'); +} + +function waitFor( + predicate: () => T | undefined | null | false, + timeoutMs: number, + pollMs: number = 25, +): Promise { + return new Promise((resolve, reject) => { + const started = Date.now(); + const tick = () => { + const v = predicate(); + if (v) return resolve(v as T); + if (Date.now() - started > timeoutMs) { + return reject(new Error(`Timed out after ${timeoutMs}ms`)); + } + setTimeout(tick, pollMs); + }; + tick(); + }); +} + +function findInitializeResponse(stdout: string[], id: number) { + for (const line of stdout) { + if (!line.trim()) continue; + try { + const parsed = JSON.parse(line); + if (parsed.id === id && parsed.result?.serverInfo) return parsed; + } catch { /* not JSON */ } + } + return null; +} + +function killTree(...procs: ChildProcessWithoutNullStreams[]) { + for (const p of procs) { + if (!p.killed) { + try { p.kill('SIGKILL'); } catch { /* already gone */ } + } + } +} + +async function waitProcessExit(child: ChildProcessWithoutNullStreams, timeoutMs: number): Promise { + return new Promise((resolve) => { + if (child.exitCode !== null || child.signalCode !== null) return resolve(true); + const timer = setTimeout(() => resolve(false), timeoutMs); + child.once('exit', () => { clearTimeout(timer); resolve(true); }); + }); +} + +describe('Shared MCP daemon (issue #411)', () => { + let tempDir: string; + const servers: SpawnedServer[] = []; + + beforeEach(async () => { + tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-mcp-daemon-')); + // Initialize a real CodeGraph project — the daemon needs `.codegraph/` to + // know where to put its socket + pidfile. 
`CodeGraph.init` writes the SQL + // schema synchronously, so by the time we spawn the server it's ready. + const cg = await CodeGraph.init(tempDir); + cg.close(); + }); + + afterEach(async () => { + killTree(...servers.map((s) => s.child)); + // Give the OS a moment to reap and remove socket files before rmSync. + await new Promise((resolve) => setTimeout(resolve, 50)); + servers.length = 0; + fs.rmSync(tempDir, { recursive: true, force: true }); + }); + + it('second invocation attaches as a proxy to the first', async () => { + // Short idle so the suite doesn't have to wait the production 5 minutes + // if anything leaks — but long enough that the second client's lifetime + // overlaps with the daemon's. + const env = { CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS: '5000' }; + + const first = spawnServer(tempDir, env); + servers.push(first); + sendInitialize(first.child, `file://${tempDir}`, 1); + const firstResponse = await waitFor( + () => findInitializeResponse(first.stdout, 1), + 8000, + ); + expect(firstResponse.result.serverInfo.name).toBe('codegraph'); + + // Daemon should be advertising itself on stderr — proves the daemon path + // ran, not the direct-mode fallback. + expect(first.stderr.some((l) => l.includes('[CodeGraph daemon] Listening on'))).toBe(true); + + // Lockfile + socket exist. + const pidPath = path.join(tempDir, '.codegraph', 'daemon.pid'); + const sockPath = path.join(tempDir, '.codegraph', 'daemon.sock'); + expect(fs.existsSync(pidPath)).toBe(true); + // On POSIX the socket lives at the in-project path unless its absolute + // path exceeded the limit — `os.tmpdir()`-based fallback is rare for + // mkdtemp paths. + expect(fs.existsSync(sockPath)).toBe(true); + + // Second server in the same project should attach as a proxy. 
+ const second = spawnServer(tempDir, env); + servers.push(second); + sendInitialize(second.child, `file://${tempDir}`, 2); + const secondResponse = await waitFor( + () => findInitializeResponse(second.stdout, 2), + 8000, + ); + expect(secondResponse.result.serverInfo.name).toBe('codegraph'); + // The proxy logs its attach to stderr; that's the canonical witness. + await waitFor( + () => second.stderr.some((l) => l.includes('Attached to shared daemon')), + 5000, + ); + }, 30000); + + it('CODEGRAPH_NO_DAEMON=1 keeps both processes independent (no socket)', async () => { + const env = { CODEGRAPH_NO_DAEMON: '1' }; + const first = spawnServer(tempDir, env); + servers.push(first); + sendInitialize(first.child, `file://${tempDir}`, 1); + await waitFor(() => findInitializeResponse(first.stdout, 1), 8000); + // Direct mode — no daemon listener log. + expect(first.stderr.some((l) => l.includes('[CodeGraph daemon] Listening on'))).toBe(false); + // No pidfile in opt-out mode. + expect(fs.existsSync(path.join(tempDir, '.codegraph', 'daemon.pid'))).toBe(false); + }, 20000); + + it('stale pidfile from a dead daemon gets cleared and a fresh daemon takes over', async () => { + // Plant a lockfile pointing at an almost-certainly-dead pid. PID 999999 is + // above the classic Linux pid_max default (32768), though still below the + // 4194304 ceiling. A `process.kill` probe of a nonexistent pid fails with ESRCH, which is what we want.
+ const pidPath = path.join(tempDir, '.codegraph', 'daemon.pid'); + fs.writeFileSync( + pidPath, + JSON.stringify({ + pid: 999_999, + version: '0.0.0-fake', + socketPath: path.join(tempDir, '.codegraph', 'daemon.sock'), + startedAt: Date.now() - 1000, + }), + ); + + const env = { CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS: '5000' }; + const server = spawnServer(tempDir, env); + servers.push(server); + sendInitialize(server.child, `file://${tempDir}`, 1); + let response: { result?: { serverInfo?: { name: string } } } | null = null; + try { + response = await waitFor(() => findInitializeResponse(server.stdout, 1), 8000); + } catch (err) { + throw new Error( + `${(err as Error).message}\nstderr:\n${server.stderr.join('\n')}\nstdout:\n${server.stdout.join('\n')}`, + ); + } + expect(response?.result?.serverInfo?.name).toBe('codegraph'); + // Daemon mode took over. + await waitFor( + () => server.stderr.some((l) => l.includes('[CodeGraph daemon] Listening on')), + 8000, + ); + // Pidfile now reflects a live daemon, not the planted-dead one. (Note: + // we can't compare to `server.child.pid` directly because the CLI may + // re-exec itself with `--liftoff-only`; the daemon lives in the + // grandchild, not the immediate child. What matters is that the pid + // recorded in the lockfile is *alive*, which the planted 999999 wasn't.) + const lockBody = JSON.parse(fs.readFileSync(pidPath, 'utf8')); + expect(lockBody.pid).not.toBe(999_999); + expect(() => process.kill(lockBody.pid, 0)).not.toThrow(); + }, 30000); + + it('proxy falls back to direct mode on daemon version mismatch', async () => { + // Stand up a daemon at a known socket, then write a hello line with a + // mismatched version into a *separate* test socket. The probe path + // doesn't actually need a full daemon — just a peer that produces a + // hello line. We use a hand-rolled mini-server so this test stays + // hermetic and doesn't depend on lockfile-aware behavior of the real + // daemon. 
+ const net = await import('net'); + const sockPath = path.join(tempDir, '.codegraph', 'daemon.sock'); + // Pre-plant a lockfile pointing at a *live* (this test process) pid so + // the takeover loop doesn't unlink the lockfile mid-test. + fs.writeFileSync( + path.join(tempDir, '.codegraph', 'daemon.pid'), + JSON.stringify({ + pid: process.pid, + version: '0.0.0-mismatch', + socketPath: sockPath, + startedAt: Date.now(), + }), + ); + const miniServer = net.createServer((sock) => { + sock.write(JSON.stringify({ codegraph: '0.0.0-mismatch', pid: 1, socketPath: sockPath, protocol: 1 }) + '\n'); + }); + await new Promise((resolve) => miniServer.listen(sockPath, () => resolve())); + + try { + const server = spawnServer(tempDir); + servers.push(server); + sendInitialize(server.child, `file://${tempDir}`, 1); + // Despite the mismatched-version daemon, the client should still get + // an initialize response — proxy refuses to attach and we fall back + // to direct mode. + const response = await waitFor( + () => findInitializeResponse(server.stdout, 1), + 8000, + ); + expect(response.result.serverInfo.name).toBe('codegraph'); + // The version-mismatch fallback message goes to stderr. + await waitFor( + () => server.stderr.some((l) => l.includes('version') && l.includes('falling back to direct mode')), + 4000, + ); + } finally { + await new Promise((resolve) => miniServer.close(() => resolve())); + } + }, 30000); + + it('daemon idle-times-out after the last client disconnects', async () => { + // 800ms idle is enough to ride out any post-disconnect grace; with the + // poll-based unref'd timer it fires quickly. We deliberately don't go + // below ~500ms because the watcher catch-up sync runs in the background + // and hogs the event loop briefly during teardown.
+ const env = { CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS: '800' }; + const server = spawnServer(tempDir, env); + servers.push(server); + sendInitialize(server.child, `file://${tempDir}`, 1); + await waitFor(() => findInitializeResponse(server.stdout, 1), 8000); + await waitFor( + () => server.stderr.some((l) => l.includes('[CodeGraph daemon] Listening on')), + 5000, + ); + + // Close stdin → launcher session drops → no clients → idle timer arms. + server.child.stdin.end(); + + // The daemon should exit on idle. Give it a generous window: idle timer + // (800ms) + a few seconds slack for engine teardown on a slow CI box. + const exited = await waitProcessExit(server.child, 8000); + expect(exited).toBe(true); + // After exit, lockfile + socket should be cleaned up. + expect(fs.existsSync(path.join(tempDir, '.codegraph', 'daemon.pid'))).toBe(false); + }, 30000); +}); diff --git a/src/mcp/daemon-paths.ts b/src/mcp/daemon-paths.ts new file mode 100644 index 000000000..b1afc40df --- /dev/null +++ b/src/mcp/daemon-paths.ts @@ -0,0 +1,99 @@ +/** + * Daemon socket + lockfile path helpers — issue #411. + * + * One shared `codegraph serve --mcp` daemon per project root means we need a + * stable, project-keyed rendezvous between cooperating processes. The IPC + * surface area is just two file paths: + * + * - `daemon.sock` — Unix domain socket / named pipe the daemon listens on. + * - `daemon.pid` — atomic-create lockfile holding the daemon's pid + version. + * + * Both live under `.codegraph/` so the project-scoped uninstall (`codegraph + * uninit`) sweeps them up for free. + * + * Special-case: Unix domain socket paths have a hard length limit (~104 on + * macOS, ~108 on Linux); when the in-project path exceeds it we fall back to + * an absolute-path hash under `os.tmpdir()`. The pidfile always stays in the + * project (it doesn't have a length limit) — and acts as the authoritative + * pointer to the socket path the daemon chose. 
+ */ + +import * as crypto from 'crypto'; +import * as os from 'os'; +import * as path from 'path'; +import { getCodeGraphDir } from '../directory'; + +/** Soft upper bound for in-project socket paths. */ +const POSIX_SOCKET_PATH_LIMIT = 100; + +/** Short stable identifier for a project root — used in tmpdir/pipe names. */ +function projectHash(projectRoot: string): string { + return crypto.createHash('sha256').update(path.resolve(projectRoot)).digest('hex').slice(0, 16); +} + +/** + * Compute the socket / named-pipe path the daemon should listen on (and the + * proxy should connect to) for `projectRoot`. Deterministic given a project + * root, so independent processes converge without coordination. + */ +export function getDaemonSocketPath(projectRoot: string): string { + if (process.platform === 'win32') { + return `\\\\.\\pipe\\codegraph-${projectHash(projectRoot)}`; + } + const inProject = path.join(getCodeGraphDir(projectRoot), 'daemon.sock'); + if (inProject.length <= POSIX_SOCKET_PATH_LIMIT) return inProject; + // Long project paths (deep monorepos, Bazel out dirs) need tmpdir fallback + // or `bind` returns EADDRINUSE / ENAMETOOLONG. Hash keeps it project-scoped. + return path.join(os.tmpdir(), `codegraph-${projectHash(projectRoot)}.sock`); +} + +/** Absolute path to the daemon pid lockfile for `projectRoot`. */ +export function getDaemonPidPath(projectRoot: string): string { + return path.join(getCodeGraphDir(projectRoot), 'daemon.pid'); +} + +/** Structured contents of the pid lockfile. */ +export interface DaemonLockInfo { + pid: number; + version: string; + socketPath: string; + startedAt: number; +} + +/** + * Serialize a {@link DaemonLockInfo} for writing to the pidfile. JSON for + * human readability — operators occasionally `cat` this when debugging. + */ +export function encodeLockInfo(info: DaemonLockInfo): string { + return JSON.stringify(info, null, 2) + '\n'; +} + +/** + * Parse a pidfile body. 
Tolerant of old-format pidfiles (plain decimal pid) so + * a 0.10.x daemon doesn't trip over a 0.9.x lockfile if that ever happens — + * we treat such a lockfile as "process is unknown version, refuse to share." + */ +export function decodeLockInfo(raw: string): DaemonLockInfo | null { + const trimmed = raw.trim(); + if (!trimmed) return null; + try { + const parsed = JSON.parse(trimmed); + if ( + parsed && + typeof parsed.pid === 'number' && + typeof parsed.version === 'string' && + typeof parsed.socketPath === 'string' && + typeof parsed.startedAt === 'number' + ) { + return parsed as DaemonLockInfo; + } + if (typeof parsed !== 'number') return null; // a bare number is valid JSON: that's the legacy plain-pid body, handled below + } catch { + // Not valid JSON either — fall through to legacy plain-pid handling. + } + const pid = Number(trimmed); + if (Number.isInteger(pid) && pid > 0) { + return { pid, version: 'unknown', socketPath: '', startedAt: 0 }; + } + return null; +} diff --git a/src/mcp/daemon.ts b/src/mcp/daemon.ts new file mode 100644 index 000000000..86a2be008 --- /dev/null +++ b/src/mcp/daemon.ts @@ -0,0 +1,381 @@ +/** + * Shared MCP daemon — issue #411. + * + * One `codegraph serve --mcp` process per project root, accepting N concurrent + * MCP clients over a Unix-domain socket (or named pipe on Windows). Each + * incoming connection gets its own {@link MCPSession}; all sessions share a + * single {@link MCPEngine}, which means a single file watcher (one inotify + * set), a single SQLite connection (one WAL writer), and a single tree-sitter + * warm-up — paid once, amortized across every agent talking to the project. + * + * What this file owns: + * - Listening on the daemon socket and spawning per-connection sessions. + * - The handshake "hello" line that lets a proxy verify it found a + * same-version daemon before piping any JSON-RPC through it. + * - The lockfile (`.codegraph/daemon.pid`) that races between daemons are + * resolved against — atomic `O_EXCL` create + cleanup on exit.
+ * - Reference counting + idle timeout: when the last client disconnects + * the daemon lingers for `CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS` (default + * 300s) so back-to-back agent runs in the same project don't repay + * startup. New connection cancels the timer. + * - Graceful shutdown on SIGTERM/SIGINT and idle exit. + * + * What this file does NOT own: + * - The proxy side (`./proxy.ts`). + * - The decision of *whether* to run as daemon at all — that's `MCPServer`. + * - The MCP protocol state machine — that's `./session.ts`. + */ + +import * as fs from 'fs'; +import * as net from 'net'; +import * as path from 'path'; +import { MCPEngine } from './engine'; +import { MCPSession } from './session'; +import { SocketTransport, StdioTransport } from './transport'; +import { + DaemonLockInfo, + decodeLockInfo, + encodeLockInfo, + getDaemonPidPath, + getDaemonSocketPath, +} from './daemon-paths'; +import { CodeGraphPackageVersion } from './version'; + +/** Default idle linger after the last client disconnects. */ +const DEFAULT_IDLE_TIMEOUT_MS = 300_000; + +/** Bytes/parse-window for an oversized hello line — bounded against a malicious peer. */ +const MAX_HELLO_LINE_BYTES = 4096; + +/** + * Wire format for the one-shot hello line the daemon emits on every new + * connection. Versioned with the package's own semver so a 0.9.x proxy never + * pipes through a 0.10.x daemon (or vice-versa) — the proxy falls back to + * direct mode on mismatch rather than risk subtle wire incompatibilities. + */ +export interface DaemonHello { + codegraph: string; // package version (must match the proxy's own version) + pid: number; // daemon pid (informational; for `ps` debugging) + socketPath: string; // echoed back so the proxy can log it + protocol: 1; // bump if the hello shape changes +} + +export interface DaemonStartResult { + /** Always-non-null for a successfully-started daemon. */ + socketPath: string; + /** Lockfile contents as written. 
*/ + lock: DaemonLockInfo; +} + +/** + * Run as the shared daemon for `projectRoot`. Resolves once the socket is + * listening and the lockfile is committed. The returned Daemon owns the + * socket, the engine, and the lockfile until `stop()` is called or it exits + * on idle/signal. + * + * Race-safe: callers must first try `tryAcquireDaemonLock(projectRoot)` and + * only construct a `Daemon` and call `start()` if they got the lock. The atomic `O_EXCL` create + * inside the acquire helper is the only synchronization between competing + * daemons. + */ +export class Daemon { + private server: net.Server | null = null; + private clients = new Set(); + private idleTimer: NodeJS.Timeout | null = null; + private idleTimeoutMs: number; + private engine: MCPEngine; + private stopping = false; + private socketPath: string; + private pidPath: string; + private lockFd: number | null = null; + + constructor( + private projectRoot: string, + opts: { lockFd: number; idleTimeoutMs?: number } = { lockFd: -1 }, + ) { + this.socketPath = getDaemonSocketPath(projectRoot); + this.pidPath = getDaemonPidPath(projectRoot); + this.lockFd = opts.lockFd >= 0 ? opts.lockFd : null; + this.idleTimeoutMs = opts.idleTimeoutMs ?? resolveIdleTimeoutMs(); + this.engine = new MCPEngine(); + this.engine.setProjectPathHint(projectRoot); + } + + /** + * Bind the socket, write the pidfile body, kick off engine init, and + * register signal handlers. The promise resolves once the server is + * listening — the daemon then sticks around until idle/shutdown. + */ + async start(): Promise { + // Engine init is deliberately backgrounded — see #172. The first session + // to land waits on `ensureInitialized` either way, and unloaded sessions + // (cross-project tool calls only) shouldn't pay any open cost. + void this.engine.ensureInitialized(this.projectRoot); + + // Stale socket file (left over from a SIGKILL'd previous daemon) will + // wedge `listen` with EADDRINUSE.
We arrived here holding the lockfile, + // which means there's no live daemon, so it's safe to clear. + if (process.platform !== 'win32') { + try { fs.unlinkSync(this.socketPath); } catch { /* not-exists is fine */ } + } + + await new Promise((resolve, reject) => { + const server = net.createServer((socket) => this.handleConnection(socket)); + server.once('error', (err) => reject(err)); + server.listen(this.socketPath, () => { + // POSIX: tighten permissions to user-only — the socket lives under + // `.codegraph/`, which is git-ignored but may be on a shared FS. + if (process.platform !== 'win32') { + try { fs.chmodSync(this.socketPath, 0o600); } catch { /* best-effort */ } + } + this.server = server; + resolve(); + }); + }); + + const lock: DaemonLockInfo = { + pid: process.pid, + version: CodeGraphPackageVersion, + socketPath: this.socketPath, + startedAt: Date.now(), + }; + this.writeLockFile(lock); + + process.stderr.write( + `[CodeGraph daemon] Listening on ${this.socketPath} (pid ${process.pid}, v${CodeGraphPackageVersion}). Idle timeout ${this.idleTimeoutMs}ms.\n` + ); + + // No clients yet: arm the idle timer immediately so a daemon that nobody + // ever connects to (e.g. spawned by a misconfigured client) doesn't pin + // resources forever. + this.armIdleTimer(); + + process.on('SIGINT', () => this.stop('SIGINT')); + process.on('SIGTERM', () => this.stop('SIGTERM')); + + return { socketPath: this.socketPath, lock }; + } + + /** + * Attach an stdio session for the *launcher* — the MCP host that spawned + * this very process. The launcher already opened a stdio pipe to us and is + * waiting for an `initialize` response; that pipe gets its own session + * just like any socket connection. The transport is configured with + * `exitOnClose: false` so losing the launcher doesn't kill the daemon — + * other socket clients are still entitled to service. 
When stdin closes + * we just remove this session from the client set and arm the idle timer + * if nothing else is connected. + */ + attachStdioLauncherSession(): MCPSession { + let session!: MCPSession; + const transport = new StdioTransport({ + exitOnClose: false, + onClose: () => { + if (session) this.dropClient(session); + }, + }); + session = new MCPSession(transport, this.engine, { + explicitProjectPath: this.projectRoot, + }); + this.clients.add(session); + this.disarmIdleTimer(); + session.start(); + return session; + } + + /** Currently-connected client count. Exposed for tests / status output. */ + getClientCount(): number { + return this.clients.size; + } + + /** Graceful shutdown: close all sessions, the engine, and clean up the lock. */ + async stop(reason: string = 'stop'): Promise { + if (this.stopping) return; + this.stopping = true; + if (this.idleTimer) { + clearTimeout(this.idleTimer); + this.idleTimer = null; + } + process.stderr.write(`[CodeGraph daemon] Shutting down (${reason}; clients=${this.clients.size}).\n`); + for (const session of [...this.clients]) { + try { session.stop(); } catch { /* best-effort */ } + } + this.clients.clear(); + if (this.server) { + await new Promise((resolve) => this.server!.close(() => resolve())); + this.server = null; + } + this.engine.stop(); + this.cleanupLockfile(); + if (process.platform !== 'win32') { + try { fs.unlinkSync(this.socketPath); } catch { /* may already be gone */ } + } + process.exit(0); + } + + private handleConnection(socket: net.Socket): void { + // Hello first so the proxy can verify versions before piping any + // application bytes. The proxy reads exactly one line, then forwards. 
+ const hello: DaemonHello = { + codegraph: CodeGraphPackageVersion, + pid: process.pid, + socketPath: this.socketPath, + protocol: 1, + }; + socket.write(JSON.stringify(hello) + '\n'); + + const transport = new SocketTransport(socket); + const session = new MCPSession(transport, this.engine, { + explicitProjectPath: this.projectRoot, + }); + transport.onClose(() => this.dropClient(session)); + this.clients.add(session); + this.disarmIdleTimer(); + session.start(); + } + + private dropClient(session: MCPSession): void { + if (!this.clients.delete(session)) return; + if (this.clients.size === 0) this.armIdleTimer(); + } + + private armIdleTimer(): void { + if (this.idleTimer || this.stopping) return; + if (this.idleTimeoutMs <= 0) return; // 0 = never idle-exit + this.idleTimer = setTimeout(() => { + this.idleTimer = null; + // Last-second sanity check: if a connection landed between the timer + // firing and now, don't exit. (setImmediate-ordering is the only way + // this races; cheap to defend against.) + if (this.clients.size > 0) { + this.armIdleTimer(); + return; + } + void this.stop('idle timeout'); + }, this.idleTimeoutMs); + // Don't keep the event loop alive just for this — if the socket server + // and active connections are all gone, the loop should drain naturally. + this.idleTimer.unref?.(); + } + + private disarmIdleTimer(): void { + if (!this.idleTimer) return; + clearTimeout(this.idleTimer); + this.idleTimer = null; + } + + private writeLockFile(info: DaemonLockInfo): void { + const body = encodeLockInfo(info); + if (this.lockFd !== null) { + // We came in already holding the lockfile (acquired via `wx`); fill it + // in atomically by writing the body and closing the fd. Subsequent + // readers of the pidfile then see the full record. 
+ fs.writeSync(this.lockFd, body); + fs.closeSync(this.lockFd); + this.lockFd = null; + } else { + // Defensive path — should be unreachable because callers always go + // through `tryAcquireDaemonLock` before constructing a Daemon. + fs.writeFileSync(this.pidPath, body, { flag: 'w' }); + } + } + + private cleanupLockfile(): void { + try { + if (fs.existsSync(this.pidPath)) { + // Only remove if it still belongs to us — another daemon may have + // already taken over while we were shutting down (extremely rare). + const raw = fs.readFileSync(this.pidPath, 'utf8'); + const info = decodeLockInfo(raw); + if (info && info.pid === process.pid) { + fs.unlinkSync(this.pidPath); + } + } + } catch { /* best-effort; we're exiting anyway */ } + } +} + +/** + * Result of `tryAcquireDaemonLock`. Either we got the lockfile (caller becomes + * the daemon), or it already existed (caller should try to connect to the + * existing daemon as a proxy). + */ +export type AcquireResult = + | { kind: 'acquired'; lockFd: number; pidPath: string } + | { kind: 'taken'; existing: DaemonLockInfo | null; pidPath: string }; + +/** + * Atomic-create the daemon pidfile. Returns either an `acquired` result (the + * caller is now the daemon-elect; must call `Daemon.start()` which writes the + * pidfile body and closes the fd) or a `taken` result (some other process + * either is or was the daemon; caller should connect-or-take-over). + * + * The fd is left open for writing with the file still empty — Daemon.start() writes the actual + * body (pid, version, socket path) once it's bound the socket. That way a + * crash mid-acquire leaves an empty pidfile which any subsequent daemon + * candidate can recognize as stale. + */ +export function tryAcquireDaemonLock(projectRoot: string): AcquireResult { + const pidPath = getDaemonPidPath(projectRoot); + // Make sure the .codegraph/ directory exists — the daemon may be the first + // thing to touch it on a fresh-clone-but-already-initialized checkout.
+ fs.mkdirSync(path.dirname(pidPath), { recursive: true }); + + try { + const fd = fs.openSync(pidPath, 'wx', 0o600); + return { kind: 'acquired', lockFd: fd, pidPath }; + } catch (err: unknown) { + const e = err as NodeJS.ErrnoException; + if (e.code !== 'EEXIST') throw err; + } + + let existing: DaemonLockInfo | null = null; + try { + const raw = fs.readFileSync(pidPath, 'utf8'); + existing = decodeLockInfo(raw); + } catch { /* unreadable lockfile — treat as malformed */ } + return { kind: 'taken', existing, pidPath }; +} + +/** + * Remove a stale pidfile and return whether we successfully cleared it. Used + * by callers that detected a "taken" lock pointing at a dead pid. + */ +export function clearStaleDaemonLock(pidPath: string): boolean { + try { + fs.unlinkSync(pidPath); + return true; + } catch (err: unknown) { + const e = err as NodeJS.ErrnoException; + if (e.code === 'ENOENT') return true; + return false; + } +} + +/** + * Probe whether `pid` is currently alive (signal-0). The probe fails with + * EPERM when the process exists but is not ours to signal; we report that + * as "still alive" to be conservative — better to fall back to direct mode + * than to nuke a stranger's daemon. + */ +export function isProcessAlive(pid: number): boolean { + try { + process.kill(pid, 0); + return true; + } catch (err: unknown) { + const e = err as NodeJS.ErrnoException; + if (e.code === 'EPERM') return true; // exists, just not ours to signal + return false; + } +} + +function resolveIdleTimeoutMs(): number { + const raw = process.env.CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS; + if (raw === undefined || raw === '') return DEFAULT_IDLE_TIMEOUT_MS; + const parsed = Number(raw); + if (!Number.isFinite(parsed) || parsed < 0) return DEFAULT_IDLE_TIMEOUT_MS; + return Math.floor(parsed); +} + +/** Exported for test stubs that need to bound the hello-line read.
*/ +export { MAX_HELLO_LINE_BYTES }; diff --git a/src/mcp/engine.ts b/src/mcp/engine.ts new file mode 100644 index 000000000..1d7bd573e --- /dev/null +++ b/src/mcp/engine.ts @@ -0,0 +1,232 @@ +/** + * MCP shared engine — the heavyweight, *shared* state for an MCP server: + * the project's {@link CodeGraph} instance, file watcher, and the + * {@link ToolHandler} cache for cross-project queries. + * + * One engine, many sessions: + * - direct mode (single stdio session) instantiates one engine + one session; + * - daemon mode instantiates one engine and a new session per socket + * connection. Every session reads from the same SQLite WAL and the same + * inotify watch set — that's the entire point of issue #411. + */ + +import CodeGraph, { findNearestCodeGraphRoot } from '../index'; +import { watchDisabledReason } from '../sync'; +import { ToolHandler } from './tools'; + +export interface MCPEngineOptions { + /** + * Whether to start the file watcher when initializing. Daemon and direct + * modes both want this true; tests may set it false to keep the engine + * cheap. Honors {@link watchDisabledReason} regardless. + */ + watch?: boolean; +} + +/** + * Shared MCP engine. Thread-safe in the sense that multiple sessions can + * call its methods concurrently — internally it serializes initialization + * through a single promise so multiple sessions racing each other on first + * connect never double-open the SQLite file. + */ +export class MCPEngine { + private cg: CodeGraph | null = null; + private toolHandler: ToolHandler; + // Project root we resolved to. Null until `ensureInitialized` succeeds + // (or null forever if no .codegraph/ ever turned up — that's a valid + // state for the engine, since cross-project queries still work). + private projectPath: string | null = null; + // Set on first `ensureInitialized` so subsequent sessions don't redo work. 
+ private initPromise: Promise | null = null; + private watcherStarted = false; + private opts: Required; + private closed = false; + + constructor(opts: MCPEngineOptions = {}) { + this.opts = { watch: opts.watch ?? true }; + this.toolHandler = new ToolHandler(null); + } + + /** + * Convenience for {@link MCPServer} compatibility: pre-seed an explicit + * project path (from the `--path` CLI flag) without yet opening it. This + * keeps the synchronous constructor cheap; the actual open happens on the + * first `ensureInitialized` call. + */ + setProjectPathHint(projectPath: string): void { + this.projectPath = projectPath; + this.toolHandler.setDefaultProjectHint(projectPath); + } + + /** Project root that the engine resolved on first init (null if none). */ + getProjectPath(): string | null { + return this.projectPath; + } + + /** Shared ToolHandler — sessions delegate tool dispatch through this. */ + getToolHandler(): ToolHandler { + return this.toolHandler; + } + + /** Whether the default project's CodeGraph is open. */ + hasDefaultCodeGraph(): boolean { + return this.toolHandler.hasDefaultCodeGraph(); + } + + /** + * Walk up from `searchFrom` to find the nearest `.codegraph/` and open it. + * Idempotent: concurrent callers share one in-flight init; subsequent + * callers after success are no-ops. + * + * The original `MCPServer.tryInitializeDefault` carried the same retry-on- + * subsequent-tool-call semantics; we preserve them by NOT throwing when the + * search misses (just leaves `cg` null so the next call can retry). 
+ */ + async ensureInitialized(searchFrom: string): Promise { + if (this.closed) return; + if (this.toolHandler.hasDefaultCodeGraph()) return; + if (this.initPromise) { + try { await this.initPromise; } catch { /* let caller retry */ } + return; + } + + this.initPromise = this.doInitialize(searchFrom).finally(() => { + this.initPromise = null; + }); + try { + await this.initPromise; + } catch { + // Init errors are logged inside `doInitialize`; falling through here + // matches MCPServer's previous "retry on next tool call" behavior. + } + } + + /** + * Synchronous last-resort init used by the per-session retry loop when the + * background `ensureInitialized` already finished (or failed) and we need + * to pick up a project that appeared *after* the engine started. + */ + retryInitializeSync(searchFrom: string): void { + if (this.closed) return; + if (this.toolHandler.hasDefaultCodeGraph()) return; + this.toolHandler.setDefaultProjectHint(searchFrom); + const resolvedRoot = findNearestCodeGraphRoot(searchFrom); + if (!resolvedRoot) return; + try { + // Close any previously failed instance to avoid leaking resources. + if (this.cg) { + try { this.cg.close(); } catch { /* ignore */ } + this.cg = null; + } + this.cg = CodeGraph.openSync(resolvedRoot); + this.projectPath = resolvedRoot; + this.toolHandler.setDefaultCodeGraph(this.cg); + this.startWatching(); + this.catchUpSync(); + } catch { + // Still failing — caller will try again on the next tool call. + } + } + + /** + * Close everything. Used on graceful daemon shutdown (SIGTERM/idle timeout) + * and on direct-mode stop. Idempotent. 
+ */ + stop(): void { + if (this.closed) return; + this.closed = true; + this.toolHandler.closeAll(); + if (this.cg) { + try { this.cg.close(); } catch { /* ignore */ } + this.cg = null; + } + } + + private async doInitialize(searchFrom: string): Promise { + this.toolHandler.setDefaultProjectHint(searchFrom); + + const resolvedRoot = findNearestCodeGraphRoot(searchFrom); + if (!resolvedRoot) { + // No .codegraph/ above searchFrom — that's not an error, sessions may + // still discover one later via roots/list. + this.projectPath = searchFrom; + return; + } + + this.projectPath = resolvedRoot; + try { + this.cg = await CodeGraph.open(resolvedRoot); + this.toolHandler.setDefaultCodeGraph(this.cg); + this.startWatching(); + this.catchUpSync(); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + process.stderr.write(`[CodeGraph MCP] Failed to open project at ${resolvedRoot}: ${msg}\n`); + } + } + + /** + * Start file watching on the active CodeGraph instance. Idempotent — the + * watcher is per-engine, not per-session, which is why the daemon path + * collapses N inotify sets to one. The wording of the disabled-reason log + * exactly matches the prior in-tree implementation so log-driven dashboards + * keep working. + */ + private startWatching(): void { + if (!this.cg || this.watcherStarted || !this.opts.watch) return; + + const disabledReason = watchDisabledReason(this.projectPath ?? process.cwd()); + if (disabledReason) { + process.stderr.write( + `[CodeGraph MCP] File watcher disabled — ${disabledReason}. 
` + + `The graph will not auto-update; run \`codegraph sync\` (or install the git sync hooks via \`codegraph init\`) to refresh.\n` + ); + this.watcherStarted = true; + return; + } + + const started = this.cg.watch({ + onSyncComplete: (result) => { + if (result.filesChanged > 0) { + process.stderr.write( + `[CodeGraph MCP] Auto-synced ${result.filesChanged} file(s) in ${result.durationMs}ms\n` + ); + } + }, + onSyncError: (err) => { + process.stderr.write(`[CodeGraph MCP] Auto-sync error: ${err.message}\n`); + }, + }); + + this.watcherStarted = true; + if (started) { + process.stderr.write('[CodeGraph MCP] File watcher active — graph will auto-sync on changes\n'); + } else { + process.stderr.write( + '[CodeGraph MCP] File watcher unavailable on this platform — run `codegraph sync` to refresh the graph after changes.\n' + ); + } + } + + /** + * Reconcile the index with the current filesystem once, right after open — + * catches edits, adds, deletes, and `git pull`/`checkout` changes made while + * no watcher was running. Background, never awaited. + */ + private catchUpSync(): void { + const cg = this.cg; + if (!cg) return; + void cg + .sync() + .then((result) => { + const changed = result.filesAdded + result.filesModified + result.filesRemoved; + if (changed > 0) { + process.stderr.write(`[CodeGraph MCP] Caught up ${changed} file(s) changed since last run\n`); + } + }) + .catch((err) => { + const msg = err instanceof Error ? err.message : String(err); + process.stderr.write(`[CodeGraph MCP] Catch-up sync failed: ${msg}\n`); + }); + } +} diff --git a/src/mcp/index.ts b/src/mcp/index.ts index 71dbc9b9f..2179eddf9 100644 --- a/src/mcp/index.ts +++ b/src/mcp/index.ts @@ -13,60 +13,54 @@ * const server = new MCPServer('/path/to/project'); * await server.start(); * ``` + * + * Three runtime modes (decided in {@link MCPServer.start}): + * + * - **Direct** — one process serves one MCP client over stdio. 
Today's + * behavior; used when no shareable daemon is reachable or the user opted + * out via `CODEGRAPH_NO_DAEMON=1`. + * - **Daemon** — accept N concurrent MCP clients over a Unix-domain socket / + * named pipe, sharing one CodeGraph + watcher + SQLite handle. See + * {@link ./daemon.ts} and issue #411 for the rationale. + * - **Proxy** — pure stdio↔socket pipe to an existing daemon. See + * {@link ./proxy.ts}. */ -import * as path from 'path'; -import CodeGraph, { findNearestCodeGraphRoot } from '../index'; -import { watchDisabledReason } from '../sync'; -import { StdioTransport, JsonRpcRequest, JsonRpcNotification, ErrorCodes } from './transport'; -import { tools, ToolHandler } from './tools'; -import { SERVER_INSTRUCTIONS } from './server-instructions'; +import { findNearestCodeGraphRoot } from '../index'; +import { StdioTransport } from './transport'; +import { MCPEngine } from './engine'; +import { MCPSession } from './session'; +import { + Daemon, + clearStaleDaemonLock, + isProcessAlive, + tryAcquireDaemonLock, +} from './daemon'; +import { runProxy } from './proxy'; +import { getDaemonSocketPath } from './daemon-paths'; import { HOST_PPID_ENV } from '../extraction/wasm-runtime-flags'; /** - * Convert a file:// URI to a filesystem path. - * Handles URL encoding and Windows drive letter paths. - */ -function fileUriToPath(uri: string): string { - try { - const url = new URL(uri); - let filePath = decodeURIComponent(url.pathname); - // On Windows, file:///C:/path produces pathname /C:/path — strip leading / - if (process.platform === 'win32' && /^\/[a-zA-Z]:/.test(filePath)) { - filePath = filePath.slice(1); - } - return path.resolve(filePath); - } catch { - // Fallback for non-standard URIs - return uri.replace(/^file:\/\/\/?/, ''); - } -} - -/** - * MCP Server Info - */ -const SERVER_INFO = { - name: 'codegraph', - version: '0.1.0', -}; - -/** - * MCP Protocol Version + * How often to poll `process.ppid` to detect parent process death (see #277). 
+ * 5s is a deliberate trade-off: the failure mode being guarded against is rare + * (parent SIGKILL'd), and longer poll = less wakeup overhead while idle. */ -const PROTOCOL_VERSION = '2024-11-05'; +const DEFAULT_PPID_POLL_MS = 5000; /** - * How long to wait for the client's `roots/list` response before giving up - * and falling back to the process cwd. + * Max retries when a stale-lock takeover races other candidates. After this + * many failed acquire+probe rounds we give up and fall back to direct mode — + * something is wedged enough that adding our own daemon to the mix would only + * make it worse. */ -const ROOTS_LIST_TIMEOUT_MS = 5000; +const TAKEOVER_MAX_RETRIES = 3; /** - * How often to poll `process.ppid` to detect parent process death (see #277). - * 5s is a deliberate trade-off: the failure mode being guarded against is rare - * (parent SIGKILL'd), and longer poll = less wakeup overhead while idle. + * Brief sleep between takeover retries so a freshly-spawned daemon has time + * to bind its socket. 100ms is well under any realistic startup, so a + * legitimate race resolves on the first or second retry. */ -const DEFAULT_PPID_POLL_MS = 5000; +const TAKEOVER_RETRY_DELAY_MS = 100; /** * Resolve the PPID watchdog poll interval from an env override. A value of @@ -96,28 +90,22 @@ function parseHostPpid(raw: string | undefined): number | null { return parsed; } -/** True if a process with `pid` currently exists (signal-0 probe). */ -function isProcessAlive(pid: number): boolean { - try { - process.kill(pid, 0); - return true; - } catch { - return false; - } +/** Whether `CODEGRAPH_NO_DAEMON` was set to a truthy value. */ +function daemonOptOutSet(): boolean { + const raw = process.env.CODEGRAPH_NO_DAEMON; + if (!raw) return false; + return raw !== '0' && raw.toLowerCase() !== 'false'; } /** - * Extract the first usable filesystem path from a `roots/list` result. - * Shape per MCP spec: `{ roots: [{ uri: "file:///path", name?: string }] }`.
- * Returns null if the result is empty or malformed. + * Resolve the project root the daemon machinery should key on. Returns + * `null` when no `.codegraph/` is reachable from the candidate path — in + * that case the caller must run in direct mode, since the daemon lockfile + * and socket both live under `.codegraph/`. */ -function firstRootPath(result: unknown): string | null { - if (!result || typeof result !== 'object') return null; - const roots = (result as { roots?: unknown }).roots; - if (!Array.isArray(roots) || roots.length === 0) return null; - const first = roots[0] as { uri?: unknown }; - if (typeof first?.uri !== 'string') return null; - return fileUriToPath(first.uri); +function resolveDaemonRoot(explicitPath: string | null): string | null { + const candidate = explicitPath ?? process.cwd(); + return findNearestCodeGraphRoot(candidate); } /** @@ -125,281 +113,85 @@ function firstRootPath(result: unknown): string | null { * * Implements the Model Context Protocol to expose CodeGraph * functionality as tools that can be called by AI assistants. + * + * Backwards-compatible constructor and `start()` signature with the + * pre-issue-#411 implementation: callers continue to do + * `new MCPServer(path).start()`. Internally we now pick from direct / proxy / + * daemon at start time. */ export class MCPServer { - private transport: StdioTransport; - private cg: CodeGraph | null = null; - private toolHandler: ToolHandler; private projectPath: string | null; - // In-flight background init kicked off from handleInitialize. Tracked so the - // sync retry path doesn't race against it (double-opening the SQLite file). - private initPromise: Promise | null = null; - // Whether the client advertised the MCP `roots` capability during initialize. - // If so, and no explicit project path was given, we ask it for the workspace - // root via roots/list rather than guessing from the (often wrong) cwd. 
- private clientSupportsRoots = false; - // Guards the one-shot deferred resolution (roots/list or cwd) so we don't - // re-issue roots/list on every tool call. - private rootsAttempted = false; - // PPID watchdog — see start(). Captured at construction so we always have a + // Direct-mode-only state. In daemon mode the per-connection session lives + // inside the Daemon class; in proxy mode there is no session at all. + private session: MCPSession | null = null; + private engine: MCPEngine | null = null; + private daemon: Daemon | null = null; + private ppidWatchdog: ReturnType | null = null; + // PPID watchdog baseline — captured at construction so we always have a // baseline, even if start() runs after a fork-style reparent. private originalPpid: number = process.ppid; - // The MCP host's PID, propagated across the `--liftoff-only` re-exec (see - // HOST_PPID_ENV). When set, the watchdog polls it directly: the re-exec - // inserts an intermediate process whose *death* — not just our reparenting — - // is what we'd otherwise miss. null on the direct (bundled) launch path. private hostPpid: number | null = parseHostPpid(process.env[HOST_PPID_ENV]); - private ppidWatchdog: ReturnType | null = null; - // Idempotency guard for stop(). Without it, the watchdog can race with the - // stdin `end`/`close` handlers (or SIGTERM/SIGINT) and double-close cg and - // the transport before process.exit() lands. + // Idempotency guard for stop(). private stopped = false; + private mode: 'unstarted' | 'direct' | 'proxy' | 'daemon' = 'unstarted'; constructor(projectPath?: string) { this.projectPath = projectPath || null; - this.transport = new StdioTransport(); - // Create ToolHandler eagerly — cross-project queries work even without a default project - this.toolHandler = new ToolHandler(null); } /** - * Start the MCP server + * Start the MCP server. 
* - * Note: CodeGraph initialization is deferred until the initialize request - * is received, which includes the rootUri from the client. - */ - async start(): Promise { - // Start listening for messages immediately - don't check initialization yet - // We'll get the project path from the initialize request's rootUri - this.transport.start(this.handleMessage.bind(this)); - - // Keep the process running - process.on('SIGINT', () => this.stop()); - process.on('SIGTERM', () => this.stop()); - - // When the parent process (Claude Code) exits, stdin closes. - // Detect this and shut down gracefully to prevent orphaned processes. - process.stdin.on('end', () => this.stop()); - process.stdin.on('close', () => this.stop()); - - // PPID watchdog (#277). Linux doesn't propagate parent death to children, - // so when the MCP host (Claude Code, opencode, …) is SIGKILL'd by the OOM - // killer / a force-quit / a container teardown, the child is reparented to - // init/systemd and the stdin `end`/`close` events don't always fire. The - // server would then linger indefinitely, holding inotify watches, file - // descriptors, and the SQLite WAL. Poll `process.ppid` and shut down the - // moment it changes from what we observed at startup. Cross-platform: - // reparenting changes ppid on Linux *and* macOS; on Windows the value can - // also drop to 0 once the parent is gone. When the CLI re-execs itself for - // `--liftoff-only`, an intermediate process sits between us and the host and - // outlives it, so our own ppid wouldn't change — in that case we poll the - // host PID (propagated via HOST_PPID_ENV) for liveness instead. The watchdog - // is `.unref()`'d so it never holds the event loop open on its own. 
- const pollMs = parsePpidPollMs(process.env.CODEGRAPH_PPID_POLL_MS); - if (pollMs > 0) { - this.ppidWatchdog = setInterval(() => { - const current = process.ppid; - const ppidChanged = current !== this.originalPpid; - const hostGone = this.hostPpid !== null && !isProcessAlive(this.hostPpid); - if (ppidChanged || hostGone) { - const reason = ppidChanged - ? `ppid ${this.originalPpid} -> ${current}` - : `host pid ${this.hostPpid} exited`; - process.stderr.write( - `[CodeGraph MCP] Parent process exited (${reason}); shutting down.\n` - ); - this.stop(); - } - }, pollMs); - this.ppidWatchdog.unref(); - } - } - - /** - * Try to initialize CodeGraph for the default project. - * - * Walks up parent directories to find the nearest .codegraph/ folder, - * similar to how git finds .git/ directories. + * Decision order: + * 1. If `CODEGRAPH_NO_DAEMON=1` → direct mode (unchanged behavior). + * 2. If no `.codegraph/` reachable → direct mode (daemon needs a lockfile + * and socket location, which both live under `.codegraph/`). + * 3. Try to attach to an existing daemon as a proxy. + * 4. Otherwise become the daemon ourselves. * - * If initialization fails, the error is recorded but the server continues - * to work — cross-project queries and retries on subsequent tool calls - * are still possible. + * On any unexpected failure in steps 3–4 we transparently fall back to + * direct mode — a misbehaving daemon must never block a session from + * starting. */ - private async tryInitializeDefault(projectPath: string): Promise { - // Record where we searched so a later "not initialized" error can name it. 
- this.toolHandler.setDefaultProjectHint(projectPath); - - // Walk up parent directories to find nearest .codegraph/ - const resolvedRoot = findNearestCodeGraphRoot(projectPath); - - if (!resolvedRoot) { - this.projectPath = projectPath; - return; - } - - this.projectPath = resolvedRoot; - - try { - this.cg = await CodeGraph.open(resolvedRoot); - this.toolHandler.setDefaultCodeGraph(this.cg); - this.startWatching(); - this.catchUpSync(); - } catch (err) { - // Log the error so transient failures are diagnosable (see issue #47) - const msg = err instanceof Error ? err.message : String(err); - process.stderr.write(`[CodeGraph MCP] Failed to open project at ${resolvedRoot}: ${msg}\n`); - } - } - - /** - * Retry initialization of the default project if it previously failed. - * Called lazily on tool calls that need the default project. - * Re-walks parent directories each time so it picks up projects - * initialized after the MCP server started. - * - * Awaits any in-flight background init (kicked off by handleInitialize) so - * we never open the SQLite file twice concurrently. - */ - private async retryInitIfNeeded(): Promise { - // Wait for the background init started during handleInitialize, if any. - if (this.initPromise) { - try { await this.initPromise; } catch { /* errored init falls through to retry */ } - } - - // Already initialized successfully - if (this.toolHandler.hasDefaultCodeGraph()) return; - - // No explicit path was given at initialize. Resolve it now, exactly once: - // ask the client via roots/list (if it advertised roots), else use cwd. - // Deferring to here lets a roots answer override the wrong cwd, and the - // one-shot guard means we never re-issue roots/list per tool call. - if (!this.projectPath && !this.rootsAttempted) { - this.rootsAttempted = true; - this.initPromise = ( - this.clientSupportsRoots - ? 
this.initFromRoots() - : this.tryInitializeDefault(process.cwd()) - ).finally(() => { this.initPromise = null; }); - try { await this.initPromise; } catch { /* fall through to last-resort below */ } - if (this.toolHandler.hasDefaultCodeGraph()) return; + async start(): Promise { + // Direct mode if the user opted out. Done first so debugging is simple: + // setting the env var is sufficient to get the pre-#411 behavior. + if (daemonOptOutSet()) { + return this.startDirect('CODEGRAPH_NO_DAEMON set'); } - // Last resort: re-walk from the best candidate we have. Picks up projects - // initialized after the server started, and covers clients that sent no - // usable initialize signal at all. - const candidate = this.projectPath ?? process.cwd(); - this.toolHandler.setDefaultProjectHint(candidate); - const resolvedRoot = findNearestCodeGraphRoot(candidate); - if (!resolvedRoot) return; - - try { - // Close any previously failed instance to avoid leaking resources - if (this.cg) { - try { this.cg.close(); } catch { /* ignore */ } - this.cg = null; - } - this.cg = CodeGraph.openSync(resolvedRoot); - this.projectPath = resolvedRoot; - this.toolHandler.setDefaultCodeGraph(this.cg); - this.startWatching(); - this.catchUpSync(); - } catch { - // Still failing — will retry on next tool call + const root = resolveDaemonRoot(this.projectPath); + if (!root) { + // No initialized project found — daemon mode has nowhere to put its + // socket. This is the fresh-checkout / outside-project case; behave + // exactly as before. + return this.startDirect('no .codegraph/ root found'); } - } - /** - * Resolve the project root via the MCP `roots/list` request and initialize - * from the first root the client reports. Falls back to the process cwd if - * the client returns no usable root or doesn't answer in time. See issue #196. - */ - private async initFromRoots(): Promise { - let target = process.cwd(); + // Try the daemon attach/spawn dance. 
try { - const result = await this.transport.request('roots/list', undefined, ROOTS_LIST_TIMEOUT_MS); - const rootPath = firstRootPath(result); - if (rootPath) { - target = rootPath; - } else { - process.stderr.write('[CodeGraph MCP] Client returned no workspace roots; falling back to process cwd.\n'); + const mode = await this.startDaemonOrProxy(root); + if (mode === 'fallback') { + return this.startDirect('daemon attach/start failed; fallback to direct'); } + this.mode = mode; + this.installSignalHandlers(); + this.installPpidWatchdog(); + return; } catch (err) { + // Belt-and-braces: if anything throws inside the daemon machinery, + // never wedge the user — fall back to a working direct-mode session. const msg = err instanceof Error ? err.message : String(err); - process.stderr.write(`[CodeGraph MCP] roots/list request failed (${msg}); falling back to process cwd.\n`); - } - await this.tryInitializeDefault(target); - } - - /** - * Start file watching on the active CodeGraph instance. - * Logs sync activity to stderr for diagnostics. - */ - private startWatching(): void { - if (!this.cg) return; - - // When the watcher is intentionally disabled (e.g. WSL2 /mnt drives, or - // CODEGRAPH_NO_WATCH=1), say so explicitly and tell the user how to keep - // the graph fresh — otherwise the silent staleness is hard to diagnose. - const disabledReason = watchDisabledReason(this.projectPath ?? process.cwd()); - if (disabledReason) { - process.stderr.write( - `[CodeGraph MCP] File watcher disabled — ${disabledReason}. 
` + - `The graph will not auto-update; run \`codegraph sync\` (or install the git sync hooks via \`codegraph init\`) to refresh.\n` - ); - return; - } - - const started = this.cg.watch({ - onSyncComplete: (result) => { - if (result.filesChanged > 0) { - process.stderr.write( - `[CodeGraph MCP] Auto-synced ${result.filesChanged} file(s) in ${result.durationMs}ms\n` - ); - } - }, - onSyncError: (err) => { - process.stderr.write(`[CodeGraph MCP] Auto-sync error: ${err.message}\n`); - }, - }); - - if (started) { - process.stderr.write('[CodeGraph MCP] File watcher active — graph will auto-sync on changes\n'); - } else { - // start() can also return false when recursive fs.watch isn't supported. - process.stderr.write( - '[CodeGraph MCP] File watcher unavailable on this platform — run `codegraph sync` to refresh the graph after changes.\n' - ); + process.stderr.write(`[CodeGraph MCP] Daemon path failed (${msg}); falling back to direct mode.\n`); + return this.startDirect('daemon path threw'); } } /** - * Reconcile the index with the current filesystem once, right after connect — - * catches edits, adds, deletes, and `git pull`/`checkout` changes made while - * no watcher was running. Runs in the background so it never delays the - * `initialize` response; `sync()` is incremental (a stat pre-filter skips - * unchanged files) and mutex-guarded, so it can't collide with the live - * watcher or a git-hook sync. Runs even when the watcher is unavailable - * (e.g. WSL2 /mnt drives), where catch-up matters most. - */ - private catchUpSync(): void { - const cg = this.cg; - if (!cg) return; - void cg - .sync() - .then((result) => { - const changed = result.filesAdded + result.filesModified + result.filesRemoved; - if (changed > 0) { - process.stderr.write(`[CodeGraph MCP] Caught up ${changed} file(s) changed since last run\n`); - } - }) - .catch((err) => { - const msg = err instanceof Error ? 
err.message : String(err); - process.stderr.write(`[CodeGraph MCP] Catch-up sync failed: ${msg}\n`); - }); - } - - /** - * Stop the server + * Stop the server. In daemon mode this triggers graceful shutdown of every + * connected session; in proxy mode there is no daemon, session, or engine + * to tear down, so `stop()` just exits the process; in direct mode this mirrors the + * pre-#411 behavior (close cg, exit). */ stop(): void { if (this.stopped) return; @@ -408,181 +200,154 @@ export class MCPServer { clearInterval(this.ppidWatchdog); this.ppidWatchdog = null; } - // Close all cached cross-project connections first - this.toolHandler.closeAll(); - // Close the main CodeGraph instance - if (this.cg) { - this.cg.close(); - this.cg = null; + if (this.daemon) { + void this.daemon.stop('stop()'); + // Daemon.stop calls process.exit; nothing else to do. + return; + } + if (this.session) { + this.session.stop(); + this.session = null; + } + if (this.engine) { + this.engine.stop(); + this.engine = null; } - this.transport.stop(); process.exit(0); } - /** - * Handle incoming JSON-RPC messages - */ - private async handleMessage(message: JsonRpcRequest | JsonRpcNotification): Promise { - // Check if it's a request (has id) or notification (no id) - const isRequest = 'id' in message; - - switch (message.method) { - case 'initialize': - if (isRequest) { - await this.handleInitialize(message as JsonRpcRequest); - } - break; - - case 'initialized': - // Notification that client has finished initialization - // No action needed - the client is ready - break; + /** Single-process stdio MCP session — the pre-issue-#411 code path.
*/ + private async startDirect(reason: string): Promise { + if (reason && process.env.CODEGRAPH_MCP_DEBUG) { + process.stderr.write(`[CodeGraph MCP] Direct mode: ${reason}.\n`); + } + this.engine = new MCPEngine(); + const transport = new StdioTransport(); + this.session = new MCPSession(transport, this.engine, { + explicitProjectPath: this.projectPath, + }); - case 'tools/list': - if (isRequest) { - await this.handleToolsList(message as JsonRpcRequest); - } - break; + if (this.projectPath) { + // Background init so the initialize response stays fast (#172). + void this.engine.ensureInitialized(this.projectPath); + } - case 'tools/call': - if (isRequest) { - await this.handleToolsCall(message as JsonRpcRequest); - } - break; + this.session.start(); - case 'ping': - if (isRequest) { - this.transport.sendResult((message as JsonRpcRequest).id, {}); - } - break; + // Detect parent-process death — same logic as pre-refactor. When stdin + // closes we go through StdioTransport's `process.exit(0)` already, but + // SIGKILL of the parent doesn't reliably close stdin on Linux (#277). + process.stdin.on('end', () => this.stop()); + process.stdin.on('close', () => this.stop()); - default: - if (isRequest) { - this.transport.sendError( - (message as JsonRpcRequest).id, - ErrorCodes.MethodNotFound, - `Method not found: ${message.method}` - ); - } - } + this.mode = 'direct'; + this.installSignalHandlers(); + this.installPpidWatchdog(); } /** - * Handle initialize request + * Try to attach as proxy or start as daemon. Returns 'proxy' / 'daemon' on + * success, 'fallback' if the caller should retry in direct mode. 
*/ - private async handleInitialize(request: JsonRpcRequest): Promise { - const params = request.params as { - rootUri?: string; - workspaceFolders?: Array<{ uri: string; name: string }>; - capabilities?: { roots?: unknown }; - } | undefined; + private async startDaemonOrProxy(root: string): Promise<'proxy' | 'daemon' | 'fallback'> { + for (let attempt = 0; attempt < TAKEOVER_MAX_RETRIES; attempt++) { + const lock = tryAcquireDaemonLock(root); + + if (lock.kind === 'acquired') { + const daemon = new Daemon(root, { lockFd: lock.lockFd }); + await daemon.start(); + // The MCP host launched us over stdio and is waiting for our + // `initialize` response — attach it as the daemon's first session + // so we never silently drop the launcher. Subsequent invocations + // discover us via the socket and proxy in. + daemon.attachStdioLauncherSession(); + this.daemon = daemon; + return 'daemon'; + } - // Does the client support the MCP `roots` protocol? If so, and we have no - // explicit path, we ask it for the workspace root after the handshake - // instead of falling back to the (frequently wrong) cwd. See issue #196. - this.clientSupportsRoots = !!params?.capabilities?.roots; + // Lock is taken — that *should* mean a daemon is alive. Probe. + const socketPath = lock.existing?.socketPath || getDaemonSocketPath(root); + const probe = await runProxy(socketPath); + if (probe.outcome === 'proxied') { + // runProxy only returns when the connection has CLOSED — meaning we + // already piped stdio and are now exiting. From here we should not + // start anything else. The process is expected to terminate + // naturally after this function returns. + return 'proxy'; + } - // Explicit project signal, strongest first: a client-provided rootUri / - // workspaceFolders (LSP-style, non-standard but some clients send it), else - // the --path the server was launched with. cwd is NOT used here — we defer - // it so a roots/list answer can win over it. 
- let explicitPath: string | null = null; - if (params?.rootUri) { - explicitPath = fileUriToPath(params.rootUri); - } else if (params?.workspaceFolders?.[0]?.uri) { - explicitPath = fileUriToPath(params.workspaceFolders[0].uri); - } else if (this.projectPath) { - explicitPath = this.projectPath; - } + // Proxy didn't attach. Possible causes: + // (a) Daemon is mid-startup and hasn't bound the socket yet — retry. + // (b) Daemon crashed but lockfile leaked — clear it and retry. + // (c) Daemon is alive but version-mismatched — fall back to direct. + if (probe.reason === 'version mismatch') { + return 'fallback'; + } - // Respond to the handshake BEFORE doing any heavy initialization. Loading - // the SQLite DB and the tree-sitter WASM runtime can take many seconds on - // slow filesystems (Docker Desktop VirtioFS on macOS, WSL2). Clients like - // Claude Code time out the handshake at ~30s, which manifested as - // "MCP tools never appear" — the child was alive and had received the - // initialize but was still awaiting initGrammars(). See issue #172. - // - // We accept the client's protocol version but respond with our supported - // version. The `instructions` field is surfaced by MCP clients in the - // agent's system prompt automatically — it's the right place for the - // universal tool-selection playbook, ahead of individual tool descriptions. - this.transport.sendResult(request.id, { - protocolVersion: PROTOCOL_VERSION, - capabilities: { - tools: {}, - }, - serverInfo: SERVER_INFO, - instructions: SERVER_INSTRUCTIONS, - }); + if (lock.existing && lock.existing.pid > 0 && isProcessAlive(lock.existing.pid)) { + // Daemon process is alive but its socket isn't accepting — probably + // (a). Sleep briefly and try again. + await sleep(TAKEOVER_RETRY_DELAY_MS); + continue; + } - // If we know the project dir, kick off init in the background now. 
Tool - // calls that arrive before it finishes fall through to `retryInitIfNeeded`, - // which waits for this promise rather than racing it with a second open. - // - // If we DON'T know it (no rootUri, no --path), defer: the first tool call - // resolves it via roots/list (when the client supports roots) or cwd. This - // is the fix for issue #196 — clients that launch the server outside the - // project and don't pass a rootUri previously got a misleading "not - // initialized" error on every call. - if (explicitPath) { - this.initPromise = this.tryInitializeDefault(explicitPath).finally(() => { - this.initPromise = null; - }); + // Dead pid (or unreadable lockfile): clear it and retry. If we lose + // the next race to another candidate, that's fine — they'll be the + // new daemon and we'll proxy through them. + clearStaleDaemonLock(lock.pidPath); + await sleep(TAKEOVER_RETRY_DELAY_MS); } + + // Repeated failures — something is very wrong (perms?). Direct mode it is. + return 'fallback'; } - /** - * Handle tools/list request - */ - private async handleToolsList(request: JsonRpcRequest): Promise { - await this.retryInitIfNeeded(); - this.transport.sendResult(request.id, { - tools: this.toolHandler.getTools(), - }); + /** Standard SIGINT/SIGTERM handlers that route to our `stop()`. */ + private installSignalHandlers(): void { + process.on('SIGINT', () => this.stop()); + process.on('SIGTERM', () => this.stop()); } /** - * Handle tools/call request + * PPID watchdog. The daemon mode owns its own lifecycle (idle timeout + + * client refcount), so we deliberately do NOT enable the PPID watchdog + * there — otherwise the very first proxy that spawned the daemon would + * drag it down when it exited. Direct mode and proxy mode both enable it. 
*/ - private async handleToolsCall(request: JsonRpcRequest): Promise { - const params = request.params as { - name: string; - arguments?: Record; - }; - - if (!params || !params.name) { - this.transport.sendError( - request.id, - ErrorCodes.InvalidParams, - 'Missing tool name' - ); - return; - } - - const toolName = params.name; - const toolArgs = params.arguments || {}; - - // Validate tool exists - const tool = tools.find(t => t.name === toolName); - if (!tool) { - this.transport.sendError( - request.id, - ErrorCodes.InvalidParams, - `Unknown tool: ${toolName}` - ); - return; - } - - // If the default project isn't initialized yet, retry in case it was - // initialized after the MCP server started (e.g. user ran codegraph init) - await this.retryInitIfNeeded(); - - const result = await this.toolHandler.execute(toolName, toolArgs); - - this.transport.sendResult(request.id, result); + private installPpidWatchdog(): void { + if (this.mode === 'daemon') return; + if (this.mode === 'proxy') return; // proxy.ts installs its own. + const pollMs = parsePpidPollMs(process.env.CODEGRAPH_PPID_POLL_MS); + if (pollMs <= 0) return; + this.ppidWatchdog = setInterval(() => { + const current = process.ppid; + const ppidChanged = current !== this.originalPpid; + const hostGone = this.hostPpid !== null && !isProcessAlive(this.hostPpid); + if (ppidChanged || hostGone) { + const reason = ppidChanged + ? `ppid ${this.originalPpid} -> ${current}` + : `host pid ${this.hostPpid} exited`; + process.stderr.write( + `[CodeGraph MCP] Parent process exited (${reason}); shutting down.\n` + ); + this.stop(); + } + }, pollMs); + this.ppidWatchdog.unref(); } } +function sleep(ms: number): Promise { + // Deliberately NOT unref'd. During the daemon takeover retry loop we may + // be between processes — no socket bound yet, no transport, no listener + // pinning the event loop. An unref'd timer would let Node drain the loop + // and exit silently before we get a chance to try again. 
+ return new Promise((resolve) => { setTimeout(resolve, ms); }); +} + // Export for use in CLI export { StdioTransport } from './transport'; export { tools, ToolHandler } from './tools'; +// Surface a few daemon-mode bits for tests + diagnostics. +export { Daemon } from './daemon'; +export { CodeGraphPackageVersion } from './version'; diff --git a/src/mcp/proxy.ts b/src/mcp/proxy.ts new file mode 100644 index 000000000..d51153e2a --- /dev/null +++ b/src/mcp/proxy.ts @@ -0,0 +1,243 @@ +/** + * MCP proxy mode — issue #411. + * + * The proxy is a near-transparent stdio↔socket pipe. Once it has verified + * the daemon's hello line (same major.minor.patch as ours), it does no + * protocol parsing of its own: every byte the MCP host writes to the proxy's + * stdin goes straight to the daemon socket, and every byte the daemon emits + * goes straight to the host's stdout. Server-initiated JSON-RPC requests + * (e.g. `roots/list`) flow through the same pipe transparently. + * + * Lifecycle expectations: + * - The proxy exits when *either* stream closes (host stdin closed → + * daemon socket end, or daemon-side socket close → host stdout end). + * - Closing the socket on the proxy side is what tells the daemon to + * decrement its connected-clients refcount. + * - On a parent-process death we can't detect via stdin close (e.g. SIGKILL + * of the MCP host), the proxy's PPID watchdog catches it — same logic + * the direct-mode server uses; see issue #277. + */ + +import * as fs from 'fs'; +import * as net from 'net'; +import { HOST_PPID_ENV } from '../extraction/wasm-runtime-flags'; +import { DaemonHello, MAX_HELLO_LINE_BYTES } from './daemon'; +import { CodeGraphPackageVersion } from './version'; + +/** Default poll cadence for the PPID watchdog (same as the direct server). */ +const DEFAULT_PPID_POLL_MS = 5000; + +export interface ProxyResult { + /** + * `proxied` — successfully attached to a same-version daemon and piped + * stdio. 
The proxy stays alive until either end closes. + * `fallback-needed` — the daemon rejected us (version mismatch / unreachable + * socket) and the caller should run the server in direct mode. + */ + outcome: 'proxied' | 'fallback-needed'; + reason?: string; +} + +/** + * Attempt to connect to the daemon at `socketPath` and pipe stdio through it. + * + * Returns a promise that resolves when either: + * - the connection succeeded and one of stdin/socket has now closed + * (after which the process should exit), or + * - the connection failed early enough that the caller can still fall + * back to direct mode. + * + * The `expectedVersion` param defaults to the package's own version — daemon + * and proxy MUST match exactly. Mismatch resolves with + * `outcome: 'fallback-needed'` so the caller can transparently start its own + * server. (We accept the cost of two concurrent servers in this case as the + * price of never silently running a stale daemon against newer client code.) + */ +export async function runProxy( + socketPath: string, + expectedVersion: string = CodeGraphPackageVersion, +): Promise { + // POSIX: skip the connect when the socket file is missing. `fs.existsSync` + // is only a cheap pre-check — a stale file that exists but has no listener + // is caught by the ECONNREFUSED from the connect below.
+ if (process.platform !== 'win32' && !fs.existsSync(socketPath)) { + return { outcome: 'fallback-needed', reason: 'socket file missing' }; + } + + const socket = net.createConnection(socketPath); + socket.setEncoding('utf8'); + + const hello = await readHelloLine(socket).catch((err) => { + socket.destroy(); + return new Error(String(err)); + }); + if (hello instanceof Error) { + return { outcome: 'fallback-needed', reason: hello.message }; + } + + if (hello.codegraph !== expectedVersion) { + process.stderr.write( + `[CodeGraph MCP] Found a daemon on ${socketPath} but version (${hello.codegraph}) ` + + `differs from ours (${expectedVersion}); falling back to direct mode.\n` + ); + socket.destroy(); + return { outcome: 'fallback-needed', reason: 'version mismatch' }; + } + + process.stderr.write( + `[CodeGraph MCP] Attached to shared daemon on ${socketPath} (pid ${hello.pid}, v${hello.codegraph}).\n` + ); + + startPpidWatchdog(socket); + await pipeUntilClose(socket); + return { outcome: 'proxied' }; +} + +/** + * Read one CRLF/LF-terminated JSON line from the socket, parse it as the + * daemon hello, and return it. Bounded to {@link MAX_HELLO_LINE_BYTES} so a + * malicious or broken peer can't OOM us. Times out at 3s — a healthy daemon + * sends hello immediately on accept. + */ +function readHelloLine(socket: net.Socket): Promise { + return new Promise((resolve, reject) => { + let buffer = ''; + const cleanup = () => { + socket.removeListener('data', onData); + socket.removeListener('error', onError); + socket.removeListener('close', onClose); + clearTimeout(timer); + }; + const onData = (chunk: string | Buffer) => { + buffer += typeof chunk === 'string' ? 
chunk : chunk.toString('utf8'); + const idx = buffer.indexOf('\n'); + if (idx === -1) { + if (buffer.length > MAX_HELLO_LINE_BYTES) { + cleanup(); + reject(new Error('daemon hello line exceeded size limit')); + } + return; + } + const line = buffer.slice(0, idx); + // Re-emit anything past the newline so the pipe-stage sees it. + const tail = buffer.slice(idx + 1); + cleanup(); + if (tail.length > 0) { + // Push back via unshift — Node's net.Socket supports it on readable streams. + socket.unshift(tail); + } + try { + const parsed = JSON.parse(line) as DaemonHello; + if (typeof parsed.codegraph !== 'string' || typeof parsed.pid !== 'number') { + reject(new Error('daemon hello missing required fields')); + return; + } + resolve(parsed); + } catch (err) { + reject(new Error(`daemon hello not JSON: ${err instanceof Error ? err.message : String(err)}`)); + } + }; + const onError = (err: Error) => { cleanup(); reject(err); }; + const onClose = () => { cleanup(); reject(new Error('daemon closed connection before hello')); }; + const timer = setTimeout(() => { + cleanup(); + reject(new Error('timed out waiting for daemon hello')); + }, 3000); + timer.unref?.(); + socket.on('data', onData); + socket.on('error', onError); + socket.on('close', onClose); + }); +} + +/** + * Pipe stdin → socket and socket → stdout. Resolves once either end closes + * so the process can exit. The events are wired by hand instead of via + * `process.stdin.pipe(socket)`; every close path (stdin or socket) also + * resolves the promise. NOTE(review): stdin 'end' still calls `socket.end()`, + * as `pipe` would by default, so the socket does not outlive a stdin EOF.
+ */ +function pipeUntilClose(socket: net.Socket): Promise { + return new Promise((resolve) => { + let resolved = false; + const done = () => { if (!resolved) { resolved = true; resolve(); } }; + + process.stdin.on('data', (chunk) => { + try { socket.write(chunk); } catch { /* socket may have errored — close path catches it */ } + }); + process.stdin.on('end', () => { + try { socket.end(); } catch { /* ignore */ } + done(); + }); + process.stdin.on('close', () => { + try { socket.destroy(); } catch { /* ignore */ } + done(); + }); + + socket.on('data', (chunk) => { + try { process.stdout.write(chunk); } catch { /* ignore */ } + }); + socket.on('end', () => done()); + socket.on('close', () => done()); + socket.on('error', (err) => { + process.stderr.write(`[CodeGraph MCP] daemon socket error: ${err.message}\n`); + done(); + }); + }); +} + +/** + * PPID watchdog mirroring the one in `MCPServer.start` — kills the proxy if + * the MCP host (or its proxy of a host, see HOST_PPID_ENV) goes away without + * closing stdin. Issue #277 documents why we can't rely on stdin EOF on + * Linux: the parent may be SIGKILL'd and reparenting doesn't close pipes. + * + * The proxy's "kill" is just a socket close + process.exit — no SQLite or + * watchers to clean up, so this is cheap. + */ +function startPpidWatchdog(socket: net.Socket): void { + const pollMs = parsePollMs(process.env.CODEGRAPH_PPID_POLL_MS); + if (pollMs <= 0) return; + const originalPpid = process.ppid; + const hostPpid = parseHostPpid(process.env[HOST_PPID_ENV]); + const timer = setInterval(() => { + const current = process.ppid; + const ppidChanged = current !== originalPpid; + const hostGone = hostPpid !== null && !isProcessAliveLocal(hostPpid); + if (ppidChanged || hostGone) { + const reason = ppidChanged + ? 
`ppid ${originalPpid} -> ${current}` + : `host pid ${hostPpid} exited`; + process.stderr.write(`[CodeGraph MCP] proxy parent exited (${reason}); detaching.\n`); + try { socket.destroy(); } catch { /* ignore */ } + process.exit(0); + } + }, pollMs); + timer.unref?.(); +} + +function parsePollMs(raw: string | undefined): number { + if (raw === undefined || raw === '') return DEFAULT_PPID_POLL_MS; + const parsed = Number(raw); + if (!Number.isFinite(parsed)) return DEFAULT_PPID_POLL_MS; + if (parsed < 0) return DEFAULT_PPID_POLL_MS; + return Math.floor(parsed); +} + +function parseHostPpid(raw: string | undefined): number | null { + if (raw === undefined || raw === '') return null; + const parsed = Number(raw); + if (!Number.isInteger(parsed) || parsed <= 1) return null; + return parsed; +} + +function isProcessAliveLocal(pid: number): boolean { + try { + process.kill(pid, 0); + return true; + } catch (err: unknown) { + const e = err as NodeJS.ErrnoException; + if (e.code === 'EPERM') return true; + return false; + } +} diff --git a/src/mcp/session.ts b/src/mcp/session.ts new file mode 100644 index 000000000..4357bd8b1 --- /dev/null +++ b/src/mcp/session.ts @@ -0,0 +1,270 @@ +/** + * MCP per-connection session — speaks the JSON-RPC protocol (initialize, + * tools/list, tools/call) over a single {@link JsonRpcTransport}. It owns + * per-client state only (the launch-time `--path` hint, whether the client + * advertised `roots`, the one-shot roots/list latch); the heavyweight + * resources (CodeGraph, watcher, ToolHandler) live in the shared + * {@link MCPEngine} so daemon mode can collapse N inotify sets / DB handles + * to one. + * + * The state-machine itself mirrors what `MCPServer` used to do inline before + * issue #411 split it out — the same regression tests in + * `__tests__/mcp-initialize.test.ts` still drive this code path.
+ */ + +import * as path from 'path'; +import { JsonRpcRequest, JsonRpcNotification, JsonRpcTransport, ErrorCodes } from './transport'; +import { MCPEngine } from './engine'; +import { tools } from './tools'; +import { SERVER_INSTRUCTIONS } from './server-instructions'; + +/** + * MCP Server Info — kept on the session because some clients log it. + */ +const SERVER_INFO = { + name: 'codegraph', + version: '0.1.0', +}; + +/** MCP Protocol Version (latest the server claims). */ +const PROTOCOL_VERSION = '2024-11-05'; + +/** + * How long to wait for the client's `roots/list` response before giving up + * and falling back to the process cwd. + */ +const ROOTS_LIST_TIMEOUT_MS = 5000; + +/** + * Convert a file:// URI to a filesystem path. Handles URL encoding and + * Windows drive letter paths. + */ +function fileUriToPath(uri: string): string { + try { + const url = new URL(uri); + let filePath = decodeURIComponent(url.pathname); + if (process.platform === 'win32' && /^\/[a-zA-Z]:/.test(filePath)) { + filePath = filePath.slice(1); + } + return path.resolve(filePath); + } catch { + return uri.replace(/^file:\/\/\/?/, ''); + } +} + +/** First usable filesystem path from a `roots/list` result, or null. */ +function firstRootPath(result: unknown): string | null { + if (!result || typeof result !== 'object') return null; + const roots = (result as { roots?: unknown }).roots; + if (!Array.isArray(roots) || roots.length === 0) return null; + const first = roots[0] as { uri?: unknown }; + if (typeof first?.uri !== 'string') return null; + return fileUriToPath(first.uri); +} + +export interface MCPSessionOptions { + /** + * Explicit project path from the `--path` CLI flag. When set, the session + * will not bother asking the client for `roots/list` — we already know + * where the project lives. + */ + explicitProjectPath?: string | null; +} + +/** + * One MCP client's view of the server. Created fresh per stdio launch + * (direct mode) or per socket connection (daemon mode). 
+ */ +export class MCPSession { + private clientSupportsRoots = false; + private rootsAttempted = false; + private resolvePromise: Promise | null = null; + private explicitProjectPath: string | null; + + constructor( + private transport: JsonRpcTransport, + private engine: MCPEngine, + opts: MCPSessionOptions = {}, + ) { + this.explicitProjectPath = opts.explicitProjectPath ?? null; + } + + /** + * Start handling messages from the transport. Returns immediately — the + * session lives for as long as the transport is open. + */ + start(): void { + this.transport.start(this.handleMessage.bind(this)); + } + + /** + * Tear down the session. Does NOT touch the engine (the engine may serve + * other sessions) or call `process.exit` (the daemon decides when to exit). + */ + stop(): void { + this.transport.stop(); + } + + /** Underlying transport — exposed for daemon-side close hooks. */ + getTransport(): JsonRpcTransport { + return this.transport; + } + + private async handleMessage(message: JsonRpcRequest | JsonRpcNotification): Promise { + const isRequest = 'id' in message; + switch (message.method) { + case 'initialize': + if (isRequest) await this.handleInitialize(message as JsonRpcRequest); + break; + case 'initialized': + // Notification that client has finished initialization — no action needed. 
+ break; + case 'tools/list': + if (isRequest) await this.handleToolsList(message as JsonRpcRequest); + break; + case 'tools/call': + if (isRequest) await this.handleToolsCall(message as JsonRpcRequest); + break; + case 'ping': + if (isRequest) this.transport.sendResult((message as JsonRpcRequest).id, {}); + break; + default: + if (isRequest) { + this.transport.sendError( + (message as JsonRpcRequest).id, + ErrorCodes.MethodNotFound, + `Method not found: ${message.method}`, + ); + } + } + } + + private async handleInitialize(request: JsonRpcRequest): Promise { + const params = request.params as { + rootUri?: string; + workspaceFolders?: Array<{ uri: string; name: string }>; + capabilities?: { roots?: unknown }; + } | undefined; + + this.clientSupportsRoots = !!params?.capabilities?.roots; + + // Explicit project signal, strongest first: client-provided rootUri / + // workspaceFolders (LSP-style), else the --path the server was launched + // with. cwd is NOT used here — we defer it so a roots/list answer can + // win over it. See issue #196. + let explicitPath: string | null = null; + if (params?.rootUri) { + explicitPath = fileUriToPath(params.rootUri); + } else if (params?.workspaceFolders?.[0]?.uri) { + explicitPath = fileUriToPath(params.workspaceFolders[0].uri); + } else if (this.explicitProjectPath) { + explicitPath = this.explicitProjectPath; + } + + // Respond to the handshake BEFORE doing any heavy init — see issue #172. + this.transport.sendResult(request.id, { + protocolVersion: PROTOCOL_VERSION, + capabilities: { tools: {} }, + serverInfo: SERVER_INFO, + instructions: SERVER_INSTRUCTIONS, + }); + + if (explicitPath) { + // Kick off engine init in the background. If another session in the + // same daemon already opened the project, `ensureInitialized` is a + // ~free no-op — N concurrent clients pay exactly one open. 
+ this.resolvePromise = this.engine.ensureInitialized(explicitPath); + } + } + + private async handleToolsList(request: JsonRpcRequest): Promise { + await this.retryInitIfNeeded(); + this.transport.sendResult(request.id, { + tools: this.engine.getToolHandler().getTools(), + }); + } + + private async handleToolsCall(request: JsonRpcRequest): Promise { + const params = request.params as { + name: string; + arguments?: Record; + }; + + if (!params || !params.name) { + this.transport.sendError(request.id, ErrorCodes.InvalidParams, 'Missing tool name'); + return; + } + + const toolName = params.name; + const toolArgs = params.arguments || {}; + + const tool = tools.find((t) => t.name === toolName); + if (!tool) { + this.transport.sendError( + request.id, + ErrorCodes.InvalidParams, + `Unknown tool: ${toolName}`, + ); + return; + } + + await this.retryInitIfNeeded(); + + const result = await this.engine.getToolHandler().execute(toolName, toolArgs); + this.transport.sendResult(request.id, result); + } + + /** + * Lazy default-project resolution. Three layers: + * 1. await the in-flight init kicked off from `handleInitialize` (if any); + * 2. if still uninitialized and we never asked the client for its roots, + * do so now (one-shot); fall back to cwd if the client lacks roots; + * 3. last-resort: re-walk from the best candidate — picks up projects + * that were `codegraph init`'d *after* the server started. + */ + private async retryInitIfNeeded(): Promise { + if (this.resolvePromise) { + try { await this.resolvePromise; } catch { /* fall through to retry */ } + this.resolvePromise = null; + } + + if (this.engine.hasDefaultCodeGraph()) return; + + const hint = this.explicitProjectPath ?? this.engine.getProjectPath(); + if (!hint && !this.rootsAttempted) { + this.rootsAttempted = true; + this.resolvePromise = this.clientSupportsRoots + ? 
this.initFromRoots() + : this.engine.ensureInitialized(process.cwd()); + try { await this.resolvePromise; } catch { /* fall through */ } + this.resolvePromise = null; + if (this.engine.hasDefaultCodeGraph()) return; + } + + // Last resort: walk from the best candidate (sync open). Picks up + // projects that appeared after the server started. + const candidate = hint ?? process.cwd(); + this.engine.retryInitializeSync(candidate); + } + + /** + * Ask the client for its workspace root via `roots/list` and open the + * first one. Falls back to `process.cwd()` on timeout or empty answer. + */ + private async initFromRoots(): Promise { + let target = process.cwd(); + try { + const result = await this.transport.request('roots/list', undefined, ROOTS_LIST_TIMEOUT_MS); + const rootPath = firstRootPath(result); + if (rootPath) { + target = rootPath; + } else { + process.stderr.write('[CodeGraph MCP] Client returned no workspace roots; falling back to process cwd.\n'); + } + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + process.stderr.write(`[CodeGraph MCP] roots/list request failed (${msg}); falling back to process cwd.\n`); + } + await this.engine.ensureInitialized(target); + } +} diff --git a/src/mcp/transport.ts b/src/mcp/transport.ts index 2638600d6..ff81b0d07 100644 --- a/src/mcp/transport.ts +++ b/src/mcp/transport.ts @@ -1,10 +1,22 @@ /** - * MCP Stdio Transport + * MCP JSON-RPC Transports * - * Handles JSON-RPC 2.0 communication over stdin/stdout for MCP protocol. + * Two flavors share the same wire format (newline-delimited JSON-RPC 2.0): + * + * - `StdioTransport` — original transport; reads/writes the process's + * stdin/stdout. Used by direct-mode MCP servers. + * - `SocketTransport` — wraps a single `net.Socket`. Used by the shared-daemon + * architecture (see {@link ./daemon}) to multiplex multiple MCP clients onto + * one CodeGraph instance via per-connection sessions. 
+ * + * Both implement {@link JsonRpcTransport} so the session-level protocol logic + * (initialize / tools/list / tools/call, plus server-initiated `roots/list`) + * is identical regardless of where the bytes come from. */ import * as readline from 'readline'; +import type { Readable, Writable } from 'stream'; +import type { Socket } from 'net'; /** * JSON-RPC 2.0 Request @@ -56,56 +68,41 @@ export const ErrorCodes = { export type MessageHandler = (message: JsonRpcRequest | JsonRpcNotification) => Promise; /** - * Stdio Transport for MCP - * - * Reads JSON-RPC messages from stdin and writes responses to stdout. + * Generic JSON-RPC transport interface — common surface for stdio and socket + * carriers. Anything below the session layer (initialize, tool dispatch, etc.) + * talks to this, not to a concrete transport class. */ -export class StdioTransport { - private rl: readline.Interface | null = null; - private messageHandler: MessageHandler | null = null; +export interface JsonRpcTransport { + start(handler: MessageHandler): void; + stop(): void; + send(response: JsonRpcResponse): void; + notify(method: string, params?: unknown): void; + request(method: string, params?: unknown, timeoutMs?: number): Promise; + sendResult(id: string | number, result: unknown): void; + sendError(id: string | number | null, code: number, message: string, data?: unknown): void; +} + +/** + * Shared implementation of newline-delimited JSON-RPC 2.0 over any + * `Readable`/`Writable` stream pair. Stdio and socket transports both wrap + * this — the only difference between them is which streams get plugged in + * and how a "close" propagates back to the owning code. + */ +abstract class LineBasedJsonRpcTransport implements JsonRpcTransport { + protected messageHandler: MessageHandler | null = null; // Outstanding server-initiated requests (e.g. roots/list), keyed by the id // we sent. Responses from the client are matched back here. 
- private pending = new Map void; reject: (error: Error) => void; }>(); - private nextRequestId = 1; - - /** - * Start listening for messages on stdin - */ - start(handler: MessageHandler): void { - this.messageHandler = handler; - - this.rl = readline.createInterface({ - input: process.stdin, - output: process.stdout, - terminal: false, - }); + protected nextRequestId = 1; + protected stopped = false; - this.rl.on('line', async (line) => { - await this.handleLine(line); - }); - - this.rl.on('close', () => { - process.exit(0); - }); - } - - /** - * Stop listening - */ - stop(): void { - // Fail any in-flight server-initiated requests so their awaiters don't hang. - for (const { reject } of this.pending.values()) { - reject(new Error('Transport stopped')); - } - this.pending.clear(); - if (this.rl) { - this.rl.close(); - this.rl = null; - } - } + abstract start(handler: MessageHandler): void; + protected abstract write(line: string): void; + protected abstract idPrefix(): string; + abstract stop(): void; /** * Send a server-initiated request to the client and await its response. @@ -116,7 +113,7 @@ export class StdioTransport { * on timeout so callers can fall back rather than hang forever. 
*/ request(method: string, params?: unknown, timeoutMs = 5000): Promise { - const id = `cg-srv-${this.nextRequestId++}`; + const id = `${this.idPrefix()}-${this.nextRequestId++}`; return new Promise((resolve, reject) => { const timer = setTimeout(() => { this.pending.delete(id); @@ -128,56 +125,42 @@ export class StdioTransport { resolve: (value) => { clearTimeout(timer); resolve(value); }, reject: (error) => { clearTimeout(timer); reject(error); }, }); - process.stdout.write(JSON.stringify({ jsonrpc: '2.0', id, method, params }) + '\n'); + this.write(JSON.stringify({ jsonrpc: '2.0', id, method, params })); }); } - /** - * Send a response - */ send(response: JsonRpcResponse): void { - const json = JSON.stringify(response); - process.stdout.write(json + '\n'); + this.write(JSON.stringify(response)); } - /** - * Send a notification (no id) - */ notify(method: string, params?: unknown): void { - const notification: JsonRpcNotification = { - jsonrpc: '2.0', - method, - params, - }; - process.stdout.write(JSON.stringify(notification) + '\n'); + const notification: JsonRpcNotification = { jsonrpc: '2.0', method, params }; + this.write(JSON.stringify(notification)); } - /** - * Send a success response - */ sendResult(id: string | number, result: unknown): void { - this.send({ - jsonrpc: '2.0', - id, - result, - }); + this.send({ jsonrpc: '2.0', id, result }); + } + + sendError(id: string | number | null, code: number, message: string, data?: unknown): void { + this.send({ jsonrpc: '2.0', id, error: { code, message, data } }); } /** - * Send an error response + * Fail any in-flight server-initiated requests so their awaiters don't hang. + * Called from `stop()` in subclasses. 
*/ - sendError(id: string | number | null, code: number, message: string, data?: unknown): void { - this.send({ - jsonrpc: '2.0', - id, - error: { code, message, data }, - }); + protected rejectPending(reason: string): void { + for (const { reject } of this.pending.values()) { + reject(new Error(reason)); + } + this.pending.clear(); } /** - * Handle an incoming line of JSON + * Handle an incoming line of JSON. Both transports feed lines here. */ - private async handleLine(line: string): Promise { + protected async handleLine(line: string): Promise { const trimmed = line.trim(); if (!trimmed) return; @@ -254,3 +237,218 @@ export class StdioTransport { return true; } } + +export interface StdioTransportOptions { + /** + * If true, the transport calls `process.exit(0)` when stdin closes. Set to + * `false` in shared-daemon mode where the stdio "session" is just *one* of + * many clients — losing it shouldn't drag the daemon down. The default + * (true) matches the original single-process behavior callers rely on. + */ + exitOnClose?: boolean; + /** + * Optional callback fired when the stdin stream closes. The daemon uses + * this to decrement its connected-clients refcount. + */ + onClose?: () => void; +} + +/** + * Stdio Transport for MCP + * + * Reads JSON-RPC messages from stdin and writes responses to stdout. Used by + * the direct (single-process) MCP server path, where the MCP host launches + * one server per session and talks to it over the child's stdio. Also used by + * shared-daemon mode for the launcher's session (with `exitOnClose: false`) + * so the daemon outlives its launcher. + */ +export class StdioTransport extends LineBasedJsonRpcTransport { + private rl: readline.Interface | null = null; + private opts: Required; + + constructor(opts: StdioTransportOptions = {}) { + super(); + this.opts = { + exitOnClose: opts.exitOnClose ?? true, + onClose: opts.onClose ?? 
(() => { /* no-op */ }), + }; + } + + start(handler: MessageHandler): void { + this.messageHandler = handler; + + this.rl = readline.createInterface({ + input: process.stdin, + output: process.stdout, + terminal: false, + }); + + this.rl.on('line', async (line) => { + await this.handleLine(line); + }); + + this.rl.on('close', () => { + this.opts.onClose(); + if (this.opts.exitOnClose) { + process.exit(0); + } + }); + } + + stop(): void { + if (this.stopped) return; + this.stopped = true; + this.rejectPending('Transport stopped'); + if (this.rl) { + this.rl.close(); + this.rl = null; + } + } + + protected write(line: string): void { + process.stdout.write(line + '\n'); + } + + protected idPrefix(): string { + return 'cg-srv'; + } +} + +/** + * Socket Transport for MCP daemon sessions. + * + * Wraps a single `net.Socket` (Unix domain socket on POSIX, named pipe on + * Windows). One instance per connected MCP client. Unlike {@link StdioTransport}, + * `stop()` and stream-close *don't* call `process.exit` — a daemon-side session + * ending must not bring down the whole daemon. + */ +export class SocketTransport extends LineBasedJsonRpcTransport { + private buffer = ''; + private closeHandlers: Array<() => void> = []; + + constructor(private socket: Socket, private prefix: string = 'cg-sock') { + super(); + } + + /** + * Register a callback fired exactly once when the socket closes (from either + * side). Used by the daemon to decrement its connected-clients refcount. + */ + onClose(handler: () => void): void { + this.closeHandlers.push(handler); + } + + start(handler: MessageHandler): void { + this.messageHandler = handler; + + this.socket.setEncoding('utf8'); + this.socket.on('data', (chunk: string) => { + this.buffer += chunk; + let idx; + // Drain every complete line; tail-fragment stays in the buffer for the + // next chunk. 
The handler is async but we don't await it here — JSON-RPC + // permits out-of-order responses, and serializing here would deadlock if + // a handler issued a server-initiated request that needed a *later* line + // to arrive (e.g. roots/list mid-tools-call). + while ((idx = this.buffer.indexOf('\n')) !== -1) { + const line = this.buffer.slice(0, idx); + this.buffer = this.buffer.slice(idx + 1); + void this.handleLine(line); + } + }); + + this.socket.on('close', () => this.handleSocketClose()); + this.socket.on('error', (err) => { + // Don't crash the daemon over a broken pipe; just shut this connection. + process.stderr.write(`[CodeGraph daemon] socket error: ${err.message}\n`); + this.handleSocketClose(); + }); + } + + stop(): void { + if (this.stopped) return; + this.stopped = true; + this.rejectPending('Transport stopped'); + if (!this.socket.destroyed) { + this.socket.end(); + this.socket.destroy(); + } + } + + /** + * Write a one-shot line directly to the socket (no JSON-RPC framing applied + * by this class — caller produces the line). The daemon uses this for the + * hello/handshake line that precedes the JSON-RPC stream. + */ + writeRaw(line: string): void { + if (!this.socket.destroyed) { + this.socket.write(line.endsWith('\n') ? line : line + '\n'); + } + } + + protected write(line: string): void { + if (!this.socket.destroyed) { + this.socket.write(line + '\n'); + } + } + + protected idPrefix(): string { + return this.prefix; + } + + private handleSocketClose(): void { + if (this.stopped) return; + this.stopped = true; + this.rejectPending('Socket closed'); + for (const h of this.closeHandlers) { + try { h(); } catch { /* never let a close-handler take the daemon down */ } + } + this.closeHandlers = []; + } +} + +/** + * Adapter that lets the proxy mode reuse {@link LineBasedJsonRpcTransport}'s + * line buffering for arbitrary `Readable`/`Writable` pairs. 
Not currently used + * by sessions — kept here for symmetry with the existing stdio/socket pair if + * future work needs a third carrier. + */ +export class StreamPairTransport extends LineBasedJsonRpcTransport { + private buffer = ''; + + constructor( + private input: Readable, + private output: Writable, + private prefix: string = 'cg-stream', + ) { + super(); + } + + start(handler: MessageHandler): void { + this.messageHandler = handler; + this.input.setEncoding?.('utf8'); + this.input.on('data', (chunk: string | Buffer) => { + this.buffer += typeof chunk === 'string' ? chunk : chunk.toString('utf8'); + let idx; + while ((idx = this.buffer.indexOf('\n')) !== -1) { + const line = this.buffer.slice(0, idx); + this.buffer = this.buffer.slice(idx + 1); + void this.handleLine(line); + } + }); + } + + stop(): void { + if (this.stopped) return; + this.stopped = true; + this.rejectPending('Transport stopped'); + } + + protected write(line: string): void { + this.output.write(line + '\n'); + } + + protected idPrefix(): string { + return this.prefix; + } +} diff --git a/src/mcp/version.ts b/src/mcp/version.ts new file mode 100644 index 000000000..cef1b7834 --- /dev/null +++ b/src/mcp/version.ts @@ -0,0 +1,36 @@ +/** + * Resolved package version, computed once at module load. + * + * The version string is the rendezvous datum between cooperating daemon and + * proxy processes: the daemon advertises its version in the hello line, and + * the proxy refuses to share IPC across a mismatch (falls back to direct + * mode). Keeping the resolution in one place avoids drift between the CLI + * `--version` output (which reads `package.json` directly) and the daemon + * handshake. + * + * Resolution strategy: read the bundled `package.json` two levels up from + * this file — same relative position whether we're loaded from `src/mcp/` or + * the `dist/mcp/` output, since `tsc` preserves the layout. If reading fails + * (e.g. 
the package was unpacked oddly), fall back to "0.0.0-unknown" — a + * sentinel that will never match a real version, so the proxy harmlessly + * falls back to direct mode. + */ + +import * as fs from 'fs'; +import * as path from 'path'; + +function readPackageVersion(): string { + try { + const pkgPath = path.join(__dirname, '..', '..', 'package.json'); + const raw = fs.readFileSync(pkgPath, 'utf8'); + const parsed = JSON.parse(raw); + if (typeof parsed?.version === 'string' && parsed.version.length > 0) { + return parsed.version; + } + } catch { + // Fall through to sentinel. + } + return '0.0.0-unknown'; +} + +export const CodeGraphPackageVersion = readPackageVersion(); From 55798b405ca3cbcd1ac963610975f9fa79426a4c Mon Sep 17 00:00:00 2001 From: Colby McHenry Date: Mon, 25 May 2026 19:45:26 -0500 Subject: [PATCH 2/2] fix(mcp): detached daemon + atomic lock to harden shared MCP daemon (#411) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review fixes on top of the shared-daemon PR (#419): - Detach the daemon into its own session/process group; every `serve --mcp` is now a thin proxy to it. The in-process daemon shared the first launcher's process group (closing that terminal severed every other client) and disabled the PPID watchdog, orphaning the daemon on host SIGKILL — regressing #277. Now a proxy carries the watchdog; the detached daemon reaps via client-refcount + idle timeout. - Write the lockfile record atomically on the O_EXCL create and pid-verify before clearing a stale lock, closing the concurrent-startup race where a racing candidate could delete the winner's lock and spawn a second daemon (two watchers / two writers) — the exact multi-agent case the feature targets. - Canonicalize the project root with realpath so clients converge on one socket regardless of a symlinked cwd / rootUri. 
- Proxy exits when the pipe closes (its stdin listener otherwise pinned the loop); align the proxy watchdog message with direct mode; drop the dead StreamPairTransport; serverInfo.version now tracks the package version. - Tests: rewrite the daemon suite for the detached architecture; add concurrent-startup-race and first-client-death-survival regressions; pin the #172 and #277 tests to direct mode (their in-process contracts). Validated on macOS, Linux (Docker), and Windows (named pipes): full suite green; daemon survives the launcher; concurrent launchers converge on one daemon; 3 agents share one inotify watch set (3x fewer watches). A/B confirms daemon mode returns byte-identical tool output to direct mode with no latency regression. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 37 +-- __tests__/mcp-daemon.test.ts | 395 ++++++++++++++++------------ __tests__/mcp-initialize.test.ts | 8 + __tests__/mcp-ppid-watchdog.test.ts | 7 +- src/mcp/daemon.ts | 194 +++++++------- src/mcp/index.ts | 278 +++++++++++++------- src/mcp/proxy.ts | 7 +- src/mcp/session.ts | 6 +- src/mcp/transport.ts | 47 ---- 9 files changed, 561 insertions(+), 418 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2cd6f67fa..e98934351 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,21 +11,28 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ### Added - **Shared MCP daemon — running multiple AI agents in the same project no - longer multiplies the inotify, SQLite, and indexing cost.** The first - `codegraph serve --mcp` per project becomes a per-project daemon listening - on `.codegraph/daemon.sock` (named pipe on Windows). Subsequent invocations - for the same project attach as thin stdio↔socket proxies — one file - watcher, one SQLite connection, one tree-sitter warm-up no matter how many - Claude Code / Cursor / Codex / opencode sessions you point at the repo. 
- Two concurrent sessions on a large monorepo used to consume ~880k of the - Linux 1,048,576 per-user inotify budget; they now share ~440k. The daemon - lingers for `CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS` (default 300s) after the - last client disconnects so back-to-back sessions don't repay startup cost. - Resolves issue #411. -- **`CODEGRAPH_NO_DAEMON=1` — opt out of the shared daemon.** Restores the - pre-issue-#411 behavior of one independent server process per client. - Useful for debugging or for environments that don't permit local - IPC sockets. + longer multiplies the file-watch, SQLite, and indexing cost.** Point more + than one `codegraph serve --mcp` at a project (two Claude Code windows, an + agent in a git worktree, `/loop` alongside an interactive session, parallel + sub-agents) and they now share **one** background daemon per project: a + single file watcher (one inotify set on Linux), one SQLite connection, and + one tree-sitter warm-up — instead of N independent copies. Measured on Linux: + three agents register **~3× fewer inotify watches** sharing one watcher + versus three standalone servers. Resolves issue #411. (Composable with the + per-watcher pruning in #276/#346 — that shrinks each watch set; this shares + one across agents.) +- The daemon runs as a **detached background process** that outlives any single + session, so closing one editor or terminal never severs the others. Each + `serve --mcp` your agent host launches is a thin stdio↔socket proxy to it (a + Unix-domain socket, or a named pipe on Windows). When the last client + disconnects the daemon lingers for `CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS` + (default `300000`) so back-to-back sessions skip the startup cost, then exits + and removes its lockfile — an OOM-killed or force-quit host can't leak it. +- **`CODEGRAPH_NO_DAEMON=1`** opts out, restoring one independent server per + client (handy for debugging or sandboxes that disallow local IPC sockets). 
+ The daemon is also version-pinned: after you upgrade codegraph, sessions + already attached to the old daemon keep using it while new sessions run + standalone until it idles out — they never mix versions over the socket. ## [0.9.5] - 2026-05-25 diff --git a/__tests__/mcp-daemon.test.ts b/__tests__/mcp-daemon.test.ts index c8019b786..fa1558f02 100644 --- a/__tests__/mcp-daemon.test.ts +++ b/__tests__/mcp-daemon.test.ts @@ -1,18 +1,35 @@ /** * Shared MCP daemon — issue #411. * - * Validates the contract added in `src/mcp/{daemon,proxy,session}.ts`: - * - Two `serve --mcp` invocations against the same project share *one* - * daemon process; the second invocation attaches as a proxy. - * - A stale lockfile (PID gone, no socket) gets cleared so the next - * invocation can become the new daemon. - * - `CODEGRAPH_NO_DAEMON=1` opts out — both processes run independently. - * - The proxy refuses to attach across a version mismatch. + * Validates the daemon architecture in `src/mcp/{daemon,proxy,session,index}.ts` + * AFTER the review fixes: + * + * - The daemon is a *detached* background process; every `serve --mcp` + * invocation is a thin proxy to it. Two invocations against one project + * share ONE daemon. + * - Concurrent launchers converge on a single daemon (the must-fix-1 + * lockfile-race: an empty-pidfile window used to let a racing candidate + * delete the winner's lock → two daemons). + * - Killing the launcher that spawned the daemon does NOT take the daemon + * down — other attached clients keep working (the must-fix-2 detach: the + * in-process daemon used to die with its launcher's process group and + * orphan on host SIGKILL, regressing #277). + * - A stale lockfile (dead pid) is cleared; `CODEGRAPH_NO_DAEMON=1` opts out; + * the proxy refuses to attach across a version mismatch; the daemon + * idle-times-out after the last client leaves (so a single session can't + * leak a daemon forever). 
* * These tests intentionally spawn real `node dist/bin/codegraph.js` processes - * over real sockets — the same surface a Claude Code / Cursor / Codex install - * would exercise. Idle timeouts are forced short via - * `CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS` to keep the suite fast. + * over real sockets/pipes — the same surface a Claude Code / Cursor / Codex + * install exercises. The daemon logs to `.codegraph/daemon.log` (it has no + * client stderr of its own), so daemon-side assertions read that file. + * + * `realRoot` vs `tempDir`: processes are spawned with the (possibly symlinked) + * `tempDir` as cwd/rootUri — on macOS `os.tmpdir()` lives under `/var`, a + * symlink to `/private/var`, and a spawned child's `process.cwd()` is already + * realpath'd. The daemon canonicalizes the root with `realpathSync`, so all + * path assertions use `realRoot` (the canonical form). That this matches end to + * end is itself the proof the canonicalization works. */ import { afterEach, beforeEach, describe, expect, it } from 'vitest'; @@ -21,6 +38,7 @@ import * as fs from 'fs'; import * as os from 'os'; import * as path from 'path'; import { CodeGraph } from '../src'; +import { getDaemonSocketPath } from '../src/mcp/daemon-paths'; const BIN = path.resolve(__dirname, '../dist/bin/codegraph.js'); @@ -28,10 +46,6 @@ interface SpawnedServer { child: ChildProcessWithoutNullStreams; stdout: string[]; stderr: string[]; - // Resolves once the child has emitted at least one stderr line — gives us a - // stable signal that the process is past the `relaunchWithWasmRuntimeFlagsIfNeeded` - // re-exec dance. 
- spawnedSettled: Promise; } function spawnServer(cwd: string, env: NodeJS.ProcessEnv = {}): SpawnedServer { @@ -40,12 +54,14 @@ function spawnServer(cwd: string, env: NodeJS.ProcessEnv = {}): SpawnedServer { stdio: ['pipe', 'pipe', 'pipe'], env: { ...process.env, ...env }, }) as ChildProcessWithoutNullStreams; + // Swallow spawn/EPIPE errors so killing a child mid-write can't surface as an + // unhandled error that crashes the vitest worker. + child.on('error', () => { /* ignore */ }); + child.stdin.on('error', () => { /* ignore */ }); const stdout: string[] = []; const stderr: string[] = []; let stdoutBuf = ''; let stderrBuf = ''; - let firstStderrResolve!: () => void; - const spawnedSettled = new Promise((resolve) => { firstStderrResolve = resolve; }); child.stdout.on('data', (chunk: Buffer) => { stdoutBuf += chunk.toString('utf8'); let idx: number; @@ -61,211 +77,264 @@ function spawnServer(cwd: string, env: NodeJS.ProcessEnv = {}): SpawnedServer { stderr.push(stderrBuf.slice(0, idx)); stderrBuf = stderrBuf.slice(idx + 1); } - firstStderrResolve(); }); - return { child, stdout, stderr, spawnedSettled }; + return { child, stdout, stderr }; } -function sendInitialize(child: ChildProcessWithoutNullStreams, rootUri: string, id: number = 0) { - const msg = JSON.stringify({ +function sendMessage(child: ChildProcessWithoutNullStreams, msg: unknown): void { + try { child.stdin.write(JSON.stringify(msg) + '\n'); } catch { /* child may be gone */ } +} + +function sendInitialize(child: ChildProcessWithoutNullStreams, rootUri: string, id: number): void { + sendMessage(child, { jsonrpc: '2.0', id, method: 'initialize', params: { - protocolVersion: '2025-11-25', + protocolVersion: '2024-11-05', capabilities: {}, clientInfo: { name: 'test', version: '0.0.0' }, rootUri, }, }); - child.stdin.write(msg + '\n'); +} + +/** Find a JSON-RPC response with the given id (result OR error) on stdout. 
*/ +function findResponse(stdout: string[], id: number): any | null { + for (const line of stdout) { + if (!line.trim()) continue; + try { + const parsed = JSON.parse(line); + if (parsed && parsed.id === id && (parsed.result !== undefined || parsed.error !== undefined)) { + return parsed; + } + } catch { /* not JSON */ } + } + return null; } function waitFor<T>( predicate: () => T | undefined | null | false, timeoutMs: number, - pollMs: number = 25, + pollMs = 25, ): Promise<T> { return new Promise((resolve, reject) => { const started = Date.now(); const tick = () => { - const v = predicate(); + let v: T | undefined | null | false; + try { v = predicate(); } catch (e) { return reject(e); } if (v) return resolve(v as T); - if (Date.now() - started > timeoutMs) { - return reject(new Error(`Timed out after ${timeoutMs}ms`)); - } + if (Date.now() - started > timeoutMs) return reject(new Error(`Timed out after ${timeoutMs}ms`)); setTimeout(tick, pollMs); }; tick(); }); } -function findInitializeResponse(stdout: string[], id: number) { - for (const line of stdout) { - if (!line.trim()) continue; - try { - const parsed = JSON.parse(line); - if (parsed.id === id && parsed.result?.serverInfo) return parsed; - } catch { /* not JSON */ } - } - return null; +function isAlive(pid: number): boolean { + try { process.kill(pid, 0); return true; } catch { return false; } +} + +function readLockPid(root: string): number | null { + try { + const raw = fs.readFileSync(path.join(root, '.codegraph', 'daemon.pid'), 'utf8'); + const info = JSON.parse(raw); + return typeof info.pid === 'number' ?
info.pid : null; + } catch { return null; } +} + +function readDaemonLog(root: string): string { + try { return fs.readFileSync(path.join(root, '.codegraph', 'daemon.log'), 'utf8'); } + catch { return ''; } } -function killTree(...procs: ChildProcessWithoutNullStreams[]) { +function countListeningLines(root: string): number { + return readDaemonLog(root).split('\n').filter((l) => l.includes('[CodeGraph daemon] Listening on')).length; +} + +function killTree(...procs: ChildProcessWithoutNullStreams[]): void { for (const p of procs) { - if (!p.killed) { - try { p.kill('SIGKILL'); } catch { /* already gone */ } - } + if (!p.killed) { try { p.kill('SIGKILL'); } catch { /* gone */ } } } } -async function waitProcessExit(child: ChildProcessWithoutNullStreams, timeoutMs: number): Promise<boolean> { - return new Promise((resolve) => { - if (child.exitCode !== null || child.signalCode !== null) return resolve(true); - const timer = setTimeout(() => resolve(false), timeoutMs); - child.once('exit', () => { clearTimeout(timer); resolve(true); }); - }); +async function waitProcessExit(pid: number, timeoutMs: number): Promise<boolean> { + return waitFor(() => !isAlive(pid), timeoutMs).then(() => true).catch(() => false); } describe('Shared MCP daemon (issue #411)', () => { - let tempDir: string; + let tempDir: string; // the (possibly symlinked) path processes are spawned with + let realRoot: string; // its canonical form — what the daemon keys paths on const servers: SpawnedServer[] = []; beforeEach(async () => { tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-mcp-daemon-')); - // Initialize a real CodeGraph project — the daemon needs `.codegraph/` to - // know where to put its socket + pidfile. `CodeGraph.init` writes the SQL - // schema synchronously, so by the time we spawn the server it's ready.
const cg = await CodeGraph.init(tempDir); cg.close(); + realRoot = fs.realpathSync(tempDir); }); afterEach(async () => { killTree(...servers.map((s) => s.child)); - // Give the OS a moment to reap and remove socket files before rmSync. - await new Promise((resolve) => setTimeout(resolve, 50)); + // The daemon is detached (not a tracked child) — reap it explicitly via the + // pid it recorded, so a test can't leak a background daemon. Guard against + // our own pid: the version-mismatch test plants `pid: process.pid` in the + // lockfile, and we must never SIGKILL the vitest worker. + const daemonPid = readLockPid(realRoot); + if (daemonPid && daemonPid !== process.pid && isAlive(daemonPid)) { + try { process.kill(daemonPid, 'SIGKILL'); } catch { /* race */ } + } + await new Promise((r) => setTimeout(r, 50)); servers.length = 0; fs.rmSync(tempDir, { recursive: true, force: true }); }); - it('second invocation attaches as a proxy to the first', async () => { - // Short idle so the suite doesn't have to wait the production 5 minutes - // if anything leaks — but long enough that the second client's lifetime - // overlaps with the daemon's. - const env = { CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS: '5000' }; + it('two invocations share ONE detached daemon; both attach as proxies', async () => { + const env = { CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS: '15000' }; const first = spawnServer(tempDir, env); servers.push(first); sendInitialize(first.child, `file://${tempDir}`, 1); - const firstResponse = await waitFor( - () => findInitializeResponse(first.stdout, 1), - 8000, - ); - expect(firstResponse.result.serverInfo.name).toBe('codegraph'); - - // Daemon should be advertising itself on stderr — proves the daemon path - // ran, not the direct-mode fallback. - expect(first.stderr.some((l) => l.includes('[CodeGraph daemon] Listening on'))).toBe(true); - - // Lockfile + socket exist. 
- const pidPath = path.join(tempDir, '.codegraph', 'daemon.pid'); - const sockPath = path.join(tempDir, '.codegraph', 'daemon.sock'); - expect(fs.existsSync(pidPath)).toBe(true); - // On POSIX the socket lives at the in-project path unless its absolute - // path exceeded the limit — `os.tmpdir()`-based fallback is rare for - // mkdtemp paths. - expect(fs.existsSync(sockPath)).toBe(true); - - // Second server in the same project should attach as a proxy. + const firstResp = await waitFor(() => findResponse(first.stdout, 1), 10000); + expect(firstResp.result.serverInfo.name).toBe('codegraph'); + + // The launcher is a PROXY (not the daemon itself) — that's the detach fix. + await waitFor(() => first.stderr.some((l) => l.includes('Attached to shared daemon')), 8000); + + // A detached daemon came up and recorded itself. + await waitFor(() => fs.existsSync(path.join(realRoot, '.codegraph', 'daemon.pid')), 8000); + await waitFor(() => countListeningLines(realRoot) >= 1, 8000); + const daemonPid = readLockPid(realRoot); + expect(daemonPid).toBeTruthy(); + expect(isAlive(daemonPid!)).toBe(true); + // The socket exists at the path the code computes from the canonical root. + // On Windows the daemon listens on a named pipe (\\.\pipe\...), which isn't + // a filesystem entry — existsSync doesn't apply there, and the "Attached to + // shared daemon" proof above already confirms the proxy reached it. + if (process.platform !== 'win32') { + expect(fs.existsSync(getDaemonSocketPath(realRoot))).toBe(true); + } + + // Second invocation attaches as a proxy to the SAME daemon. const second = spawnServer(tempDir, env); servers.push(second); sendInitialize(second.child, `file://${tempDir}`, 2); - const secondResponse = await waitFor( - () => findInitializeResponse(second.stdout, 2), - 8000, - ); - expect(secondResponse.result.serverInfo.name).toBe('codegraph'); - // The proxy logs its attach to stderr; that's the canonical witness. 
- await waitFor( - () => second.stderr.some((l) => l.includes('Attached to shared daemon')), - 5000, - ); - }, 30000); + const secondResp = await waitFor(() => findResponse(second.stdout, 2), 10000); + expect(secondResp.result.serverInfo.name).toBe('codegraph'); + await waitFor(() => second.stderr.some((l) => l.includes('Attached to shared daemon')), 8000); + + // Exactly one daemon ever bound, and it's the same pid both attached to. + expect(countListeningLines(realRoot)).toBe(1); + expect(readLockPid(realRoot)).toBe(daemonPid); + }, 40000); + + it('concurrent launchers converge on a single daemon (lockfile race — must-fix 1)', async () => { + const env = { CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS: '15000' }; + + // Fire three launchers as close to simultaneously as possible — this is the + // race window where the old code could end up with two daemons. + const procs = [spawnServer(tempDir, env), spawnServer(tempDir, env), spawnServer(tempDir, env)]; + procs.forEach((p, i) => { servers.push(p); sendInitialize(p.child, `file://${tempDir}`, i + 1); }); + + // All three get a valid initialize response... + for (let i = 0; i < procs.length; i++) { + const resp = await waitFor(() => findResponse(procs[i].stdout, i + 1), 12000); + expect(resp.result.serverInfo.name).toBe('codegraph'); + } + // ...and all three attached as proxies (none fell back / wedged). + for (const p of procs) { + await waitFor(() => p.stderr.some((l) => l.includes('Attached to shared daemon')), 10000); + } + + // The decisive assertion: exactly ONE daemon bound the socket. Losing + // candidates log "already holds the lock; exiting" and never listen. 
+ expect(countListeningLines(realRoot)).toBe(1); + const daemonPid = readLockPid(realRoot); + expect(daemonPid).toBeTruthy(); + expect(isAlive(daemonPid!)).toBe(true); + }, 45000); + + it('daemon survives the first client dying; a second client keeps working (must-fix 2 / #277)', async () => { + // Idle high so the daemon doesn't reap mid-test; poll fast so proxy 1 + // notices its dead parent quickly. + const env = { CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS: '30000', CODEGRAPH_PPID_POLL_MS: '200' }; + + const first = spawnServer(tempDir, env); + servers.push(first); + sendInitialize(first.child, `file://${tempDir}`, 1); + await waitFor(() => findResponse(first.stdout, 1), 10000); + await waitFor(() => (readLockPid(realRoot) ?? 0) > 0, 8000); + const daemonPid = readLockPid(realRoot)!; + expect(isAlive(daemonPid)).toBe(true); - it('CODEGRAPH_NO_DAEMON=1 keeps both processes independent (no socket)', async () => { + const second = spawnServer(tempDir, env); + servers.push(second); + sendInitialize(second.child, `file://${tempDir}`, 1); + await waitFor(() => findResponse(second.stdout, 1), 10000); + await waitFor(() => second.stderr.some((l) => l.includes('Attached to shared daemon')), 8000); + + // Kill the launcher that spawned the daemon. With the old in-process design + // this would take the daemon (and thus the second client) down. + killTree(first.child); + + // The daemon is detached — it must still be alive a beat later. + await new Promise((r) => setTimeout(r, 1500)); + expect(isAlive(daemonPid)).toBe(true); + + // And the second client can still drive a real tool call through it. 
+ sendMessage(second.child, { jsonrpc: '2.0', id: 2, method: 'tools/list' }); + const toolsResp = await waitFor(() => findResponse(second.stdout, 2), 10000); + expect(Array.isArray(toolsResp.result.tools)).toBe(true); + expect(toolsResp.result.tools.length).toBeGreaterThan(0); + }, 45000); + + it('CODEGRAPH_NO_DAEMON=1 keeps each process independent (no socket/pidfile)', async () => { const env = { CODEGRAPH_NO_DAEMON: '1' }; const first = spawnServer(tempDir, env); servers.push(first); sendInitialize(first.child, `file://${tempDir}`, 1); - await waitFor(() => findInitializeResponse(first.stdout, 1), 8000); - // Direct mode — no daemon listener log. - expect(first.stderr.some((l) => l.includes('[CodeGraph daemon] Listening on'))).toBe(false); - // No pidfile in opt-out mode. - expect(fs.existsSync(path.join(tempDir, '.codegraph', 'daemon.pid'))).toBe(false); + await waitFor(() => findResponse(first.stdout, 1), 10000); + // Direct mode — no daemon machinery touched. + expect(first.stderr.some((l) => l.includes('Attached to shared daemon'))).toBe(false); + expect(fs.existsSync(path.join(realRoot, '.codegraph', 'daemon.pid'))).toBe(false); + expect(fs.existsSync(path.join(realRoot, '.codegraph', 'daemon.log'))).toBe(false); }, 20000); - it('stale pidfile from a dead daemon gets cleared and a fresh daemon takes over', async () => { - // Plant a lockfile pointing at a definitely-dead pid. PID 999999 is - // outside the usual Linux pid_max default (4194304) — but `process.kill` - // probing returns ESRCH for nonexistent pids, which is what we want. - const pidPath = path.join(tempDir, '.codegraph', 'daemon.pid'); + it('clears a stale (dead-pid) lockfile and a fresh daemon takes over', async () => { + // Plant a lockfile pointing at a definitely-dead pid + the real socket path. 
fs.writeFileSync( - pidPath, + path.join(realRoot, '.codegraph', 'daemon.pid'), JSON.stringify({ pid: 999_999, version: '0.0.0-fake', - socketPath: path.join(tempDir, '.codegraph', 'daemon.sock'), + socketPath: getDaemonSocketPath(realRoot), startedAt: Date.now() - 1000, }), ); - const env = { CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS: '5000' }; + const env = { CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS: '15000' }; const server = spawnServer(tempDir, env); servers.push(server); sendInitialize(server.child, `file://${tempDir}`, 1); - let response: { result?: { serverInfo?: { name: string } } } | null = null; - try { - response = await waitFor(() => findInitializeResponse(server.stdout, 1), 8000); - } catch (err) { - throw new Error( - `${(err as Error).message}\nstderr:\n${server.stderr.join('\n')}\nstdout:\n${server.stdout.join('\n')}`, - ); - } - expect(response?.result?.serverInfo?.name).toBe('codegraph'); - // Daemon mode took over. - await waitFor( - () => server.stderr.some((l) => l.includes('[CodeGraph daemon] Listening on')), - 8000, - ); - // Pidfile now reflects a live daemon, not the planted-dead one. (Note: - // we can't compare to `server.child.pid` directly because the CLI may - // re-exec itself with `--liftoff-only`; the daemon lives in the - // grandchild, not the immediate child. What matters is that the pid - // recorded in the lockfile is *alive*, which the planted 999999 wasn't.) - const lockBody = JSON.parse(fs.readFileSync(pidPath, 'utf8')); - expect(lockBody.pid).not.toBe(999_999); - expect(() => process.kill(lockBody.pid, 0)).not.toThrow(); - }, 30000); + const resp = await waitFor(() => findResponse(server.stdout, 1), 10000).catch((e) => { + throw new Error(`${(e as Error).message}\nstderr:\n${server.stderr.join('\n')}\ndaemon.log:\n${readDaemonLog(realRoot)}`); + }); + expect(resp.result.serverInfo.name).toBe('codegraph'); + await waitFor(() => countListeningLines(realRoot) >= 1, 10000); + // The pidfile now names a live daemon, not the planted-dead 999999. 
+ const livePid = readLockPid(realRoot); + expect(livePid).not.toBe(999_999); + expect(isAlive(livePid!)).toBe(true); + }, 40000); - it('proxy falls back to direct mode on daemon version mismatch', async () => { - // Stand up a daemon at a known socket, then write a hello line with a - // mismatched version into a *separate* test socket. The probe path - // doesn't actually need a full daemon — just a peer that produces a - // hello line. We use a hand-rolled mini-server so this test stays - // hermetic and doesn't depend on lockfile-aware behavior of the real - // daemon. + it('proxy falls back to direct mode on a daemon version mismatch', async () => { const net = await import('net'); - const sockPath = path.join(tempDir, '.codegraph', 'daemon.sock'); - // Pre-plant a lockfile pointing at a *live* (this test process) pid so - // the takeover loop doesn't unlink the lockfile mid-test. + const sockPath = getDaemonSocketPath(realRoot); + // Plant a live-pid lockfile so the launcher treats the lock as held, and a + // mini-server that answers with a mismatched-version hello. fs.writeFileSync( - path.join(tempDir, '.codegraph', 'daemon.pid'), - JSON.stringify({ - pid: process.pid, - version: '0.0.0-mismatch', - socketPath: sockPath, - startedAt: Date.now(), - }), + path.join(realRoot, '.codegraph', 'daemon.pid'), + JSON.stringify({ pid: process.pid, version: '0.0.0-mismatch', socketPath: sockPath, startedAt: Date.now() }), ); const miniServer = net.createServer((sock) => { sock.write(JSON.stringify({ codegraph: '0.0.0-mismatch', pid: 1, socketPath: sockPath, protocol: 1 }) + '\n'); @@ -276,18 +345,13 @@ describe('Shared MCP daemon (issue #411)', () => { const server = spawnServer(tempDir); servers.push(server); sendInitialize(server.child, `file://${tempDir}`, 1); - // Despite the mismatched-version daemon, the client should still get - // an initialize response — proxy refuses to attach and we fall back - // to direct mode. 
- const response = await waitFor( - () => findInitializeResponse(server.stdout, 1), - 8000, - ); - expect(response.result.serverInfo.name).toBe('codegraph'); - // The version-mismatch fallback message goes to stderr. + // Despite the mismatched daemon, the client still gets an initialize + // response — the proxy refuses to attach and falls back to direct mode. + const resp = await waitFor(() => findResponse(server.stdout, 1), 10000); + expect(resp.result.serverInfo.name).toBe('codegraph'); await waitFor( - () => server.stderr.some((l) => l.includes('version') && l.includes('falling back to direct mode')), - 4000, + () => server.stderr.some((l) => l.includes('falling back to direct mode')), + 6000, ); } finally { await new Promise((resolve) => miniServer.close(() => resolve())); @@ -295,28 +359,19 @@ describe('Shared MCP daemon (issue #411)', () => { }, 30000); it('daemon idle-times-out after the last client disconnects', async () => { - // 800ms idle is enough to ride out any post-disconnect grace; with the - // poll-based unref'd timer it fires quickly. We deliberately don't go - // below ~500ms because the watcher catch-up sync runs in the background - // and chowns the event loop briefly during teardown. - const env = { CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS: '800' }; + const env = { CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS: '800', CODEGRAPH_PPID_POLL_MS: '200' }; const server = spawnServer(tempDir, env); servers.push(server); sendInitialize(server.child, `file://${tempDir}`, 1); - await waitFor(() => findInitializeResponse(server.stdout, 1), 8000); - await waitFor( - () => server.stderr.some((l) => l.includes('[CodeGraph daemon] Listening on')), - 5000, - ); + await waitFor(() => findResponse(server.stdout, 1), 10000); + await waitFor(() => (readLockPid(realRoot) ?? 0) > 0, 8000); + const daemonPid = readLockPid(realRoot)!; - // Close stdin → launcher session drops → no clients → idle timer arms. 
+ // Close the only client's stdin → proxy exits → daemon refcount hits 0 → + // idle timer fires → daemon exits and cleans up its lockfile. server.child.stdin.end(); - // The daemon should exit on idle. Give it a generous window: idle timer - // (800ms) + a few seconds slack for engine teardown on a slow CI box. - const exited = await waitProcessExit(server.child, 8000); - expect(exited).toBe(true); - // After exit, lockfile + socket should be cleaned up. - expect(fs.existsSync(path.join(tempDir, '.codegraph', 'daemon.pid'))).toBe(false); + expect(await waitProcessExit(daemonPid, 10000)).toBe(true); + expect(fs.existsSync(path.join(realRoot, '.codegraph', 'daemon.pid'))).toBe(false); }, 30000); }); diff --git a/__tests__/mcp-initialize.test.ts b/__tests__/mcp-initialize.test.ts index 4a57ebae0..31899aa7c 100644 --- a/__tests__/mcp-initialize.test.ts +++ b/__tests__/mcp-initialize.test.ts @@ -23,6 +23,14 @@ function spawnServer(cwd: string): ChildProcessWithoutNullStreams { return spawn(process.execPath, [BIN, 'serve', '--mcp'], { cwd, stdio: ['pipe', 'pipe', 'pipe'], + // Pin to direct (in-process) mode. #172 is a contract about the in-process + // server's init ordering — the "File watcher active" log this test observes + // is emitted in-process. In daemon mode the watcher runs in the detached + // daemon (logging to .codegraph/daemon.log, not the child's stderr); the + // same response-before-init guarantee lives in the shared session code and + // is covered by mcp-daemon.test.ts. Direct mode also avoids leaking a + // detached daemon from this suite. 
+ env: { ...process.env, CODEGRAPH_NO_DAEMON: '1' }, }) as ChildProcessWithoutNullStreams; } diff --git a/__tests__/mcp-ppid-watchdog.test.ts b/__tests__/mcp-ppid-watchdog.test.ts index 0e3dc188a..781e0be70 100644 --- a/__tests__/mcp-ppid-watchdog.test.ts +++ b/__tests__/mcp-ppid-watchdog.test.ts @@ -103,7 +103,12 @@ describe.skipIf(process.platform === 'win32')('MCP PPID watchdog (#277)', () => stdinHolder.unref(); const child = spawn(process.execPath, [${JSON.stringify(BIN)}, 'serve', '--mcp'], { stdio: [stdinHolder.stdout, 'ignore', stderrFd], - env: { ...process.env, CODEGRAPH_PPID_POLL_MS: '200' }, + // Pin to direct (in-process) mode: this test targets the in-process + // server's PPID watchdog (#277). The detached-daemon/proxy watchdog is + // covered separately in mcp-daemon.test.ts ("daemon survives the first + // client dying"). Without this the spawned process becomes a proxy and + // also spawns a detached daemon that would outlive the test. + env: { ...process.env, CODEGRAPH_PPID_POLL_MS: '200', CODEGRAPH_NO_DAEMON: '1' }, detached: true, }); child.unref(); diff --git a/src/mcp/daemon.ts b/src/mcp/daemon.ts index 86a2be008..a9b5eaa69 100644 --- a/src/mcp/daemon.ts +++ b/src/mcp/daemon.ts @@ -1,23 +1,37 @@ /** * Shared MCP daemon — issue #411. * - * One `codegraph serve --mcp` process per project root, accepting N concurrent - * MCP clients over a Unix-domain socket (or named pipe on Windows). Each - * incoming connection gets its own {@link MCPSession}; all sessions share a - * single {@link MCPEngine}, which means a single file watcher (one inotify - * set), a single SQLite connection (one WAL writer), and a single tree-sitter - * warm-up — paid once, amortized across every agent talking to the project. + * One detached `codegraph serve --mcp` daemon process per project root, + * accepting N concurrent MCP clients over a Unix-domain socket (or named pipe + * on Windows). 
Each incoming connection gets its own {@link MCPSession}; all + * sessions share a single {@link MCPEngine}, which means a single file watcher + * (one inotify set), a single SQLite connection (one WAL writer), and a single + * tree-sitter warm-up — paid once, amortized across every agent talking to the + * project. + * + * Lifecycle (see also `./index.ts` and `./proxy.ts`): + * - The daemon is spawned **detached** (its own session/process group, stdio + * decoupled) by the first launcher that finds no daemon running. It is NOT + * a child of any MCP host, so closing one terminal / Ctrl-C'ing one session + * can't take it down and sever the others. That's why this process has no + * PPID watchdog: it deliberately outlives every individual client. + * - Every MCP host talks to the daemon through a thin `proxy` process (the + * thing the host actually spawned). The proxy keeps the #277 PPID watchdog, + * so a SIGKILL'd host still reaps its proxy promptly; the proxy's socket + * close then decrements the daemon's refcount. + * - When the last client disconnects the daemon lingers for + * `CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS` (default 300s) so back-to-back agent + * runs in the same project don't repay startup, then exits cleanly. This is + * what keeps a single-agent session from leaking a daemon forever (#277). * * What this file owns: * - Listening on the daemon socket and spawning per-connection sessions. * - The handshake "hello" line that lets a proxy verify it found a * same-version daemon before piping any JSON-RPC through it. - * - The lockfile (`.codegraph/daemon.pid`) that races between daemons are - * resolved against — atomic `O_EXCL` create + cleanup on exit. - * - Reference counting + idle timeout: when the last client disconnects - * the daemon lingers for `CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS` (default - * 300s) so back-to-back agent runs in the same project don't repay - * startup. New connection cancels the timer. 
+ * - The lockfile (`.codegraph/daemon.pid`) competing daemons arbitrate + * against — atomic `O_EXCL` create with the full record written in the same + * breath (no empty-file window) + cleanup on exit. + * - Reference counting + idle timeout. * - Graceful shutdown on SIGTERM/SIGINT and idle exit. * * What this file does NOT own: @@ -31,7 +45,7 @@ import * as net from 'net'; import * as path from 'path'; import { MCPEngine } from './engine'; import { MCPSession } from './session'; -import { SocketTransport, StdioTransport } from './transport'; +import { SocketTransport } from './transport'; import { DaemonLockInfo, decodeLockInfo, @@ -69,13 +83,13 @@ export interface DaemonStartResult { /** * Run as the shared daemon for `projectRoot`. Resolves once the socket is - * listening and the lockfile is committed. The returned Daemon owns the - * socket, the engine, and the lockfile until `stop()` is called or it exits - * on idle/signal. + * listening. The Daemon owns the socket, the engine, and the lockfile until + * `stop()` is called or it exits on idle/signal. * - * Race-safe: callers must first try `tryAcquireDaemonLock(projectRoot)` and - * only call `Daemon.run` if they got the lock. The atomic `O_EXCL` create - * inside the acquire helper is the only synchronization between competing + * Race-safe: callers must first call `tryAcquireDaemonLock(projectRoot)` and + * only construct a Daemon if they got the lock (`kind: 'acquired'`). The atomic + * `O_EXCL` create inside the acquire helper — which now also writes the full + * record before returning — is the only synchronization between competing * daemons. 
*/ export class Daemon { @@ -87,23 +101,22 @@ export class Daemon { private stopping = false; private socketPath: string; private pidPath: string; - private lockFd: number | null = null; constructor( private projectRoot: string, - opts: { lockFd: number; idleTimeoutMs?: number } = { lockFd: -1 }, + opts: { idleTimeoutMs?: number } = {}, ) { this.socketPath = getDaemonSocketPath(projectRoot); this.pidPath = getDaemonPidPath(projectRoot); - this.lockFd = opts.lockFd >= 0 ? opts.lockFd : null; this.idleTimeoutMs = opts.idleTimeoutMs ?? resolveIdleTimeoutMs(); this.engine = new MCPEngine(); this.engine.setProjectPathHint(projectRoot); } /** - * Bind the socket, write the pidfile body, kick off engine init, and - * register signal handlers. The promise resolves once the server is + * Bind the socket, kick off engine init, and register signal handlers. The + * lockfile body was already written atomically by `tryAcquireDaemonLock`, so + * there is nothing to write here. The promise resolves once the server is * listening — the daemon then sticks around until idle/shutdown. */ async start(): Promise { @@ -139,15 +152,14 @@ export class Daemon { socketPath: this.socketPath, startedAt: Date.now(), }; - this.writeLockFile(lock); process.stderr.write( `[CodeGraph daemon] Listening on ${this.socketPath} (pid ${process.pid}, v${CodeGraphPackageVersion}). Idle timeout ${this.idleTimeoutMs}ms.\n` ); // No clients yet: arm the idle timer immediately so a daemon that nobody - // ever connects to (e.g. spawned by a misconfigured client) doesn't pin - // resources forever. + // ever connects to (e.g. spawned then abandoned because the launcher died) + // doesn't pin resources forever. this.armIdleTimer(); process.on('SIGINT', () => this.stop('SIGINT')); @@ -156,38 +168,16 @@ export class Daemon { return { socketPath: this.socketPath, lock }; } - /** - * Attach an stdio session for the *launcher* — the MCP host that spawned - * this very process. 
The launcher already opened a stdio pipe to us and is - * waiting for an `initialize` response; that pipe gets its own session - * just like any socket connection. The transport is configured with - * `exitOnClose: false` so losing the launcher doesn't kill the daemon — - * other socket clients are still entitled to service. When stdin closes - * we just remove this session from the client set and arm the idle timer - * if nothing else is connected. - */ - attachStdioLauncherSession(): MCPSession { - let session!: MCPSession; - const transport = new StdioTransport({ - exitOnClose: false, - onClose: () => { - if (session) this.dropClient(session); - }, - }); - session = new MCPSession(transport, this.engine, { - explicitProjectPath: this.projectRoot, - }); - this.clients.add(session); - this.disarmIdleTimer(); - session.start(); - return session; - } - /** Currently-connected client count. Exposed for tests / status output. */ getClientCount(): number { return this.clients.size; } + /** The socket path the daemon is (or will be) listening on. */ + getSocketPath(): string { + return this.socketPath; + } + /** Graceful shutdown: close all sessions, the engine, and clean up the lock. */ async stop(reason: string = 'stop'): Promise { if (this.stopping) return; @@ -253,8 +243,9 @@ export class Daemon { } void this.stop('idle timeout'); }, this.idleTimeoutMs); - // Don't keep the event loop alive just for this — if the socket server - // and active connections are all gone, the loop should drain naturally. + // Don't keep the event loop alive just for this — the net.Server keeps the + // loop alive while listening, so the timer still fires; once we stop() the + // loop should drain naturally. 
this.idleTimer.unref?.(); } @@ -264,22 +255,6 @@ export class Daemon { this.idleTimer = null; } - private writeLockFile(info: DaemonLockInfo): void { - const body = encodeLockInfo(info); - if (this.lockFd !== null) { - // We came in already holding the lockfile (acquired via `wx`); fill it - // in atomically by writing the body and closing the fd. Subsequent - // readers of the pidfile then see the full record. - fs.writeSync(this.lockFd, body); - fs.closeSync(this.lockFd); - this.lockFd = null; - } else { - // Defensive path — should be unreachable because callers always go - // through `tryAcquireDaemonLock` before constructing a Daemon. - fs.writeFileSync(this.pidPath, body, { flag: 'w' }); - } - } - private cleanupLockfile(): void { try { if (fs.existsSync(this.pidPath)) { @@ -297,23 +272,28 @@ export class Daemon { /** * Result of `tryAcquireDaemonLock`. Either we got the lockfile (caller becomes - * the daemon), or it already existed (caller should try to connect to the - * existing daemon as a proxy). + * the daemon), or it already existed (caller should connect to the existing + * daemon as a proxy, or — if the holder is dead — clear it and retry). */ export type AcquireResult = - | { kind: 'acquired'; lockFd: number; pidPath: string } + | { kind: 'acquired'; pidPath: string; info: DaemonLockInfo } | { kind: 'taken'; existing: DaemonLockInfo | null; pidPath: string }; /** - * Atomic-create the daemon pidfile. Returns either an `acquired` result (the - * caller is now the daemon-elect; must call `Daemon.run` which writes the - * pidfile body and closes the fd) or a `taken` result (some other process - * either is or was the daemon; caller should connect-or-take-over). + * Atomically create the daemon pidfile AND write its full record in the same + * call. Returns either an `acquired` result (the caller is now the daemon-elect + * and may construct a {@link Daemon}) or a `taken` result. 
* - The fd is left writable + truncate-only — Daemon.start() writes the actual - * body (pid, version, socket path) once it's bound the socket. That way a - * crash mid-acquire leaves an empty pidfile which any subsequent daemon - * candidate can recognize as stale. + * must-fix 1 (issue #411 review): the original implementation created the + * pidfile empty under an `O_EXCL` fd and only wrote the body later, after + * `server.listen` resolved. A second candidate that read the pidfile during + * that millisecond-wide window saw an empty file, decoded it as `null`, treated + * it as stale, and `unlink`'d the lock the first daemon still held — producing + * two daemons (two watchers, two writers) on concurrent startup, exactly the + * multi-agent scenario the feature targets. Writing the complete record right + * after the create shrinks that window to a single synchronous write, so a + * concurrent reader all but always sees a valid pid+version+socketPath. The + * socket path is deterministic from the project root, so it's known here. */ export function tryAcquireDaemonLock(projectRoot: string): AcquireResult { const pidPath = getDaemonPidPath(projectRoot); @@ -322,8 +302,24 @@ export function tryAcquireDaemonLock(projectRoot: string): AcquireResult { fs.mkdirSync(path.dirname(pidPath), { recursive: true }); try { + // `wx` = O_CREAT | O_EXCL | O_WRONLY: atomic "create only if absent". const fd = fs.openSync(pidPath, 'wx', 0o600); - return { kind: 'acquired', lockFd: fd, pidPath }; + const info: DaemonLockInfo = { + pid: process.pid, + version: CodeGraphPackageVersion, + socketPath: getDaemonSocketPath(projectRoot), + startedAt: Date.now(), + }; + try { + // Synchronous write immediately after the create — no await in between — + // so the empty-file window is a single fs.writeSync, not an I/O-bound + // `server.listen`. Combined with the pid-verified `clearStaleDaemonLock` + // below, concurrent candidates can never delete a live daemon's lock.
+ fs.writeSync(fd, encodeLockInfo(info)); + } finally { + fs.closeSync(fd); + } + return { kind: 'acquired', pidPath, info }; } catch (err: unknown) { const e = err as NodeJS.ErrnoException; if (e.code !== 'EEXIST') throw err; @@ -338,25 +334,39 @@ export function tryAcquireDaemonLock(projectRoot: string): AcquireResult { } /** - * Remove a stale pidfile and return whether we successfully cleared it. Used - * by callers that detected a "taken" lock pointing at a dead pid. + * Remove a stale pidfile, but only if it still names a dead process. Re-reads + * the file immediately before unlinking so we never delete a lock that a live + * daemon (re)acquired in the meantime. + * + * must-fix 1 (issue #411 review): the original unconditionally `unlink`'d, + * which let a racing candidate delete a healthy daemon's lock. Passing + * `expectedDeadPid` (the pid the caller believed was dead) makes the clear a + * compare-and-delete: bail if the file now holds a different pid, or any live + * pid. Returns true when the stale lock is gone (or was already gone). */ -export function clearStaleDaemonLock(pidPath: string): boolean { +export function clearStaleDaemonLock(pidPath: string, expectedDeadPid?: number): boolean { try { + const raw = fs.readFileSync(pidPath, 'utf8'); + const info = decodeLockInfo(raw); + if (info) { + // A different pid took over since we read it — not ours to clear. + if (expectedDeadPid !== undefined && info.pid !== expectedDeadPid) return false; + // Holder is actually alive — never clear a live daemon's lock. + if (info.pid > 0 && isProcessAlive(info.pid)) return false; + } fs.unlinkSync(pidPath); return true; } catch (err: unknown) { const e = err as NodeJS.ErrnoException; - if (e.code === 'ENOENT') return true; + if (e.code === 'ENOENT') return true; // already gone return false; } } /** - * Probe whether `pid` is currently alive (signal-0). 
False on Windows for - * pids of a different user since `kill` returns EPERM there; we accept that - * as "still alive" to be conservative — better to fall back to direct mode - * than to nuke a stranger's daemon. + * Probe whether `pid` is currently alive (signal-0). Treats EPERM as alive on + * every platform (the process exists, it's just not ours to signal) so we never + * mistake a live daemon for a dead one and clear its lock. */ export function isProcessAlive(pid: number): boolean { try { diff --git a/src/mcp/index.ts b/src/mcp/index.ts index 2179eddf9..85c3949e2 100644 --- a/src/mcp/index.ts +++ b/src/mcp/index.ts @@ -14,19 +14,31 @@ * await server.start(); * ``` * - * Three runtime modes (decided in {@link MCPServer.start}): + * Runtime modes (decided in {@link MCPServer.start}): * - * - **Direct** — one process serves one MCP client over stdio. Today's - * behavior; used when no shareable daemon is reachable or the user opted - * out via `CODEGRAPH_NO_DAEMON=1`. - * - **Daemon** — accept N concurrent MCP clients over a Unix-domain socket / - * named pipe, sharing one CodeGraph + watcher + SQLite handle. See - * {@link ./daemon.ts} and issue #411 for the rationale. - * - **Proxy** — pure stdio↔socket pipe to an existing daemon. See - * {@link ./proxy.ts}. + * - **Direct** — one process serves one MCP client over stdio. The pre-#411 + * behavior; used when the user opts out (`CODEGRAPH_NO_DAEMON=1`), no + * `.codegraph/` is reachable, or the daemon machinery fails for any reason. + * - **Proxy** — what an MCP host actually talks to when sharing is on: a thin + * stdio↔socket pipe to the shared daemon. The proxy carries the #277 PPID + * watchdog, so a SIGKILL'd host reaps its proxy promptly. See {@link ./proxy.ts}. + * - **Daemon** — a *detached* background process (its own session/process + * group) that serves N proxies over a Unix-domain socket / named pipe, + * sharing one CodeGraph + watcher + SQLite handle. 
Spawned on demand; never a + * child of any host, so it survives individual sessions and is reaped by + * client-refcount + idle timeout. See {@link ./daemon.ts} and issue #411. + * + * The detached-daemon + always-proxy split is the fix for the review finding + * that the original in-process daemon (a) was the first host's child, so closing + * that terminal severed every other client, and (b) disabled the PPID watchdog, + * regressing #277 (orphaned daemons on host SIGKILL). */ +import * as fs from 'fs'; +import * as path from 'path'; +import { spawn, StdioOptions } from 'child_process'; import { findNearestCodeGraphRoot } from '../index'; +import { getCodeGraphDir } from '../directory'; import { StdioTransport } from './transport'; import { MCPEngine } from './engine'; import { MCPSession } from './session'; @@ -48,20 +60,31 @@ import { HOST_PPID_ENV } from '../extraction/wasm-runtime-flags'; const DEFAULT_PPID_POLL_MS = 5000; /** - * Max retries when a stale-lock takeover races other candidates. After this - * many failed acquire+probe rounds we give up and fall back to direct mode — - * something is wedged enough that adding our own daemon to the mix would only - * make it worse. + * Env var that marks a process as the *detached daemon* itself (set by + * {@link spawnDetachedDaemon} when it re-invokes the CLI). Without it a + * `serve --mcp` invocation is a launcher that connects-or-spawns; with it, the + * process IS the daemon and must never try to spawn another (infinite spawn). */ -const TAKEOVER_MAX_RETRIES = 3; +const DAEMON_INTERNAL_ENV = 'CODEGRAPH_DAEMON_INTERNAL'; /** - * Brief sleep between takeover retries so a freshly-spawned daemon has time - * to bind its socket. 100ms is well under any realistic startup, so a - * legitimate races resolves on the first or second retry. + * Retries for the detached daemon arbitrating the O_EXCL lock against a racing + * sibling. 
Tiny — the lock resolves on the first round in practice; the retries + * only cover clearing a genuinely stale (dead-pid) lockfile. */ +const TAKEOVER_MAX_RETRIES = 5; const TAKEOVER_RETRY_DELAY_MS = 100; +/** + * How long a launcher waits for a freshly-spawned daemon to bind its socket + * before giving up and running in-process. The daemon binds the socket *before* + * the (backgrounded) engine/grammar warm-up, so this only needs to cover node + * process startup. 60 × 100ms = 6s of headroom for a cold/slow box; on the + * common path the socket appears within a few rounds. + */ +const DAEMON_CONNECT_MAX_RETRIES = 60; +const DAEMON_CONNECT_RETRY_DELAY_MS = 100; + /** * Resolve the PPID watchdog poll interval from an env override. A value of * `0` disables the watchdog entirely (escape hatch for embedded scenarios @@ -97,15 +120,77 @@ function daemonOptOutSet(): boolean { return raw !== '0' && raw.toLowerCase() !== 'false'; } +/** Whether this process was spawned to BE the detached daemon. */ +function daemonInternalSet(): boolean { + const raw = process.env[DAEMON_INTERNAL_ENV]; + return !!raw && raw !== '0' && raw.toLowerCase() !== 'false'; +} + /** * Resolve the project root the daemon machinery should key on. Returns * `null` when no `.codegraph/` is reachable from the candidate path — in * that case the caller must run in direct mode, since the daemon lockfile * and socket both live under `.codegraph/`. + * + * The result is canonicalized with `realpathSync` so every client converges on + * the same socket/lock path regardless of how it expressed the path: a client + * launched with cwd under a symlink (e.g. macOS `/var` → `/private/var`, where + * spawned `process.cwd()` is already realpath'd) and one that passed a + * symlinked `rootUri` would otherwise hash to different sockets and silently + * fail to share the daemon. */ function resolveDaemonRoot(explicitPath: string | null): string | null { const candidate = explicitPath ?? 
process.cwd(); - return findNearestCodeGraphRoot(candidate); + const root = findNearestCodeGraphRoot(candidate); + if (!root) return null; + try { return fs.realpathSync(root); } catch { return root; } +} + +/** + * Spawn the shared daemon as a fully detached background process: its own + * session/process group (so a SIGHUP/SIGINT to the launcher's terminal can't + * reach it) with stdio decoupled from the launcher (logs to + * `.codegraph/daemon.log`). Re-invokes the *same* CLI faithfully across dev and + * bundled launches by reusing `process.argv[0]` (the right node), the current + * `process.execArgv` (carries `--liftoff-only`, so the daemon never re-execs) + * and `process.argv[1]` (this script). The spawned process self-arbitrates the + * O_EXCL lock, so racing launchers may each spawn one — losers exit and every + * launcher proxies through the single winner. + */ +function spawnDetachedDaemon(root: string): void { + const scriptPath = process.argv[1]; + if (!scriptPath) { + // No resolvable CLI entry point to re-invoke — let the caller fall back to + // direct mode rather than spawn something broken. + throw new Error('cannot resolve CLI script path to spawn the daemon'); + } + + let logFd: number | null = null; + let stdio: StdioOptions = 'ignore'; + try { + logFd = fs.openSync(path.join(getCodeGraphDir(root), 'daemon.log'), 'a'); + stdio = ['ignore', logFd, logFd]; + } catch { + stdio = 'ignore'; // no log file — discard daemon output rather than fail + } + try { + const child = spawn( + process.execPath, + [...process.execArgv, scriptPath, 'serve', '--mcp', '--path', root], + { + detached: true, + stdio, + windowsHide: true, + env: { ...process.env, [DAEMON_INTERNAL_ENV]: '1' }, + }, + ); + child.unref(); + } finally { + // The child holds its own dup of the log fd now; the launcher doesn't need it. 
+ if (logFd !== null) { + try { fs.closeSync(logFd); } catch { /* ignore */ } + } + } } /** @@ -121,7 +206,7 @@ function resolveDaemonRoot(explicitPath: string | null): string | null { */ export class MCPServer { private projectPath: string | null; - // Direct-mode-only state. In daemon mode the per-connection session lives + // Direct-mode-only state. In daemon mode the per-connection sessions live // inside the Daemon class; in proxy mode there is no session at all. private session: MCPSession | null = null; private engine: MCPEngine | null = null; @@ -143,19 +228,24 @@ export class MCPServer { * Start the MCP server. * * Decision order: - * 1. If `CODEGRAPH_NO_DAEMON=1` → direct mode (unchanged behavior). - * 2. If no `.codegraph/` reachable → direct mode (daemon needs a lockfile - * and socket location, which both live under `.codegraph/`). - * 3. Try to attach to an existing daemon as a proxy. - * 4. Otherwise become the daemon ourselves. + * 1. `CODEGRAPH_DAEMON_INTERNAL=1` → we ARE the detached daemon; listen. + * 2. `CODEGRAPH_NO_DAEMON=1` → direct mode (unchanged pre-#411 behavior). + * 3. No `.codegraph/` reachable → direct mode (the daemon's lockfile and + * socket both live under `.codegraph/`). + * 4. Otherwise connect to (or spawn) the shared daemon and proxy to it. * - * On any unexpected failure in steps 3–4 we transparently fall back to - * direct mode — a misbehaving daemon must never block a session from - * starting. + * On any unexpected failure in step 4 we transparently fall back to direct + * mode — a misbehaving daemon must never block a session from starting. */ async start(): Promise { - // Direct mode if the user opted out. Done first so debugging is simple: - // setting the env var is sufficient to get the pre-#411 behavior. + // The detached daemon process itself. Checked before the opt-out so the + // daemon honors the same env it was spawned with (it never sets NO_DAEMON).
+ if (daemonInternalSet()) { + return this.startDaemonProcess(); + } + + // Direct mode if the user opted out. Setting the env var is sufficient to + // get the pre-#411 single-process behavior. if (daemonOptOutSet()) { return this.startDirect('CODEGRAPH_NO_DAEMON set'); } @@ -163,20 +253,19 @@ export class MCPServer { const root = resolveDaemonRoot(this.projectPath); if (!root) { // No initialized project found — daemon mode has nowhere to put its - // socket. This is the fresh-checkout / outside-project case; behave - // exactly as before. + // socket. The fresh-checkout / outside-project case; behave as before. return this.startDirect('no .codegraph/ root found'); } - // Try the daemon attach/spawn dance. try { - const mode = await this.startDaemonOrProxy(root); + const mode = await this.connectOrSpawnDaemon(root); if (mode === 'fallback') { - return this.startDirect('daemon attach/start failed; fallback to direct'); + return this.startDirect('daemon unavailable; fallback to direct'); } - this.mode = mode; - this.installSignalHandlers(); - this.installPpidWatchdog(); + // 'proxy': connectOrSpawnDaemon ran the stdio↔socket pipe to completion + // (it only returns once the host disconnected). The process is now + // expected to terminate naturally — the proxy installed its own watchdog. + this.mode = 'proxy'; return; } catch (err) { // Belt-and-braces: if anything throws inside the daemon machinery, @@ -189,9 +278,8 @@ export class MCPServer { /** * Stop the server. In daemon mode this triggers graceful shutdown of every - * connected session; in proxy mode the proxy's own resolve handler exits - * the process and `stop()` is a no-op; in direct mode this mirrors the - * pre-#411 behavior (close cg, exit). + * connected session; in direct mode it mirrors the pre-#411 behavior (close + * cg, exit). Proxy mode never routes through here — the proxy exits itself. 
*/ stop(): void { if (this.stopped) return; @@ -246,77 +334,89 @@ export class MCPServer { } /** - * Try to attach as proxy or start as daemon. Returns 'proxy' / 'daemon' on - * success, 'fallback' if the caller should retry in direct mode. + * Run as the detached shared daemon (process spawned with + * `CODEGRAPH_DAEMON_INTERNAL=1`). Arbitrate the O_EXCL lock, then either + * become the daemon (bind the socket, serve forever) or — if a live daemon + * already holds the lock — exit so we don't leak a redundant process. + * + * No PPID watchdog and no stdin handlers: the daemon is detached on purpose + * and reaps itself via client-refcount + idle timeout (see {@link Daemon}). */ - private async startDaemonOrProxy(root: string): Promise<'proxy' | 'daemon' | 'fallback'> { + private async startDaemonProcess(): Promise { + const root = resolveDaemonRoot(this.projectPath) ?? this.projectPath ?? process.cwd(); for (let attempt = 0; attempt < TAKEOVER_MAX_RETRIES; attempt++) { const lock = tryAcquireDaemonLock(root); if (lock.kind === 'acquired') { - const daemon = new Daemon(root, { lockFd: lock.lockFd }); + const daemon = new Daemon(root); await daemon.start(); - // The MCP host launched us over stdio and is waiting for our - // `initialize` response — attach it as the daemon's first session - // so we never silently drop the launcher. Subsequent invocations - // discover us via the socket and proxy in. - daemon.attachStdioLauncherSession(); this.daemon = daemon; - return 'daemon'; + this.mode = 'daemon'; + return; // the net.Server keeps the process alive } - // Lock is taken — that *should* mean a daemon is alive. Probe. - const socketPath = lock.existing?.socketPath || getDaemonSocketPath(root); - const probe = await runProxy(socketPath); - if (probe.outcome === 'proxied') { - // runProxy only returns when the connection has CLOSED — meaning we - // already piped stdio and are now exiting. From here we should not - // start anything else. 
The process is expected to terminate - // naturally after this function returns. - return 'proxy'; + // Taken. If the holder is alive, another daemon already serves (or is + // binding) — we're redundant; exit cleanly so the launcher proxies to it. + const existing = lock.existing; + if (existing && existing.pid > 0 && isProcessAlive(existing.pid)) { + process.stderr.write( + `[CodeGraph daemon] Another daemon (pid ${existing.pid}) already holds the lock; exiting.\n` + ); + process.exit(0); } - // Proxy didn't attach. Possible causes: - // (a) Daemon is mid-startup and hasn't bound the socket yet — retry. - // (b) Daemon crashed but lockfile leaked — clear it and retry. - // (c) Daemon is alive but version-mismatched — fall back to direct. - if (probe.reason === 'version mismatch') { - return 'fallback'; - } + // Holder is dead (or the record is unreadable) — clear it (pid-verified, + // so we never delete a live daemon's lock) and retry the acquire. + clearStaleDaemonLock(lock.pidPath, existing?.pid); + await sleep(TAKEOVER_RETRY_DELAY_MS); + } - if (lock.existing && lock.existing.pid > 0 && isProcessAlive(lock.existing.pid)) { - // Daemon process is alive but its socket isn't accepting — probably - // (a). Sleep briefly and try again. - await sleep(TAKEOVER_RETRY_DELAY_MS); - continue; - } + process.stderr.write('[CodeGraph daemon] Could not acquire the daemon lock; exiting.\n'); + process.exit(0); + } - // Dead pid (or unreadable lockfile): clear it and retry. If we lose - // the next race to another candidate, that's fine — they'll be the - // new daemon and we'll proxy through them. - clearStaleDaemonLock(lock.pidPath); - await sleep(TAKEOVER_RETRY_DELAY_MS); + /** + * Become a proxy to the shared daemon, spawning the daemon first if none is + * reachable. Returns 'proxy' once the proxied session has run to completion + * (the host disconnected), or 'fallback' if the caller should run in-process. 
+ */ + private async connectOrSpawnDaemon(root: string): Promise<'proxy' | 'fallback'> { + const socketPath = getDaemonSocketPath(root); + + // Fast path: a daemon may already be listening. On success runProxy pipes + // stdio until the host disconnects, so a 'proxied' outcome means this + // process has finished its entire job. + let probe = await runProxy(socketPath); + if (probe.outcome === 'proxied') return 'proxy'; + if (probe.reason === 'version mismatch') return 'fallback'; + + // No reachable daemon — spawn one (detached) and wait for it to bind. + spawnDetachedDaemon(root); + + for (let attempt = 0; attempt < DAEMON_CONNECT_MAX_RETRIES; attempt++) { + await sleep(DAEMON_CONNECT_RETRY_DELAY_MS); + probe = await runProxy(socketPath); + if (probe.outcome === 'proxied') return 'proxy'; + if (probe.reason === 'version mismatch') return 'fallback'; } - // Repeated failures — something is very wrong (perms?). Direct mode it is. + // Daemon never came up in time — run in-process so the user is never blocked. return 'fallback'; } - /** Standard SIGINT/SIGTERM handlers that route to our `stop()`. */ + /** Standard SIGINT/SIGTERM handlers that route to our `stop()` (direct mode). */ private installSignalHandlers(): void { process.on('SIGINT', () => this.stop()); process.on('SIGTERM', () => this.stop()); } /** - * PPID watchdog. The daemon mode owns its own lifecycle (idle timeout + - * client refcount), so we deliberately do NOT enable the PPID watchdog - * there — otherwise the very first proxy that spawned the daemon would - * drag it down when it exited. Direct mode and proxy mode both enable it. + * PPID watchdog (#277) — direct mode only. Daemon mode is detached on purpose + * and reaps via idle timeout; proxy mode installs its own watchdog inside + * {@link runProxy}. So this only ever runs for an in-process direct session. 
*/ private installPpidWatchdog(): void { - if (this.mode === 'daemon') return; - if (this.mode === 'proxy') return; // proxy.ts installs its own. + if (this.mode !== 'direct') return; const pollMs = parsePpidPollMs(process.env.CODEGRAPH_PPID_POLL_MS); if (pollMs <= 0) return; this.ppidWatchdog = setInterval(() => { @@ -338,10 +438,10 @@ export class MCPServer { } function sleep(ms: number): Promise<void> { - // Deliberately NOT unref'd. During the daemon takeover retry loop we may - // be between processes — no socket bound yet, no transport, no listener - // pinning the event loop. An unref'd timer would let Node drain the loop - // and exit silently before we get a chance to try again. + // Deliberately NOT unref'd. During the daemon connect/takeover retry loop we + // may be between processes — no socket bound yet, no transport, no listener + // pinning the event loop. An unref'd timer would let Node drain the loop and + // exit silently before we get a chance to try again. return new Promise((resolve) => { setTimeout(resolve, ms); }); } diff --git a/src/mcp/proxy.ts b/src/mcp/proxy.ts index d51153e2a..938b135ef 100644 --- a/src/mcp/proxy.ts +++ b/src/mcp/proxy.ts @@ -90,7 +90,10 @@ export async function runProxy( startPpidWatchdog(socket); await pipeUntilClose(socket); - return { outcome: 'proxied' }; + // Host disconnected (or the daemon went away). The proxy's only job is the + // pipe; exit now so we don't linger — process.stdin's 'data' listener would + // otherwise keep the event loop alive and leave a zombie launcher behind. + process.exit(0); } /** @@ -208,7 +211,7 @@ function startPpidWatchdog(socket: net.Socket): void { const reason = ppidChanged ?
`ppid ${originalPpid} -> ${current}` : `host pid ${hostPpid} exited`; - process.stderr.write(`[CodeGraph MCP] proxy parent exited (${reason}); detaching.\n`); + process.stderr.write(`[CodeGraph MCP] Parent process exited (${reason}); shutting down.\n`); try { socket.destroy(); } catch { /* ignore */ } process.exit(0); } diff --git a/src/mcp/session.ts b/src/mcp/session.ts index 4357bd8b1..157dc17db 100644 --- a/src/mcp/session.ts +++ b/src/mcp/session.ts @@ -17,13 +17,15 @@ import { JsonRpcRequest, JsonRpcNotification, JsonRpcTransport, ErrorCodes } fro import { MCPEngine } from './engine'; import { tools } from './tools'; import { SERVER_INSTRUCTIONS } from './server-instructions'; +import { CodeGraphPackageVersion } from './version'; /** - * MCP Server Info — kept on the session because some clients log it. + * MCP Server Info — kept on the session because some clients log it. The + * version tracks the real package version (was a hard-coded '0.1.0'). */ const SERVER_INFO = { name: 'codegraph', - version: '0.1.0', + version: CodeGraphPackageVersion, }; /** MCP Protocol Version (latest the server claims). */ diff --git a/src/mcp/transport.ts b/src/mcp/transport.ts index ff81b0d07..aecc0368f 100644 --- a/src/mcp/transport.ts +++ b/src/mcp/transport.ts @@ -15,7 +15,6 @@ */ import * as readline from 'readline'; -import type { Readable, Writable } from 'stream'; import type { Socket } from 'net'; /** @@ -406,49 +405,3 @@ export class SocketTransport extends LineBasedJsonRpcTransport { this.closeHandlers = []; } } - -/** - * Adapter that lets the proxy mode reuse {@link LineBasedJsonRpcTransport}'s - * line buffering for arbitrary `Readable`/`Writable` pairs. Not currently used - * by sessions — kept here for symmetry with the existing stdio/socket pair if - * future work needs a third carrier. 
- */ -export class StreamPairTransport extends LineBasedJsonRpcTransport { - private buffer = ''; - - constructor( - private input: Readable, - private output: Writable, - private prefix: string = 'cg-stream', - ) { - super(); - } - - start(handler: MessageHandler): void { - this.messageHandler = handler; - this.input.setEncoding?.('utf8'); - this.input.on('data', (chunk: string | Buffer) => { - this.buffer += typeof chunk === 'string' ? chunk : chunk.toString('utf8'); - let idx; - while ((idx = this.buffer.indexOf('\n')) !== -1) { - const line = this.buffer.slice(0, idx); - this.buffer = this.buffer.slice(idx + 1); - void this.handleLine(line); - } - }); - } - - stop(): void { - if (this.stopped) return; - this.stopped = true; - this.rejectPending('Transport stopped'); - } - - protected write(line: string): void { - this.output.write(line + '\n'); - } - - protected idPrefix(): string { - return this.prefix; - } -}