diff --git a/CLAUDE.md b/CLAUDE.md index d0e259c..b89685c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -75,6 +75,12 @@ src/ ├── mcp/ # MCP server │ ├── server.ts # MCP entry point │ └── tools/ # Each file exports registerTools(server, db, provider) +├── lite/ # libscope/lite — embeddable semantic search (no CLI/MCP/connectors) +│ ├── index.ts # Public entrypoint — exports LibScopeLite + types +│ ├── core.ts # LibScopeLite class (index, search, getContext, ask, rate) +│ ├── types.ts # LiteOptions, LiteDoc, LiteSearchResult, etc. +│ ├── normalize.ts # Raw input → markdown (dispatches to core/parsers/) +│ └── chunker-treesitter.ts # Optional tree-sitter code chunker (TS/JS/Python) ├── core/ # Business logic (documents, search, indexing, packs, topics, etc.) │ └── parsers/ # File format parsers (markdown, pdf, docx, html, epub, pptx, csv, yaml, json) ├── api/ # REST API server (routes, middleware, openapi spec) diff --git a/README.md b/README.md index 135fe37..92d7c76 100644 --- a/README.md +++ b/README.md @@ -175,7 +175,9 @@ Search uses sqlite-vec for vector similarity when available, with FTS5 full-text ### Programmatic SDK -LibScope also exports a `LibScope` class for use as a library: +LibScope exports two embeddable APIs: + +**`LibScope`** — full SDK with all features (connectors, topics, packs, etc.): ```ts import { LibScope } from "libscope"; @@ -186,7 +188,36 @@ const results = await scope.search("query"); scope.close(); ``` -See the [Programmatic Usage](/guide/programmatic-usage) guide for details on the SDK, batch search, and document TTL/expiry. +**`LibScopeLite`** — lightweight embeddable class for external applications. No CLI, no MCP server, no connectors. 
Designed for embedding semantic search directly into other tools (MCP servers, VS Code extensions, CI scripts): + +```ts +import { LibScopeLite } from "libscope/lite"; + +const lite = new LibScopeLite({ dbPath: ":memory:" }); + +// Index documents (or code files via tree-sitter chunking) +await lite.indexBatch(docs, { concurrency: 4 }); + +// Hybrid vector + FTS5 search +const results = await lite.search("how to authenticate"); + +// Get RAG context for injection into an external LLM prompt +const context = await lite.getContext("How does auth work?"); + +lite.close(); +``` + +Tree-sitter powered code indexing splits TypeScript, JavaScript, and Python files at function/class boundaries: + +```ts +import { TreeSitterChunker } from "libscope/lite"; + +const chunker = new TreeSitterChunker(); +const chunks = await chunker.chunk(sourceCode, "typescript"); +// Each chunk is a complete function or class with 1-based line numbers +``` + +See the [LibScope Lite guide](https://libscope.com/guide/lite) and [API reference](https://libscope.com/reference/lite-api) for the full documentation. 
## Organizing Content diff --git a/docs/.vitepress/config.ts b/docs/.vitepress/config.ts index f6f51f2..350a972 100644 --- a/docs/.vitepress/config.ts +++ b/docs/.vitepress/config.ts @@ -61,6 +61,8 @@ export default defineConfig({ text: "Programmatic Usage", link: "/guide/programmatic-usage", }, + { text: "LibScope Lite", link: "/guide/lite" }, + { text: "Code Indexing", link: "/guide/code-indexing" }, ], }, { @@ -86,6 +88,7 @@ export default defineConfig({ { text: "CLI Commands", link: "/reference/cli" }, { text: "MCP Tools", link: "/reference/mcp-tools" }, { text: "REST API", link: "/reference/rest-api" }, + { text: "LibScope Lite API", link: "/reference/lite-api" }, { text: "Registry", link: "/reference/registry" }, { text: "Configuration", link: "/reference/configuration" }, ], diff --git a/docs/guide/architecture.md b/docs/guide/architecture.md index 6799468..006e5be 100644 --- a/docs/guide/architecture.md +++ b/docs/guide/architecture.md @@ -7,28 +7,44 @@ This guide explains how LibScope is structured internally. It is intended for co LibScope is organized into four distinct layers: ``` -┌─────────────────────────────────────────────┐ -│ Entry Points │ -│ CLI (Commander.js) MCP Server REST API │ -└──────────────────┬──────────────────────────┘ - │ -┌──────────────────▼──────────────────────────┐ -│ Core Business Logic │ -│ indexing · search · rag · documents · ... 
│ -└──────────────────┬──────────────────────────┘ - │ -┌──────────────────▼──────────────────────────┐ -│ Infrastructure │ -│ db/ (SQLite) providers/ (embeddings) │ -└─────────────────────────────────────────────┘ +┌─────────────────────────────────────────────────────────┐ +│ Entry Points │ +│ CLI (Commander.js) MCP Server REST API LibScopeLite │ +└──────────────────────────┬──────────────────────────────┘ + │ +┌──────────────────────────▼──────────────────────────────┐ +│ Core Business Logic │ +│ indexing · search · rag · documents · parsers · … │ +└──────────────────────────┬──────────────────────────────┘ + │ +┌──────────────────────────▼──────────────────────────────┐ +│ Infrastructure │ +│ db/ (SQLite + sqlite-vec) providers/ (embeddings)│ +└─────────────────────────────────────────────────────────┘ ``` -**Entry points** (`src/cli/`, `src/mcp/`, `src/api/`) are thin adapters. They parse input, call core functions, and format output. They contain no business logic. +**Entry points** (`src/cli/`, `src/mcp/`, `src/api/`, `src/lite/`) are thin adapters. They parse input, call core functions, and format output. They contain no business logic. -**Core** (`src/core/`) contains all business logic. Core modules are plain TypeScript functions — they don't know whether they were called from the CLI, an MCP tool, or the REST API. +**Core** (`src/core/`) contains all business logic. Core modules are plain TypeScript functions — they don't know whether they were called from the CLI, an MCP tool, the REST API, or `LibScopeLite`. **Infrastructure** (`src/db/`, `src/providers/`) handles persistence and external services. The database layer uses better-sqlite3 (synchronous). The provider layer abstracts embedding models behind a common interface. 
+### LibScope Lite Layer + +`src/lite/` is a separate entry point that exposes a minimal embeddable API built on top of the same core and infrastructure modules: + +``` +libscope/lite → src/lite/index.ts → LibScopeLite class + ├── core/indexing.ts + ├── core/search.ts + ├── core/rag.ts + ├── core/ratings.ts + ├── db/connection.ts + └── providers/ +``` + +`LibScopeLite` deliberately omits connectors, topics, packs, webhooks, and registry — keeping the API surface small and the import footprint minimal for embedding in external applications. + ## Module Map ``` @@ -113,6 +129,12 @@ src/ │ ├── onenote.ts # Microsoft Graph API sync │ ├── http-utils.ts # shared retry logic with exponential backoff │ └── sync-tracker.ts # sync history and status in database +├── lite/ +│ ├── index.ts # public entrypoint — exports LibScopeLite + types +│ ├── core.ts # LibScopeLite class implementation +│ ├── types.ts # LiteOptions, LiteDoc, LiteSearchResult, etc. +│ ├── normalize.ts # raw input → markdown (dispatches to core/parsers/) +│ └── chunker-treesitter.ts # optional tree-sitter code chunker (TS/JS/Python) ├── config.ts # loadConfig() — merges env, project, user, defaults ├── errors.ts # LibScopeError hierarchy ├── logger.ts # pino logger with child logger support diff --git a/docs/guide/code-indexing.md b/docs/guide/code-indexing.md new file mode 100644 index 0000000..ff7d8f0 --- /dev/null +++ b/docs/guide/code-indexing.md @@ -0,0 +1,251 @@ +# Code Indexing + +LibScope Lite includes a tree-sitter powered code chunker that splits source files at function and class boundaries. This produces semantically meaningful chunks that are far better for embedding than naive line-count splits. + +## Why Code-Aware Chunking Matters + +The default LibScope chunker is paragraph- and heading-aware, which works well for documentation. 
For source code, it produces poor-quality chunks because: + +- Code has no paragraph boundaries — it's one continuous text +- A 500-line class split arbitrarily at line 100 loses the method signatures that give it meaning +- A function split in the middle loses the return statement (the most semantically important part) + +The tree-sitter chunker uses the Abstract Syntax Tree (AST) to split at **semantic boundaries** — each chunk is a complete, self-contained unit (a function, a class, a method) with its full signature and body. + +## Installation + +Tree-sitter is an **optional peer dependency**. Install the packages for the languages you need: + +```bash +# Core tree-sitter parser +npm install tree-sitter + +# Language grammars (install only what you need) +npm install tree-sitter-typescript # TypeScript + TSX +npm install tree-sitter-javascript # JavaScript, JSX, MJS, CJS +npm install tree-sitter-python # Python +``` + +If tree-sitter is not installed, `TreeSitterChunker.chunk()` throws a `ValidationError` with a clear install message. All other LibScope Lite features work normally without tree-sitter. + +## Supported Languages + +| Language | Aliases | Grammar Package | +|---|---|---| +| TypeScript | `typescript`, `ts`, `tsx` | `tree-sitter-typescript` | +| JavaScript | `javascript`, `js`, `jsx`, `mjs`, `cjs` | `tree-sitter-javascript` | +| Python | `python`, `py` | `tree-sitter-python` | + +Aliases are case-insensitive: `"TS"`, `"ts"`, `"TypeScript"` all resolve to TypeScript. 
+ +## Basic Usage + +```ts +import { TreeSitterChunker } from "libscope/lite"; + +const chunker = new TreeSitterChunker(); + +// Check language support before chunking +if (!chunker.supports("typescript")) { + console.warn("tree-sitter-typescript not installed, skipping"); +} + +const source = ` +import { EventEmitter } from "events"; + +export class AuthService extends EventEmitter { + private tokens = new Map<string, string>(); + + async login(userId: string, password: string): Promise<string> { + const token = await this.generateToken(userId); + this.tokens.set(userId, token); + this.emit("login", userId); + return token; + } + + logout(userId: string): void { + this.tokens.delete(userId); + this.emit("logout", userId); + } + + private async generateToken(userId: string): Promise<string> { + // ... token generation logic + return "tok_" + userId + "_" + Date.now(); + } +} +`; + +const chunks = await chunker.chunk(source, "typescript"); +``` + +Each chunk in the result: + +```ts +interface CodeChunk { + content: string; // source text of the chunk + startLine: number; // 1-based start line in the original file + endLine: number; // 1-based end line in the original file + nodeType: string; // tree-sitter node type (see below) +} +``` + +For the example above, you'd get chunks like: + +``` +chunk[0]: "import { EventEmitter } from 'events';" + startLine: 2, endLine: 2, nodeType: "preamble" + +chunk[1]: "export class AuthService extends EventEmitter { ... 
}" + startLine: 4, endLine: 25, nodeType: "class_declaration" +``` + +## Node Types + +The chunker extracts these node types per language: + +**TypeScript / TSX:** +- `function_declaration` — `function foo() {}` +- `class_declaration` — `class Foo {}` +- `method_definition` — methods inside a class +- `export_statement` — `export const foo = ...`, `export default ...` +- `lexical_declaration` — `const foo = ...` at module scope +- `interface_declaration` — TypeScript interfaces +- `type_alias_declaration` — `type Foo = ...` +- `enum_declaration` — TypeScript enums + +**JavaScript / JSX:** +- `function_declaration`, `class_declaration`, `method_definition`, `export_statement`, `lexical_declaration` + +**Python:** +- `function_definition` — `def foo():` +- `class_definition` — `class Foo:` +- `decorated_definition` — `@decorator\ndef foo():` + +## Preamble Accumulation + +Non-declaration nodes at the top of a file (imports, `"use strict"`, module-level comments) are accumulated and prepended to the first declaration chunk as a **preamble**. This preserves context: + +```ts +// These lines become the preamble: +import { db } from "./database.js"; +const MAX_RETRIES = 3; + +// Combined with the first function: +export async function fetchUser(id: string) { ... } +``` + +The combined chunk gives the embedding model crucial context — it knows about `db` and `MAX_RETRIES` while processing `fetchUser`. + +Trailing non-declaration nodes (after the last function/class) are returned as a separate `trailing` chunk. + +## Large Node Splitting + +If a single declaration (e.g., a 2000-line class) exceeds `maxChunkSize` (default: 1500 characters), the chunker recursively splits it by named children (methods): + +```ts +// Override the size limit +const chunks = await chunker.chunk(source, "typescript", 2000); +``` + +When a class is split, each method becomes its own chunk. If a single method is still over the limit, it's returned as-is (further splitting would break semantics). 
+ +## Fallback for Empty Files + +If the source has no declaration nodes (e.g., a config file, a `.d.ts` with only type exports), the entire source is returned as a single chunk with `nodeType: "module"`. + +## Integrating with LibScope Lite + +The typical pattern for indexing a codebase: + +```ts +import { LibScopeLite, TreeSitterChunker } from "libscope/lite"; +import { readdir, readFile } from "node:fs/promises"; +import { join, extname } from "node:path"; + +const chunker = new TreeSitterChunker(); +const lite = new LibScopeLite({ dbPath: "./my-project.db" }); + +async function indexDirectory(dir: string): Promise<void> { + const entries = await readdir(dir, { recursive: true, withFileTypes: true }); + + const tasks = entries + .filter((e) => e.isFile()) + .map(async (entry) => { + const filePath = join(entry.parentPath, entry.name); + const ext = extname(entry.name).slice(1); // "ts", "py", etc. + const source = await readFile(filePath, "utf8"); + + if (chunker.supports(ext)) { + // Code-aware chunking + const chunks = await chunker.chunk(source, ext); + return chunks.map((c) => ({ + title: `${filePath}:${c.startLine}-${c.endLine}`, + content: c.content, + url: filePath, + library: "src", + })); + } + + // Plain text fallback for unsupported files + return [{ title: filePath, content: source, url: filePath, library: "src" }]; + }); + + const docGroups = await Promise.all(tasks); + await lite.indexBatch(docGroups.flat(), { concurrency: 4 }); +} + +await indexDirectory("./src"); +console.log("Indexed. Searching..."); + +const results = await lite.search("authentication token generation"); +for (const r of results) { + console.log(`${r.title} (score: ${r.score.toFixed(3)})`); +} + +lite.close(); +``` + +## Caching + +`TreeSitterChunker` lazily initializes the tree-sitter parser and grammar modules on first use and caches them for the lifetime of the instance. 
Create one `TreeSitterChunker` instance and reuse it rather than creating a new one per file: + +```ts +// Good — one instance, shared across all files +const chunker = new TreeSitterChunker(); +for (const file of files) { + const chunks = await chunker.chunk(await readFile(file, "utf8"), "typescript"); + // ... +} + +// Avoid — new instance per file incurs repeated dynamic import overhead +for (const file of files) { + const chunks = await new TreeSitterChunker().chunk(...); +} +``` + +## Error Handling + +```ts +import { ValidationError } from "libscope"; + +try { + const chunks = await chunker.chunk(source, "go"); +} catch (err) { + if (err instanceof ValidationError) { + // "Unsupported language for code chunking: 'go'" + // "Code chunking requires the 'tree-sitter' package. Install it with: ..." + console.warn(err.message); + } +} +``` + +Two error conditions: +1. **Unsupported language** — throws immediately with the list of supported aliases +2. **tree-sitter not installed** — throws with the exact `npm install` command + +Both are `ValidationError` from LibScope's error hierarchy. + +## See Also + +- [LibScope Lite Guide](/guide/lite) — full LibScope Lite documentation +- [LibScope Lite API Reference](/reference/lite-api) — TypeScript API reference diff --git a/docs/guide/lite.md b/docs/guide/lite.md new file mode 100644 index 0000000..dc42f34 --- /dev/null +++ b/docs/guide/lite.md @@ -0,0 +1,376 @@ +# LibScope Lite — Embedded Semantic Search + +`libscope/lite` is a lightweight, embeddable version of LibScope designed to be imported directly into any Node.js application. Instead of running a standalone CLI process or an MCP server, you call `index()` and `search()` programmatically from your own code. 
+ +## When to Use LibScope Lite + +Use `libscope/lite` when you need to: + +- **Embed semantic search into another application** — e.g., a custom MCP server, a VS Code extension, a CI/CD tool +- **Avoid spawning subprocesses** — no CLI execution, no HTTP server required +- **Control the database lifecycle** — pass `:memory:` for ephemeral sessions or a file path for persistent cross-session reuse +- **Search across code** — tree-sitter powered chunking splits source files at function and class boundaries + +The primary use case driving this feature: a Bitbucket MCP server that wants semantic search over repository files and Jira/Confluence pages. On repository connect, it calls `indexBatch(repoFiles)` to build a local index. On PR review, it calls `getContext(question)` to retrieve the top-K relevant chunks and inject them into its LLM prompt — replacing 50 raw files of context with 5 highly-relevant chunks. + +## What Lite Does NOT Include + +`libscope/lite` intentionally omits the full-LibScope surface area: + +| Feature | Full `libscope` | `libscope/lite` | +|---|---|---| +| Semantic search | ✅ | ✅ | +| RAG (ask/stream) | ✅ | ✅ | +| Code-aware chunking | ❌ | ✅ | +| CLI commands | ✅ | ❌ | +| MCP server | ✅ | ❌ | +| Connectors (Notion, Slack…) | ✅ | ❌ | +| Topics & packs | ✅ | ❌ | +| Webhooks & registry | ✅ | ❌ | +| Web dashboard | ✅ | ❌ | + +## Installation + +```bash +npm install libscope +``` + +For code indexing, also install the optional peer dependencies: + +```bash +npm install tree-sitter tree-sitter-typescript tree-sitter-javascript tree-sitter-python +``` + +These are optional — if not installed, code chunking is unavailable but all other features work. + +## Quick Start + +```ts +import { LibScopeLite } from "libscope/lite"; + +const lite = new LibScopeLite({ dbPath: ":memory:" }); + +// Index some documents +await lite.indexBatch([ + { title: "Auth Guide", content: "Use OAuth2 for all API access. Tokens expire after 1 hour." 
}, + { title: "Deploy Guide", content: "Deploy to Kubernetes using Helm charts. Set replicas: 3." }, +], { concurrency: 4 }); + +// Hybrid vector + FTS5 search +const results = await lite.search("how to authenticate"); +console.log(results[0]?.title); // "Auth Guide" + +// RAG context retrieval (for external LLMs) +const context = await lite.getContext("How do I authenticate API requests?"); +// Returns a formatted context string ready to inject into an LLM prompt + +lite.close(); +``` + +## Constructor Options + +```ts +new LibScopeLite(opts?: LiteOptions) +``` + +| Option | Type | Default | Description | +|---|---|---|---| +| `dbPath` | `string` | `~/.libscope/lite.db` | SQLite database path. Use `":memory:"` for in-memory. | +| `db` | `Database` | — | Inject an existing `better-sqlite3` instance. When provided, `dbPath` is ignored and no migrations or schema setup are run. | +| `provider` | `EmbeddingProvider` | Local (all-MiniLM-L6-v2) | Embedding provider to use for indexing and search. | +| `llmProvider` | `LlmProvider` | — | LLM provider for `ask()` and `askStream()`. Required to use those methods. | + +### Persistent vs In-Memory Database + +```ts +// In-memory — data is lost when the process exits (good for one-off tasks) +const lite = new LibScopeLite({ dbPath: ":memory:" }); + +// File-backed — persists across sessions (good for long-lived indexes) +const lite = new LibScopeLite({ dbPath: "/data/my-project.db" }); +``` + +### Embedding Providers + +By default, LibScope Lite uses the local `all-MiniLM-L6-v2` model (downloads ~80 MB on first use). 
To use OpenAI or Ollama: + +```ts +import { LibScopeLite } from "libscope/lite"; +import { createEmbeddingProvider } from "libscope"; + +const provider = createEmbeddingProvider({ + embedding: { provider: "openai", model: "text-embedding-3-small" }, +}); + +const lite = new LibScopeLite({ provider }); +``` + +## Indexing + +### `index(docs)` + +Index an array of pre-parsed documents: + +```ts +await lite.index([ + { + title: "Getting Started", + content: "# Introduction\n\nThis guide covers the basics...", + library: "my-api", + version: "2.0", + url: "https://docs.example.com/getting-started", + }, +]); +``` + +**`LiteDoc` fields:** + +| Field | Type | Description | +|---|---|---| +| `title` | `string` | Document title (required) | +| `content` | `string` | Document text content (required) | +| `url` | `string?` | Source URL for deduplication and attribution | +| `library` | `string?` | Library namespace for scoped search | +| `version` | `string?` | Library version | +| `sourceType` | `string?` | `"manual"` (default), `"library"`, `"topic"`, or `"model-generated"` | +| `topicId` | `string?` | Topic ID to associate the document with | + +### `indexRaw(input)` + +Index from a raw source — a file path, URL, buffer, or plain text. LibScope Lite normalizes the input using the same parser pipeline as full LibScope: + +```ts +// Index a local file (auto-detects format from extension) +const docId = await lite.indexRaw({ type: "file", path: "./docs/guide.pdf" }); + +// Fetch and index a URL +const docId = await lite.indexRaw({ type: "url", url: "https://docs.example.com/guide" }); + +// Index raw text +const docId = await lite.indexRaw({ type: "text", title: "Notes", content: "..." 
}); + +// Index from a buffer (e.g., uploaded file) +const docId = await lite.indexRaw({ + type: "buffer", + buffer: fileBuffer, + filename: "report.docx", + title: "Q4 Report", +}); +``` + +**Supported formats:** Markdown, plain text, HTML, PDF (requires `pdf-parse`), DOCX (requires `mammoth`), EPUB (requires `epub2`), PPTX (requires `pizzip`), CSV, JSON, YAML. + +### `indexBatch(docs, opts)` + +Index multiple documents with concurrency control: + +```ts +await lite.indexBatch(repoFiles, { concurrency: 4 }); +``` + +`concurrency` controls how many documents are embedded in parallel. A value of 4–8 is recommended for most systems. Each document's embeddings are computed concurrently but each database write is atomic. + +**Pattern for large repos:** + +```ts +const files = await glob("src/**/*.ts"); +const docs = await Promise.all( + files.map(async (path) => ({ + title: path, + content: await fs.readFile(path, "utf8"), + sourceType: "library" as const, + })) +); + +await lite.indexBatch(docs, { concurrency: 8 }); +``` + +## Searching + +### `search(query, opts?)` + +Hybrid vector + FTS5 search — the same engine used by full LibScope: + +```ts +const results = await lite.search("OAuth2 token refresh", { + limit: 5, // max results (default: 10) + library: "api", // scope to a library + tags: ["auth"], // filter by tags + diversity: 0.3, // MMR reranking (0 = pure relevance, 1 = max diversity) +}); + +for (const result of results) { + console.log(result.title, result.score); + console.log(result.content); // the matching chunk text +} +``` + +**`LiteSearchResult` fields:** + +| Field | Type | Description | +|---|---|---| +| `docId` | `string` | Document ID | +| `chunkId` | `string` | Chunk ID within the document | +| `title` | `string` | Document title | +| `content` | `string` | Chunk text | +| `score` | `number` | Relevance score (higher is better) | +| `url` | `string \| null` | Source URL if set at index time | + +## RAG + +### `getContext(question, opts?)` + 
+Retrieve context without running an LLM — useful when you want to inject the context into your own LLM prompt: + +```ts +const context = await lite.getContext("How does the event loop work?", { + topK: 5, // number of chunks to retrieve (default: 5) + library: "node", // optional scope +}); + +// context is a formatted string you can inject into any LLM prompt +const prompt = `Answer based only on this context:\n\n${context}\n\nQuestion: ...`; +``` + +This is the primary method for agent-to-agent integration patterns — your orchestrating LLM calls `getContext()` and injects the result into its prompt rather than managing a separate RAG system. + +### `ask(question, opts?)` + +Full RAG with an LLM completing the response (requires `llmProvider` in constructor or opts): + +```ts +import { LibScopeLite } from "libscope/lite"; +import { createLlmProvider } from "libscope"; + +const lite = new LibScopeLite({ + llmProvider: createLlmProvider({ llm: { provider: "openai", model: "gpt-4o-mini" } }), +}); + +const answer = await lite.ask("How do I configure rate limiting?", { topK: 5 }); +console.log(answer); // string answer from LLM +``` + +### `askStream(question, opts?)` + +Streaming version of `ask()` — returns an `AsyncGenerator` of token chunks: + +```ts +for await (const token of lite.askStream("Explain the deployment process")) { + process.stdout.write(token); +} +``` + +The LLM provider must support streaming. Providers that don't expose a `completeStream()` method will throw a clear error. 
+ +## Code Indexing + +For source code files, use the tree-sitter chunker to split at function and class boundaries: + +```ts +import { LibScopeLite } from "libscope/lite"; +import { TreeSitterChunker } from "libscope/lite"; + +const chunker = new TreeSitterChunker(); +const lite = new LibScopeLite({ dbPath: ":memory:" }); + +// Check if a language is supported before chunking +if (chunker.supports("typescript")) { + const source = await fs.readFile("src/auth.ts", "utf8"); + const chunks = await chunker.chunk(source, "typescript"); + + // Each chunk is a function or class with 1-based line numbers + for (const chunk of chunks) { + await lite.index([{ + title: `auth.ts:${chunk.startLine}-${chunk.endLine} (${chunk.nodeType})`, + content: chunk.content, + library: "src", + }]); + } +} +``` + +See [Code Indexing](/guide/code-indexing) for the full guide including supported languages, chunk shape, and large-file strategies. + +## Feedback + +### `rate(docId, score)` + +Record a quality signal for a document (score 1–5): + +```ts +const results = await lite.search("deployment process"); +const docId = results[0]?.docId; +if (docId) { + lite.rate(docId, 5); // this result was highly relevant +} +``` + +Ratings feed into subsequent searches — highly-rated documents get boosted in results over time. + +## Lifecycle + +### `close()` + +Always close the database when done: + +```ts +lite.close(); +``` + +For long-running services, create one `LibScopeLite` instance and reuse it for the lifetime of the service. For one-off scripts, close in a `finally` block: + +```ts +const lite = new LibScopeLite({ dbPath: "/data/repo.db" }); +try { + await lite.indexBatch(docs, { concurrency: 4 }); + const results = await lite.search(query); + // ... 
+} finally { + lite.close(); +} +``` + +## Integration Pattern: External MCP Server + +The canonical use case — an MCP server that builds a semantic index over repository files: + +```ts +import { LibScopeLite } from "libscope/lite"; +import { TreeSitterChunker } from "libscope/lite"; + +const DB_PATH = path.join(os.homedir(), ".bitbucket-mcp", "index.db"); +const chunker = new TreeSitterChunker(); +let lite: LibScopeLite; + +// Called when a repo is connected +async function onRepoConnect(repoFiles: { path: string; content: string }[]) { + lite = new LibScopeLite({ dbPath: DB_PATH }); + + const docs = await Promise.all(repoFiles.map(async ({ path, content }) => { + if (chunker.supports(path.split(".").pop() ?? "")) { + // Code-aware chunking for supported languages + const chunks = await chunker.chunk(content, path.split(".").pop()!); + return chunks.map((c) => ({ + title: `${path}:${c.startLine}-${c.endLine}`, + content: c.content, + url: path, + })); + } + return [{ title: path, content, url: path }]; + })); + + await lite.indexBatch(docs.flat(), { concurrency: 4 }); +} + +// Called during PR review +async function onPrReview(question: string): Promise<string> { + return lite.getContext(question, { topK: 5 }); +} +``` + +## See Also + +- [Code Indexing Guide](/guide/code-indexing) — tree-sitter chunking in depth +- [LibScope Lite API Reference](/reference/lite-api) — full TypeScript API +- [Programmatic Usage](/guide/programmatic-usage) — full `LibScope` SDK (with connectors, packs, topics) +- [How Search Works](/guide/how-search-works) — hybrid vector + FTS5 explained diff --git a/docs/guide/programmatic-usage.md b/docs/guide/programmatic-usage.md index e5cb353..01be902 100644 --- a/docs/guide/programmatic-usage.md +++ b/docs/guide/programmatic-usage.md @@ -2,6 +2,10 @@ LibScope can be used as a Node.js library via the `LibScope` SDK class. +::: tip Looking for embedded / lightweight usage? 
+[LibScope Lite](/guide/lite) (`libscope/lite`) is a zero-dependency-on-connectors embeddable class with `index()`, `search()`, `getContext()`, and tree-sitter code chunking — designed to be imported directly into external applications without the full CLI/MCP/connector surface area. +::: + ## Setup ```ts diff --git a/docs/reference/lite-api.md b/docs/reference/lite-api.md new file mode 100644 index 0000000..5b4bd0e --- /dev/null +++ b/docs/reference/lite-api.md @@ -0,0 +1,598 @@ +# LibScope Lite API Reference + +Complete TypeScript API reference for `libscope/lite`. + +## Import + +```ts +import { LibScopeLite, TreeSitterChunker } from "libscope/lite"; +import type { + LiteOptions, + LiteDoc, + RawInput, + LiteSearchOptions, + LiteSearchResult, + LiteContextOptions, + LiteAskOptions, + CodeChunk, +} from "libscope/lite"; +``` + +--- + +## `LibScopeLite` + +The main class. Creates and manages its own SQLite database, embedding provider, and search engine. + +### Constructor + +```ts +new LibScopeLite(opts?: LiteOptions) +``` + +**`LiteOptions`** + +```ts +interface LiteOptions { + /** + * Path to the SQLite database file. + * - Use ":memory:" for in-process ephemeral storage (lost on close()) + * - Use a file path for persistent cross-session storage + * - Defaults to ~/.libscope/lite.db + */ + dbPath?: string; + + /** + * Inject a pre-configured better-sqlite3 Database instance. + * When provided, dbPath is ignored. No migrations, no sqlite-vec + * setup, and no extension loading are performed — the caller is + * responsible for schema initialization. + * + * Useful for tests and for callers that already manage their own + * database connection. + */ + db?: Database; + + /** + * Embedding provider used for indexing and similarity search. + * Defaults to LocalEmbeddingProvider (all-MiniLM-L6-v2, ~80 MB download). + */ + provider?: EmbeddingProvider; + + /** + * LLM provider used by ask() and askStream(). 
+ * Required to call those methods; other methods work without it. + */ + llmProvider?: LlmProvider; +} +``` + +**Throws** `DatabaseError` if the database file cannot be opened or migrations fail. + +--- + +### `index(docs)` + +```ts +async index(docs: LiteDoc[]): Promise<void> +``` + +Index an array of pre-parsed documents. Each document is chunked using the markdown-aware chunker, embedded, and stored. + +**`LiteDoc`** + +```ts +interface LiteDoc { + /** Document title. Required. Used in search result display and title boosting. */ + title: string; + + /** Full document text. Required. Will be chunked before embedding. */ + content: string; + + /** Source URL. Used for deduplication: if a document with this URL exists, + * it is replaced if the content hash changed, skipped if unchanged. */ + url?: string; + + /** + * Source type for provenance tracking. + * @default "manual" + */ + sourceType?: "manual" | "library" | "topic" | "model-generated"; + + /** Library namespace. Allows scoping search to a specific library. */ + library?: string; + + /** Library version. Used with library for version-scoped search. */ + version?: string; + + /** Topic ID to associate the document with for topic-scoped search. */ + topicId?: string; +} +``` + +**Example:** + +```ts +await lite.index([ + { + title: "Rate Limiting", + content: "Apply rate limiting using the X-RateLimit-* headers...", + library: "api", + version: "3.2", + url: "https://docs.example.com/rate-limiting", + }, +]); +``` + +--- + +### `indexRaw(input)` + +```ts +async indexRaw(input: RawInput): Promise<string> +``` + +Index from a raw input source. The input is passed through the parser pipeline (same parsers as the CLI `add` command), normalized to markdown, then chunked and indexed. + +Returns the document ID of the newly created document. 
+ +**`RawInput`** + +```ts +type RawInput = + | { type: "file"; path: string; title?: string } + | { type: "url"; url: string; title?: string } + | { type: "text"; content: string; title: string } + | { type: "buffer"; buffer: Buffer; filename: string; title?: string }; +``` + +| `type` | Description | Format detection | +|---|---|---| +| `"file"` | Read from local filesystem | File extension (`.md`, `.pdf`, `.docx`, etc.) | +| `"url"` | Fetch and parse a web page | Content-Type header | +| `"text"` | Plain text or markdown string | Always treated as markdown | +| `"buffer"` | In-memory buffer (e.g., upload) | `filename` extension | + +**Supported formats:** Markdown, plain text, HTML, PDF (`pdf-parse`), DOCX (`mammoth`), EPUB (`epub2`), PPTX (`pizzip`), CSV, JSON, YAML. + +**Example:** + +```ts +const id1 = await lite.indexRaw({ type: "file", path: "./README.md" }); +const id2 = await lite.indexRaw({ type: "url", url: "https://docs.example.com" }); +const id3 = await lite.indexRaw({ type: "text", title: "Notes", content: "# My Notes\n..." }); +``` + +--- + +### `indexBatch(docs, opts)` + +```ts +async indexBatch(docs: LiteDoc[], opts: { concurrency: number }): Promise<void> +``` + +Index multiple documents with concurrency control. Documents are embedded in parallel up to `concurrency` at a time. Each document's database write is still atomic. + +| Parameter | Type | Description | +|---|---|---| +| `docs` | `LiteDoc[]` | Documents to index | +| `opts.concurrency` | `number` | Max parallel embedding calls. Recommended: 4–8. | + +**Example:** + +```ts +await lite.indexBatch( + files.map((f) => ({ title: f.name, content: f.text, library: "docs" })), + { concurrency: 6 }, +); +``` + +--- + +### `search(query, opts?)` + +```ts +async search(query: string, opts?: LiteSearchOptions): Promise<LiteSearchResult[]> +``` + +Hybrid vector + FTS5 search using Reciprocal Rank Fusion. Returns chunks ranked by relevance.
+ +**`LiteSearchOptions`** + +```ts +interface LiteSearchOptions { + /** Maximum number of results. Default: 10. Max: 1000. */ + limit?: number; + + /** Restrict results to a specific library namespace. */ + library?: string; + + /** Restrict results to documents in this topic. */ + topic?: string; + + /** Restrict to documents with all of these tags. */ + tags?: string[]; + + /** + * MMR diversity reranking coefficient (0–1). + * 0 = pure relevance order. 1 = maximum diversity (no two similar chunks). + * Default: no reranking. + */ + diversity?: number; +} +``` + +**`LiteSearchResult`** + +```ts +interface LiteSearchResult { + /** Document ID. Use with rate() to record feedback. */ + docId: string; + + /** Chunk ID within the document. */ + chunkId: string; + + /** Document title. */ + title: string; + + /** Chunk text (the actual content that matched). */ + content: string; + + /** + * Relevance score. Higher is better. + * Combines vector similarity, BM25, and title boost. + */ + score: number; + + /** Source URL if set at index time, otherwise null. */ + url: string | null; +} +``` + +**Example:** + +```ts +const results = await lite.search("JWT token validation", { + limit: 5, + library: "auth-service", + diversity: 0.2, +}); + +for (const r of results) { + console.log(`[${r.score.toFixed(3)}] ${r.title}`); + console.log(r.content.slice(0, 200)); +} +``` + +--- + +### `getContext(question, opts?)` + +```ts +async getContext(question: string, opts?: LiteContextOptions): Promise<string> +``` + +Retrieve top-K relevant chunks and return them as a formatted context string. Does not call an LLM — returns the context ready for injection into an external prompt. + +This is the primary integration point for external LLM pipelines. + +**`LiteContextOptions`** + +```ts +interface LiteContextOptions { + /** Number of chunks to retrieve. Default: 5. */ + topK?: number; + + /** Restrict retrieval to a specific library.
*/ + library?: string; + + /** Restrict retrieval to a specific topic. */ + topic?: string; +} +``` + +**Returns:** A formatted string containing the retrieved chunks with their titles. The exact format is: + +``` +[Document Title] +Chunk text here... + +[Another Document] +More chunk text... +``` + +**Example:** + +```ts +const context = await lite.getContext("How do I handle auth errors?", { topK: 3 }); +const prompt = `You are a helpful assistant. Answer using only this context: + +${context} + +Question: How do I handle auth errors?`; +``` + +--- + +### `ask(question, opts?)` + +```ts +async ask(question: string, opts?: LiteAskOptions): Promise<string> +``` + +Full RAG: retrieves context then calls an LLM to produce a grounded answer. + +Requires an `llmProvider` configured in the constructor or passed in `opts`. + +**`LiteAskOptions`** + +```ts +interface LiteAskOptions { + /** Number of context chunks to retrieve. Default: 5. */ + topK?: number; + + /** Scope retrieval to a library. */ + library?: string; + + /** Scope retrieval to a topic. */ + topic?: string; + + /** Custom system prompt. Overrides the default "answer using context" instruction. */ + systemPrompt?: string; + + /** + * LLM provider for this request. + * Overrides the instance-level llmProvider for this single call. + */ + llmProvider?: LlmProvider; +} +``` + +**Returns:** The LLM's answer as a plain string. + +**Throws** `Error` if no `llmProvider` is configured. + +**Example:** + +```ts +const answer = await lite.ask("What authentication methods does the API support?", { + library: "api-docs", + topK: 8, + systemPrompt: "You are a concise technical assistant. Answer in bullet points.", +}); +``` + +--- + +### `askStream(question, opts?)` + +```ts +async *askStream(question: string, opts?: LiteAskOptions): AsyncGenerator<string> +``` + +Streaming version of `ask()`. Yields string tokens as they arrive from the LLM. + +Requires an `llmProvider` with a `completeStream()` method.
+ +**Throws:** +- `Error` if no `llmProvider` is configured +- `Error` if the provider does not support streaming + +**Example:** + +```ts +process.stdout.write("Answer: "); +for await (const token of lite.askStream("Explain the rate limiting algorithm")) { + process.stdout.write(token); +} +process.stdout.write("\n"); +``` + +--- + +### `rate(docId, score)` + +```ts +rate(docId: string, score: number): void +``` + +Record a quality rating for a document. Ratings are stored persistently and influence subsequent search rankings — highly-rated documents are boosted. + +| Parameter | Type | Description | +|---|---|---| +| `docId` | `string` | Document ID (from `LiteSearchResult.docId`) | +| `score` | `number` | Rating 1–5 (1 = poor, 5 = excellent) | + +**Throws** `ValidationError` for invalid scores or unknown document IDs. + +**Example:** + +```ts +const results = await lite.search("error handling patterns"); +if (results[0]) { + lite.rate(results[0].docId, 4); // this result was useful +} +``` + +--- + +### `close()` + +```ts +close(): void +``` + +Close the database connection and release all resources. Must be called when the `LibScopeLite` instance is no longer needed. + +After `close()`, all other methods will throw if called. + +--- + +## `TreeSitterChunker` + +Code-aware chunker using tree-sitter AST parsing. Optional — requires `tree-sitter` and at least one grammar package. + +### Constructor + +```ts +new TreeSitterChunker() +``` + +The parser and grammar instances are lazily initialized on first `chunk()` call and cached for the lifetime of the instance. Create one `TreeSitterChunker` and reuse it across all files. + +--- + +### `supports(language)` + +```ts +supports(language: string): boolean +``` + +Returns `true` if the given language alias is supported. Case-insensitive. 
+ +```ts +chunker.supports("ts"); // true +chunker.supports("TypeScript"); // true +chunker.supports("go"); // false (not yet supported) +chunker.supports("unknown"); // false +``` + +Does not throw. Safe to call before attempting `chunk()`. + +--- + +### `chunk(source, language, maxChunkSize?)` + +```ts +async chunk( + source: string, + language: string, + maxChunkSize?: number, +): Promise<CodeChunk[]> +``` + +Parse `source` and return an array of semantically meaningful chunks. + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `source` | `string` | — | Source code to chunk | +| `language` | `string` | — | Language name or alias (e.g., `"ts"`, `"python"`) | +| `maxChunkSize` | `number` | `1500` | Maximum characters per chunk | + +**`CodeChunk`** + +```ts +interface CodeChunk { + /** Source text of this chunk (function body, class, etc.) */ + content: string; + + /** 1-based line number where this chunk starts in the original source. */ + startLine: number; + + /** 1-based line number where this chunk ends. */ + endLine: number; + + /** + * Tree-sitter node type.
Common values: + * - "function_declaration" + * - "class_declaration" + * - "method_definition" + * - "export_statement" + * - "preamble" (accumulated imports/comments before first declaration) + * - "trailing" (non-declaration nodes after last declaration) + * - "module" (entire source, returned when no declarations found) + */ + nodeType: string; +} +``` + +**Throws** `ValidationError`: +- If `language` is not in the supported list +- If `tree-sitter` is not installed (with install instructions) +- If the source file cannot be parsed + +**Example:** + +```ts +const chunks = await chunker.chunk( + await readFile("src/api.ts", "utf8"), + "typescript", + 2000, +); + +console.log(`${chunks.length} chunks`); +chunks.forEach((c) => { + console.log(` ${c.nodeType} (lines ${c.startLine}–${c.endLine}): ${c.content.length} chars`); +}); +``` + +--- + +## Type Reference + +### `EmbeddingProvider` + +```ts +interface EmbeddingProvider { + readonly name: string; + readonly dimensions: number; + embed(text: string): Promise<number[]>; + embedBatch(texts: string[]): Promise<number[][]>; +} +``` + +Import from `libscope`: + +```ts +import type { EmbeddingProvider } from "libscope"; +``` + +### `LlmProvider` + +```ts +interface LlmProvider { + model: string; + complete(prompt: string, systemPrompt?: string): Promise<{ text: string }>; + completeStream?(prompt: string, systemPrompt?: string): AsyncGenerator<string>; +} +``` + +Import from `libscope`: + +```ts +import type { LlmProvider } from "libscope"; +``` + +--- + +## Error Types + +All errors extend `LibScopeError` with a `.code` string property: + +| Class | Code | When thrown | +|---|---|---| +| `DatabaseError` | `DATABASE_ERROR` | SQLite failures, schema errors | +| `ValidationError` | `VALIDATION_ERROR` | Bad input, unsupported language, missing tree-sitter | +| `EmbeddingError` | `EMBEDDING_ERROR` | Embedding provider failures | +| `DocumentNotFoundError` | `DOCUMENT_NOT_FOUND` | `rate()` with unknown docId | + +```ts +import { ValidationError,
DatabaseError } from "libscope"; + +try { + await lite.index([{ title: "", content: "..." }]); +} catch (err) { + if (err instanceof ValidationError) { + console.error("Invalid input:", err.message); // "Document title is required" + } +} +``` + +--- + +## See Also + +- [LibScope Lite Guide](/guide/lite) — usage guide with examples +- [Code Indexing Guide](/guide/code-indexing) — tree-sitter chunking in depth +- [How Search Works](/guide/how-search-works) — hybrid vector + FTS5 explained +- [Configuration Reference](/reference/configuration) — embedding providers, LLM setup diff --git a/package.json b/package.json index c77c93c..c31b49b 100644 --- a/package.json +++ b/package.json @@ -9,7 +9,12 @@ }, "exports": { ".": "./dist/core/index.js", - "./mcp": "./dist/mcp/server.js" + "./mcp": "./dist/mcp/server.js", + "./lite": { + "import": "./dist/lite/index.js", + "require": "./dist/lite/index.js", + "types": "./dist/lite/index.d.ts" + } }, "files": [ "dist/" @@ -90,6 +95,18 @@ "vitepress": "^1.6.4", "vitest": "^4.0.18" }, + "peerDependencies": { + "tree-sitter": "^0.21.0", + "tree-sitter-javascript": "^0.21.0", + "tree-sitter-typescript": "^0.21.0", + "tree-sitter-python": "^0.21.0" + }, + "peerDependenciesMeta": { + "tree-sitter": { "optional": true }, + "tree-sitter-javascript": { "optional": true }, + "tree-sitter-typescript": { "optional": true }, + "tree-sitter-python": { "optional": true } + }, "lint-staged": { "*.ts": [ "eslint --fix", diff --git a/src/lite/chunker-treesitter.ts b/src/lite/chunker-treesitter.ts new file mode 100644 index 0000000..239fc33 --- /dev/null +++ b/src/lite/chunker-treesitter.ts @@ -0,0 +1,356 @@ +/** + * Tree-sitter based code-aware chunker. + * + * Splits source code at function/class boundaries using tree-sitter AST parsing. + * tree-sitter and its grammar packages are optional peer dependencies — + * this module is only loaded dynamically when available. 
+ */ + +import { ValidationError } from "../errors.js"; + +/** A semantically meaningful chunk of source code. */ +export interface CodeChunk { + /** The source text of this chunk. */ + content: string; + /** 1-based start line in the original file. */ + startLine: number; + /** 1-based end line in the original file. */ + endLine: number; + /** The tree-sitter node type (e.g. "function_declaration", "class_definition"). */ + nodeType: string; +} + +/** Minimal tree-sitter node shape for type safety without importing tree-sitter types. */ +interface TSNode { + type: string; + text: string; + startPosition: { row: number; column: number }; + endPosition: { row: number; column: number }; + childCount: number; + child(index: number): TSNode | null; + namedChildCount: number; + namedChild(index: number): TSNode | null; +} + +/** Minimal tree-sitter tree shape. */ +interface TSTree { + rootNode: TSNode; +} + +/** Minimal tree-sitter parser shape. */ +interface TSParser { + setLanguage(language: unknown): void; + parse(input: string): TSTree; +} + +/** Canonical language name used internally. */ +type SupportedLanguage = "typescript" | "javascript" | "python"; + +/** Map from user-facing aliases to canonical names. */ +const LANGUAGE_ALIASES: Record = { + typescript: "typescript", + ts: "typescript", + tsx: "typescript", + javascript: "javascript", + js: "javascript", + jsx: "javascript", + mjs: "javascript", + cjs: "javascript", + python: "python", + py: "python", +}; + +/** Node types to treat as chunk boundaries per language. 
*/
const CHUNK_NODE_TYPES: Record<SupportedLanguage, ReadonlySet<string>> = {
  typescript: new Set([
    "function_declaration",
    "class_declaration",
    "method_definition",
    "export_statement",
    "lexical_declaration",
    "interface_declaration",
    "type_alias_declaration",
    "enum_declaration",
  ]),
  javascript: new Set([
    "function_declaration",
    "class_declaration",
    "method_definition",
    "export_statement",
    "lexical_declaration",
  ]),
  python: new Set(["function_definition", "class_definition", "decorated_definition"]),
};

/** Default upper bound, in characters, for a single chunk. */
const DEFAULT_MAX_CHUNK_SIZE = 1500;

/**
 * Code-aware chunker using tree-sitter.
 *
 * Parses source code into an AST and splits at function/class boundaries,
 * producing semantically meaningful chunks suitable for embedding.
 */
export class TreeSitterChunker {
  /** Parser instance, created on first use and then reused. */
  private parserCache: TSParser | undefined;
  /** Loaded grammars keyed by canonical language name. */
  private readonly grammarCache = new Map<SupportedLanguage, unknown>();

  /** Returns true if the given language (or alias) is supported. Case-insensitive. */
  supports(language: string): boolean {
    return this.resolveLanguage(language) !== undefined;
  }

  /** Resolve a language alias to its canonical name, or undefined if unsupported. */
  private resolveLanguage(language: string): SupportedLanguage | undefined {
    const key = language.toLowerCase();
    // Own-property check: a bare `in` / index lookup would also match inherited
    // Object.prototype members such as "constructor" or "tostring".
    return Object.prototype.hasOwnProperty.call(LANGUAGE_ALIASES, key)
      ? LANGUAGE_ALIASES[key]
      : undefined;
  }

  /**
   * Chunk source code into semantically meaningful pieces using tree-sitter.
   *
   * @param source - The raw source code string.
   * @param language - Language name or alias (e.g. "typescript", "ts", "py").
   * @param maxChunkSize - Maximum characters per chunk (default 1500).
   * @returns Array of CodeChunk with content, line range, and AST node type.
   * @throws ValidationError if tree-sitter is not installed or parsing fails.
+ */ + async chunk( + source: string, + language: string, + maxChunkSize: number = DEFAULT_MAX_CHUNK_SIZE, + ): Promise { + const canonical = this.resolveLanguage(language); + if (canonical === undefined) { + throw new ValidationError(`Unsupported language for code chunking: "${language}"`); + } + + const parser = await this.getParser(); + const grammar = await this.loadGrammar(canonical); + parser.setLanguage(grammar); + + let tree: TSTree; + try { + tree = parser.parse(source); + } catch (err: unknown) { + throw new ValidationError( + `Failed to parse ${canonical} source with tree-sitter: ${err instanceof Error ? err.message : String(err)}`, + err, + ); + } + + const root = tree.rootNode; + const chunkNodeTypes = CHUNK_NODE_TYPES[canonical]; + const rawChunks = this.extractChunks(root, chunkNodeTypes, maxChunkSize); + + // If no declaration nodes found, return the whole source as a single chunk + if (rawChunks.length === 0) { + return [ + { + content: source, + startLine: 1, + endLine: source.split("\n").length, + nodeType: "module", + }, + ]; + } + + return rawChunks; + } + + /** + * Walk top-level children and extract chunks at declaration boundaries. + * Consecutive non-declaration nodes (imports, comments) are accumulated + * and prepended to the next declaration chunk for context. + */ + private extractChunks( + root: TSNode, + chunkNodeTypes: ReadonlySet, + maxChunkSize: number, + ): CodeChunk[] { + const chunks: CodeChunk[] = []; + let preamble = ""; + let preambleStartLine: number | undefined; + + for (let i = 0; i < root.childCount; i++) { + const child = root.child(i); + if (child === null) continue; + + if (chunkNodeTypes.has(child.type)) { + this.flushDeclaration(child, preamble, preambleStartLine, maxChunkSize, chunks); + preamble = ""; + preambleStartLine = undefined; + } else { + const text = child.text.trim(); + if (text) { + preambleStartLine ??= child.startPosition.row + 1; + preamble = preamble ? 
preamble + "\n" + child.text : child.text; + } + } + } + + if (preamble) { + chunks.push({ + content: preamble, + startLine: preambleStartLine ?? 1, + endLine: root.endPosition.row + 1, + nodeType: "trailing", + }); + } + + return chunks; + } + + /** Emit one or more chunks for a declaration node, prepending any accumulated preamble. */ + private flushDeclaration( + child: TSNode, + preamble: string, + preambleStartLine: number | undefined, + maxChunkSize: number, + chunks: CodeChunk[], + ): void { + const content = preamble ? preamble + "\n\n" + child.text : child.text; + const startLine = preambleStartLine ?? child.startPosition.row + 1; + + if (content.length <= maxChunkSize) { + chunks.push({ content, startLine, endLine: child.endPosition.row + 1, nodeType: child.type }); + return; + } + + // Large node — flush preamble separately, then split by children + if (preamble) { + chunks.push({ + content: preamble, + startLine: preambleStartLine ?? startLine, + endLine: child.startPosition.row, + nodeType: "preamble", + }); + } + chunks.push(...this.splitLargeNode(child, maxChunkSize)); + } + + /** + * Split a large declaration node into smaller chunks by recursing into + * its named children (e.g. methods inside a class). + */ + private splitLargeNode(node: TSNode, maxChunkSize: number): CodeChunk[] { + if (node.namedChildCount > 1) { + const chunks = this.accumulateNamedChildren(node, maxChunkSize); + if (chunks.length > 0) return chunks; + } + // Node has ≤1 child or accumulation produced nothing — return as-is + return [ + { + content: node.text, + startLine: node.startPosition.row + 1, + endLine: node.endPosition.row + 1, + nodeType: node.type, + }, + ]; + } + + /** Accumulate named children of a node into size-bounded chunks. 
*/
  private accumulateNamedChildren(node: TSNode, maxChunkSize: number): CodeChunk[] {
    const chunks: CodeChunk[] = [];
    let pending = "";
    let pendingStartLine = node.startPosition.row + 1;
    // Last line actually covered by `pending`. Using the next child's start row
    // instead can yield endLine < startLine when two children share a line
    // (e.g. a class name and its body on `class Foo {`).
    let pendingEndLine = pendingStartLine;

    const flush = (endLine: number): void => {
      if (!pending) return;
      chunks.push({
        content: pending,
        startLine: pendingStartLine,
        endLine,
        nodeType: node.type,
      });
      pending = "";
    };

    for (let i = 0; i < node.namedChildCount; i++) {
      const child = node.namedChild(i);
      if (child === null) continue;

      const childText = child.text;

      // A child that alone exceeds the budget and can be subdivided is split
      // recursively (e.g. class_body -> methods) rather than emitted whole.
      // Recursion always descends one level, so it terminates at the leaves.
      if (childText.length > maxChunkSize && child.namedChildCount > 1) {
        flush(pendingEndLine);
        chunks.push(...this.splitLargeNode(child, maxChunkSize));
        continue;
      }

      // "+ 2" accounts for the blank-line separator inserted between children.
      if (pending && pending.length + childText.length + 2 > maxChunkSize) {
        flush(pendingEndLine);
      }

      if (pending) {
        pending = pending + "\n\n" + childText;
      } else {
        pending = childText;
        pendingStartLine = child.startPosition.row + 1;
      }
      pendingEndLine = child.endPosition.row + 1;
    }

    // The tail chunk extends to the end of the parent node.
    flush(node.endPosition.row + 1);

    return chunks;
  }

  /**
   * Lazily create or return the cached tree-sitter Parser instance.
   *
   * @throws ValidationError (with install instructions) if importing or
   *   constructing the parser fails.
   */
  private async getParser(): Promise<TSParser> {
    if (this.parserCache !== undefined) {
      return this.parserCache;
    }

    try {
      // @ts-expect-error — tree-sitter is an optional peer dependency, not installed at compile time
      const TreeSitter = (await import("tree-sitter")) as Record<string, unknown>;
      // tree-sitter exports vary: could be default export or named
      const resolved = "default" in TreeSitter ? TreeSitter["default"] : TreeSitter;
      const ParserClass = resolved as new () => TSParser;
      const parser = new ParserClass();
      this.parserCache = parser;
      return parser;
    } catch (err: unknown) {
      throw new ValidationError(
        'Code chunking requires the "tree-sitter" package. ' +
          "Install it with: npm install tree-sitter tree-sitter-typescript tree-sitter-javascript tree-sitter-python",
        err,
      );
    }
  }

  /** Lazily load and cache a tree-sitter grammar for the given language.
*/
  private async loadGrammar(language: SupportedLanguage): Promise<unknown> {
    const cached = this.grammarCache.get(language);
    if (cached !== undefined) {
      return cached;
    }

    const packageName = this.grammarPackageName(language);

    try {
      const mod = (await import(packageName)) as Record<string, unknown>;
      const defaultExport = mod["default"];

      // Grammar packages typically export the language as the default export.
      // tree-sitter-typescript exports { typescript, tsx }. Depending on how the
      // CommonJS package is surfaced through dynamic import(), that pair is
      // visible either as named exports on the namespace or only as properties
      // of `default`, so both places are checked.
      let grammar: unknown;
      if (language === "typescript") {
        if ("typescript" in mod) {
          grammar = mod["typescript"];
        } else if (
          defaultExport !== null &&
          (typeof defaultExport === "object" || typeof defaultExport === "function") &&
          "typescript" in defaultExport
        ) {
          grammar = (defaultExport as Record<string, unknown>)["typescript"];
        }
      }

      if (grammar === undefined) {
        // Fallback: default export, else the module itself (some packages
        // export the grammar directly).
        grammar = "default" in mod ? defaultExport : mod;
      }

      // NOTE(review): the "tsx" alias resolves to the plain `typescript` grammar
      // here, not the package's `tsx` grammar — confirm JSX in .tsx files parses
      // acceptably.
      this.grammarCache.set(language, grammar);
      return grammar;
    } catch (err: unknown) {
      throw new ValidationError(
        `Code chunking for ${language} requires the "${packageName}" package. ` +
          `Install it with: npm install ${packageName}`,
        err,
      );
    }
  }

  /** Map canonical language name to its npm grammar package.
*/
  private grammarPackageName(language: SupportedLanguage): string {
    switch (language) {
      case "typescript":
        return "tree-sitter-typescript";
      case "javascript":
        return "tree-sitter-javascript";
      case "python":
        return "tree-sitter-python";
    }
  }
}
diff --git a/src/lite/core.ts b/src/lite/core.ts new file mode 100644 index 0000000..863439d --- /dev/null +++ b/src/lite/core.ts @@ -0,0 +1,182 @@
import Database from "better-sqlite3";
import { homedir } from "node:os";
import { join } from "node:path";
import type { EmbeddingProvider } from "../providers/embedding.js";
import { LocalEmbeddingProvider } from "../providers/local.js";
import { createDatabase } from "../db/connection.js";
import { runMigrations, createVectorTable } from "../db/schema.js";
import { indexDocument } from "../core/indexing.js";
import { searchDocuments } from "../core/search.js";
import { rateDocument } from "../core/ratings.js";
import { askQuestion, getContextForQuestion, type LlmProvider } from "../core/rag.js";
import { normalizeRawInput } from "./normalize.js";
import type {
  LiteOptions,
  LiteDoc,
  RawInput,
  LiteSearchOptions,
  LiteSearchResult,
  LiteContextOptions,
  LiteAskOptions,
} from "./types.js";

/**
 * Embeddable facade over the core indexing, search, rating and RAG functions.
 *
 * Holds one SQLite connection, one embedding provider and an optional LLM
 * provider, and forwards each public method to the matching core function.
 */
export class LibScopeLite {
  private readonly db: Database.Database;
  private readonly provider: EmbeddingProvider;
  private readonly llmProvider: LlmProvider | null;

  /**
   * @param opts - See {@link LiteOptions}. When `opts.db` is supplied it is used
   *   as-is and no setup runs; otherwise a database is opened at `opts.dbPath`
   *   (default `~/.libscope/lite.db`) and migrated.
   */
  constructor(opts: LiteOptions = {}) {
    this.provider = opts.provider ?? new LocalEmbeddingProvider();
    this.llmProvider = opts.llmProvider ?? null;

    if (opts.db !== undefined) {
      // Caller-provided DB: skip all setup (migrations, extension loading, vector table).
      this.db = opts.db;
      return;
    }

    const dbPath = opts.dbPath ?? join(homedir(), ".libscope", "lite.db");
    // createDatabase handles directory creation, WAL mode, pragmas, and sqlite-vec loading.
    this.db = createDatabase(dbPath);
    runMigrations(this.db);
    // Create vector table best-effort (requires sqlite-vec to be loaded).
    try {
      createVectorTable(this.db, this.provider.dimensions);
    } catch {
      /* sqlite-vec not loaded — FTS5 search still works */
    }
  }

  /**
   * Index documents one after another, in array order.
   * `sourceType` defaults to "manual". Rejects on the first failing document.
   */
  async index(docs: LiteDoc[]): Promise<void> {
    for (const doc of docs) {
      await indexDocument(this.db, this.provider, {
        title: doc.title,
        content: doc.content,
        sourceType: doc.sourceType ?? "manual",
        library: doc.library,
        version: doc.version,
        topicId: doc.topicId,
        url: doc.url,
      });
    }
  }

  /**
   * Normalize a raw input (file, URL, text or buffer) and index it.
   *
   * When normalization yields more than one code chunk, every chunk is indexed
   * as its own document titled "<title> (part N)" and the ID of the first part
   * is returned. Otherwise a single document is indexed and its ID returned.
   */
  async indexRaw(input: RawInput): Promise<string> {
    const normalized = await normalizeRawInput(input);
    const parts = normalized.chunks;

    if (parts !== undefined && parts.length > 1) {
      let firstId = "";
      for (const [i, part] of parts.entries()) {
        const indexed = await indexDocument(this.db, this.provider, {
          title: `${normalized.title} (part ${String(i + 1)})`,
          content: part,
          sourceType: "manual",
        });
        if (i === 0) firstId = indexed.id;
      }
      return firstId;
    }

    const indexed = await indexDocument(this.db, this.provider, {
      title: normalized.title,
      content: normalized.content,
      sourceType: "manual",
      url: input.type === "url" ? input.url : undefined,
    });
    return indexed.id;
  }

  /**
   * Index documents with at most `opts.concurrency` running at the same time.
   *
   * Every document is attempted even if an earlier one fails; once all have
   * settled, the first failure (if any) is rethrown. Previously failures were
   * dropped behind a `void`-ed promise, which surfaced as an unhandled
   * rejection while the returned promise still resolved.
   *
   * A concurrency below 1 or NaN is treated as 1; values above `docs.length`
   * are clamped to it.
   */
  async indexBatch(docs: LiteDoc[], opts: { concurrency: number }): Promise<void> {
    const requested = Math.floor(opts.concurrency);
    const limit = Number.isNaN(requested) ? 1 : Math.max(1, requested);
    const workerCount = Math.min(limit, docs.length);

    let cursor = 0;
    let failed = false;
    let firstError: unknown;

    // Each worker pulls the next unclaimed index until the list is exhausted.
    const worker = async (): Promise<void> => {
      while (cursor < docs.length) {
        const doc = docs[cursor++];
        if (doc === undefined) continue; // tolerate holes instead of stalling
        try {
          await this.index([doc]);
        } catch (err: unknown) {
          if (!failed) {
            failed = true;
            firstError = err;
          }
        }
      }
    };

    await Promise.all(Array.from({ length: workerCount }, () => worker()));

    if (failed) {
      throw firstError;
    }
  }

  /**
   * Search indexed chunks. `limit` defaults to 10; the other options are
   * forwarded unchanged to `searchDocuments`.
   */
  async search(query: string, opts?: LiteSearchOptions): Promise<LiteSearchResult[]> {
    const { results } = await searchDocuments(this.db, this.provider, {
      query,
      limit: opts?.limit ?? 10,
      topic: opts?.topic,
      library: opts?.library,
      tags: opts?.tags,
      diversity: opts?.diversity,
    });

    return results.map((hit) => ({
      docId: hit.documentId,
      chunkId: hit.chunkId,
      title: hit.title,
      content: hit.content,
      score: hit.score,
      url: hit.url,
    }));
  }

  /**
   * Return the `contextPrompt` built by `getContextForQuestion`, without
   * calling an LLM. `topK` defaults to 5.
   */
  async getContext(question: string, opts?: LiteContextOptions): Promise<string> {
    const { contextPrompt } = await getContextForQuestion(this.db, this.provider, {
      question,
      topK: opts?.topK ?? 5,
      topic: opts?.topic,
      library: opts?.library,
    });
    return contextPrompt;
  }

  /**
   * Retrieve context and ask the LLM for an answer.
   *
   * @throws Error if neither `opts.llmProvider` nor the constructor option is set.
   */
  async ask(question: string, opts?: LiteAskOptions): Promise<string> {
    const llm = opts?.llmProvider ?? this.llmProvider;
    if (!llm) {
      throw new Error("No LlmProvider configured. Pass llmProvider to constructor or ask() opts.");
    }

    const { answer } = await askQuestion(this.db, this.provider, llm, {
      question,
      topK: opts?.topK ?? 5,
      topic: opts?.topic,
      library: opts?.library,
      systemPrompt: opts?.systemPrompt,
    });
    return answer;
  }

  /**
   * Streaming variant of {@link ask}: yields whatever the provider's
   * `completeStream` yields.
   *
   * @throws Error if no provider is configured or it lacks `completeStream`.
   */
  async *askStream(question: string, opts?: LiteAskOptions): AsyncGenerator<string> {
    const llm = opts?.llmProvider ?? this.llmProvider;
    if (!llm) {
      throw new Error("No LlmProvider configured.");
    }
    if (!llm.completeStream) {
      throw new Error("This LlmProvider does not support streaming.");
    }

    // NOTE(review): only the context prompt is sent as the prompt; presumably
    // getContextForQuestion embeds the question in it — verify in core/rag.ts.
    const prompt = await this.getContext(question, opts);
    yield* llm.completeStream(prompt, opts?.systemPrompt);
  }

  /** Record a rating for a document by delegating to `rateDocument`. */
  rate(docId: string, score: number): void {
    rateDocument(this.db, { documentId: docId, rating: score });
  }

  /** Close the underlying database connection. */
  close(): void {
    this.db.close();
  }
}
diff --git a/src/lite/index.ts b/src/lite/index.ts new file mode 100644 index 0000000..fd87f96 --- /dev/null +++ b/src/lite/index.ts @@ -0,0 +1,11 @@
export { LibScopeLite } from "./core.js";
// The README and API reference import TreeSitterChunker / CodeChunk from
// "libscope/lite". The module has no static tree-sitter import (the parser is
// loaded lazily), so re-exporting it is safe when tree-sitter is absent.
export { TreeSitterChunker } from "./chunker-treesitter.js";
export type { CodeChunk } from "./chunker-treesitter.js";
export type {
  LiteOptions,
  LiteDoc,
  RawInput,
  LiteSearchOptions,
  LiteSearchResult,
  LiteContextOptions,
  LiteAskOptions,
} from "./types.js";
export type { LlmProvider } from "../core/rag.js";
diff --git a/src/lite/normalize.ts b/src/lite/normalize.ts new file mode 100644 index 0000000..8366b02 --- /dev/null +++ b/src/lite/normalize.ts @@ -0,0 +1,100 @@
import { readFileSync } from "node:fs";
import { basename, extname } from "node:path";
import { getParserForFile } from "../core/parsers/index.js";
import { fetchAndConvert } from "../core/url-fetcher.js";
import type { RawInput } from "./types.js";

/** Result of normalizing a raw input before indexing. */
export interface NormalizedInput {
  /** Title to index the document under. */
  title: string;
  /** Full content, or the first code chunk when `chunks` is set. */
  content: string;
  /** Code chunks, present only when the tree-sitter chunker was used. */
  chunks?: string[];
}

// Code extensions that trigger tree-sitter attempt
const CODE_EXTENSIONS = new Set([".ts", ".tsx", ".js", ".jsx", ".mjs", ".cjs", ".py"]);

type TreeSitterChunkerType = import("./chunker-treesitter.js").TreeSitterChunker;
// Module-level memo: the chunker module is imported at most once per process.
let treeSitterChunker: TreeSitterChunkerType | null = null;
let treeSitterLoaded = false;

/** Return the shared chunker instance, or null if its module could not be loaded. */
async function getTreeSitterChunker(): Promise<TreeSitterChunkerType | null> {
  if (treeSitterLoaded) return treeSitterChunker;
  treeSitterLoaded = true;
  try {
    const { TreeSitterChunker } = await import("./chunker-treesitter.js");
    treeSitterChunker = new TreeSitterChunker();
  } catch {
    /* optional dep not
installed — graceful fallback */
  }
  return treeSitterChunker;
}

/** Map a dotted file extension (e.g. ".ts") to the language name passed to the chunker. */
function extToLang(ext: string): string {
  const map: Record<string, string> = {
    ts: "typescript",
    tsx: "typescript",
    js: "javascript",
    jsx: "javascript",
    mjs: "javascript",
    cjs: "javascript",
    py: "python",
  };
  const bare = ext.slice(1);
  return map[bare] ?? bare;
}

/**
 * Try to split a code file into tree-sitter chunks.
 *
 * @param buffer - Raw file bytes, decoded as UTF-8.
 * @param ext - Lower-cased extension including the dot.
 * @returns Chunk texts, or undefined when the extension is not a code
 *   extension, the chunker is unavailable, or chunking throws.
 */
async function chunkCodeBuffer(buffer: Buffer, ext: string): Promise<string[] | undefined> {
  if (!CODE_EXTENSIONS.has(ext)) return undefined;

  const chunker = await getTreeSitterChunker();
  const lang = extToLang(ext);
  if (!chunker?.supports(lang)) return undefined;

  try {
    const codeChunks = await chunker.chunk(buffer.toString("utf-8"), lang);
    return codeChunks.map((c) => c.content);
  } catch {
    // tree-sitter and its grammars are optional and loaded lazily inside
    // chunk(), so a missing package only shows up here. Deliberate best-effort:
    // fall back to the plain parser/text path instead of failing the index.
    return undefined;
  }
}

/**
 * Convert a raw input into a title plus indexable content.
 *
 * - "text": returned as-is.
 * - "file" / "buffer": code extensions are chunked via tree-sitter when
 *   possible; otherwise the matching file parser is used, or the bytes are
 *   decoded as UTF-8 when no parser matches.
 * - "url": fetched and converted by `fetchAndConvert`.
 *
 * The default title is the file name without its extension (any letter case),
 * or the fetched page title for URLs.
 */
export async function normalizeRawInput(input: RawInput): Promise<NormalizedInput> {
  switch (input.type) {
    case "text":
      return { title: input.title, content: input.content };

    case "file": {
      // Strip the extension exactly as written: basename() matches the suffix
      // case-sensitively, so the lower-cased form would leave ".MD" in place.
      const rawExt = extname(input.path);
      const buf = readFileSync(input.path);
      const title = input.title ?? basename(input.path, rawExt);

      const chunks = await chunkCodeBuffer(buf, rawExt.toLowerCase());
      if (chunks !== undefined) {
        return { title, content: chunks[0] ?? "", chunks };
      }

      const parser = getParserForFile(input.path);
      const content = parser ? await parser.parse(buf) : buf.toString("utf-8");
      return { title, content };
    }

    case "buffer": {
      const rawExt = extname(input.filename);
      const title = input.title ?? basename(input.filename, rawExt);

      const chunks = await chunkCodeBuffer(input.buffer, rawExt.toLowerCase());
      if (chunks !== undefined) {
        return { title, content: chunks[0] ?? "", chunks };
      }

      const parser = getParserForFile(input.filename);
      const content = parser ? await parser.parse(input.buffer) : input.buffer.toString("utf-8");
      return { title, content };
    }

    case "url": {
      const fetched = await fetchAndConvert(input.url);
      return { title: input.title ?? fetched.title, content: fetched.content };
    }
  }
}
diff --git a/src/lite/types.ts b/src/lite/types.ts new file mode 100644 index 0000000..600303b --- /dev/null +++ b/src/lite/types.ts @@ -0,0 +1,62 @@
import type Database from "better-sqlite3";
import type { EmbeddingProvider } from "../providers/embedding.js";
import type { LlmProvider } from "../core/rag.js";

/** Constructor options for LibScopeLite. */
export interface LiteOptions {
  /** Path to SQLite database file. Defaults to ~/.libscope/lite.db. Use ':memory:' for in-memory. */
  dbPath?: string | undefined;
  /** Pre-configured database instance. When provided, dbPath is ignored and no migrations or
   * vector table setup are performed — caller is responsible for schema initialization. */
  db?: Database.Database | undefined;
  /** Embedding provider; a LocalEmbeddingProvider is created when omitted. */
  provider?: EmbeddingProvider | undefined;
  /** NOTE(review): not read by the LibScopeLite constructor — confirm intended use. */
  model?: string | undefined;
  /** LLM provider used by ask() and askStream(). */
  llmProvider?: LlmProvider | undefined;
}

/** A pre-parsed document passed to index() / indexBatch(). */
export interface LiteDoc {
  title: string;
  content: string;
  url?: string | undefined;
  /** Defaults to "manual" when indexed through LibScopeLite.index(). */
  sourceType?: "library" | "topic" | "manual" | "model-generated" | undefined;
  library?: string | undefined;
  version?: string | undefined;
  topicId?: string | undefined;
  /** NOTE(review): not forwarded by LibScopeLite.index() — confirm intended use. */
  language?: string | undefined;
}

/** Input accepted by indexRaw(), discriminated by `type`. */
export type RawInput =
  | { type: "file"; path: string; title?: string | undefined }
  | { type: "url"; url: string; title?: string | undefined }
  | { type: "text"; content: string; title: string }
  | { type: "buffer"; buffer: Buffer; filename: string; title?: string | undefined };

/** Options for search(). */
export interface LiteSearchOptions {
  /** Maximum number of results; search() uses 10 when omitted. */
  limit?: number | undefined;
  topic?: string | undefined;
  library?: string | undefined;
  tags?: string[] | undefined;
  diversity?: number | undefined;
}

/** One search hit, as returned by search(). */
export interface LiteSearchResult {
  docId: string;
  chunkId: string;
  title: string;
  content: string;
  score: number;
  url: string | null;
}

/** Options for getContext(). */
export interface LiteContextOptions {
  /** Number of chunks to retrieve; getContext() uses 5 when omitted. */
  topK?: number | undefined;
  topic?: string | undefined;
  library?: string | undefined;
}

export interface LiteAskOptions {
topK?: number | undefined; + topic?: string | undefined; + library?: string | undefined; + systemPrompt?: string | undefined; + llmProvider?: LlmProvider | undefined; +} diff --git a/tests/fixtures/mock-provider.ts b/tests/fixtures/mock-provider.ts index 143a617..d8977c0 100644 --- a/tests/fixtures/mock-provider.ts +++ b/tests/fixtures/mock-provider.ts @@ -23,16 +23,17 @@ export class MockEmbeddingProvider implements EmbeddingProvider { /** Simple deterministic hash → 4D unit vector. */ private hashToVector(text: string): number[] { - let hash = 0; + let hash = 5381; // Non-zero seed avoids the zero-hash collapse for (let i = 0; i < text.length; i++) { - hash = Math.trunc(hash * 31 + text.codePointAt(i)!); + hash = Math.trunc((hash * 33) ^ text.codePointAt(i)!); } const a = Math.sin(hash) * 10000; const b = Math.sin(hash + 1) * 10000; const c = Math.sin(hash + 2) * 10000; const d = Math.sin(hash + 3) * 10000; - // Normalize + // Normalize — guard against zero magnitude (hash collision to 0) const mag = Math.hypot(a, b, c, d); + if (mag === 0) return [1, 0, 0, 0]; return [a / mag, b / mag, c / mag, d / mag]; } } diff --git a/tests/integration/lite-embed.test.ts b/tests/integration/lite-embed.test.ts new file mode 100644 index 0000000..6a48e9c --- /dev/null +++ b/tests/integration/lite-embed.test.ts @@ -0,0 +1,172 @@ +import { describe, it, expect, beforeAll, afterAll } from "vitest"; +import { LibScopeLite } from "../../src/lite/index.js"; +import { MockEmbeddingProvider } from "../fixtures/mock-provider.js"; + +/** + * Integration test: full LibScopeLite workflow. + * + * Uses a real in-memory SQLite database with MockEmbeddingProvider + * to exercise the complete pipeline: indexBatch → search → getContext → rate. + */ +describe("LibScopeLite integration", () => { + let lite: LibScopeLite; + let provider: MockEmbeddingProvider; + + const corpus = [ + { + title: "React useState Hook", + content: + "The useState hook lets you add state to functional components. 
" + + "Call useState with the initial state value and it returns an array with " + + "the current state and a setter function. Re-renders happen when state changes.", + }, + { + title: "React useEffect Hook", + content: + "useEffect runs side effects in functional components. " + + "Pass a function and a dependency array. The effect re-runs when dependencies change. " + + "Return a cleanup function for subscriptions or timers.", + }, + { + title: "TypeScript Generics", + content: + "Generics allow creating reusable components that work with multiple types. " + + "Use angle brackets to declare type parameters. " + + "Constraints narrow what types are accepted using the extends keyword.", + }, + { + title: "Node.js Event Loop", + content: + "The Node.js event loop processes callbacks in phases: timers, pending, idle, " + + "poll, check, and close. setTimeout and setInterval run in the timers phase. " + + "setImmediate runs in the check phase, after I/O callbacks.", + }, + { + title: "SQL Indexes", + content: + "Database indexes speed up queries by creating sorted data structures. " + + "B-tree indexes are the default in most databases. 
" + + "Composite indexes cover multiple columns and follow the leftmost prefix rule.", + }, + ]; + + beforeAll(async () => { + provider = new MockEmbeddingProvider(); + lite = new LibScopeLite({ dbPath: ":memory:", provider }); + await lite.indexBatch(corpus, { concurrency: 2 }); + }); + + afterAll(() => { + lite.close(); + }); + + describe("indexBatch → search", () => { + it("should find indexed documents via search", async () => { + const results = await lite.search("React hooks"); + expect(results.length).toBeGreaterThan(0); + }); + + it("should return results with all expected fields", async () => { + const results = await lite.search("generics"); + expect(results.length).toBeGreaterThan(0); + + const r = results[0]!; + expect(typeof r.docId).toBe("string"); + expect(typeof r.chunkId).toBe("string"); + expect(typeof r.title).toBe("string"); + expect(typeof r.content).toBe("string"); + expect(typeof r.score).toBe("number"); + expect(r.score).toBeGreaterThan(0); + }); + + it("should respect the limit option", async () => { + const results = await lite.search("Node.js", { limit: 2 }); + expect(results.length).toBeLessThanOrEqual(2); + }); + + it("should return results for different queries", async () => { + const r1 = await lite.search("React useState"); + const r2 = await lite.search("SQL database index"); + + expect(r1.length).toBeGreaterThan(0); + expect(r2.length).toBeGreaterThan(0); + }); + }); + + describe("getContext", () => { + it("should return a context prompt string containing relevant content", async () => { + const context = await lite.getContext("How does the Node.js event loop work?"); + expect(typeof context).toBe("string"); + expect(context.length).toBeGreaterThan(0); + }); + + it("should include question in context", async () => { + const context = await lite.getContext("What are TypeScript generics?"); + // The context prompt typically includes the question + expect(context).toContain("TypeScript generics"); + }); + }); + + describe("rate", () 
=> { + it("should rate a document found via search", async () => { + const results = await lite.search("React hooks"); + expect(results.length).toBeGreaterThan(0); + + const docId = results[0]!.docId; + // Should not throw + lite.rate(docId, 5); + lite.rate(docId, 3); + }); + + it("should reject invalid ratings", async () => { + // We need a valid doc ID first + const rateInvalid = async (): Promise<void> => { + const results = await lite.search("React"); + const docId = results[0]!.docId; + lite.rate(docId, 0); // 0 is out of range + }; + await expect(rateInvalid()).rejects.toThrow(); + }); + }); + + describe("full pipeline: index → search → getContext → rate", () => { + it("should execute the complete workflow end-to-end", async () => { + // 1. Index additional docs + const extraLite = new LibScopeLite({ + dbPath: ":memory:", + provider: new MockEmbeddingProvider(), + }); + await extraLite.index([ + { + title: "Docker Basics", + content: + "Docker containers package applications with their dependencies. " + + "Images are built from Dockerfiles. Containers run as isolated processes.", + library: "docker", + }, + { + title: "Kubernetes Pods", + content: + "Kubernetes pods are the smallest deployable units. " + + "A pod can contain one or more containers sharing network and storage.", + library: "kubernetes", + }, + ]); + + // 2. Search + const searchResults = await extraLite.search("Docker containers"); + expect(searchResults.length).toBeGreaterThan(0); + expect(searchResults[0]!.title).toBeDefined(); + + // 3. Get context + const context = await extraLite.getContext("How does Docker work?"); + expect(context.length).toBeGreaterThan(0); + + // 4. 
Rate + const docId = searchResults[0]!.docId; + extraLite.rate(docId, 4); + + extraLite.close(); + }); + }); +}); diff --git a/tests/unit/code-chunker.test.ts b/tests/unit/code-chunker.test.ts new file mode 100644 index 0000000..5c29d16 --- /dev/null +++ b/tests/unit/code-chunker.test.ts @@ -0,0 +1,315 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { TreeSitterChunker } from "../../src/lite/chunker-treesitter.js"; +import { ValidationError } from "../../src/errors.js"; + +describe("TreeSitterChunker", () => { + let chunker: TreeSitterChunker; + + beforeEach(() => { + chunker = new TreeSitterChunker(); + }); + + describe("supports()", () => { + it("should return true for TypeScript aliases", () => { + expect(chunker.supports("typescript")).toBe(true); + expect(chunker.supports("ts")).toBe(true); + expect(chunker.supports("tsx")).toBe(true); + }); + + it("should return true for JavaScript aliases", () => { + expect(chunker.supports("javascript")).toBe(true); + expect(chunker.supports("js")).toBe(true); + expect(chunker.supports("jsx")).toBe(true); + expect(chunker.supports("mjs")).toBe(true); + expect(chunker.supports("cjs")).toBe(true); + }); + + it("should return true for Python aliases", () => { + expect(chunker.supports("python")).toBe(true); + expect(chunker.supports("py")).toBe(true); + }); + + it("should be case-insensitive", () => { + expect(chunker.supports("TypeScript")).toBe(true); + expect(chunker.supports("PYTHON")).toBe(true); + expect(chunker.supports("Js")).toBe(true); + }); + + it("should return false for unsupported languages", () => { + expect(chunker.supports("rust")).toBe(false); + expect(chunker.supports("go")).toBe(false); + expect(chunker.supports("java")).toBe(false); + expect(chunker.supports("c++")).toBe(false); + expect(chunker.supports("")).toBe(false); + }); + }); + + describe("chunk() — language validation", () => { + it("should throw ValidationError for unsupported language", async () => { + await 
expect(chunker.chunk("fn main() {}", "rust")).rejects.toThrow(ValidationError); + await expect(chunker.chunk("fn main() {}", "rust")).rejects.toThrow( + 'Unsupported language for code chunking: "rust"', + ); + }); + + it("should throw ValidationError for empty language string", async () => { + await expect(chunker.chunk("code", "")).rejects.toThrow(ValidationError); + }); + }); + + describe("chunk() — tree-sitter not installed", () => { + it("should throw ValidationError with install instructions when tree-sitter is missing", async () => { + // tree-sitter is not installed in test environment, so chunk() should fail gracefully + // If it does happen to be installed, this test is still valid — it just takes the other path + try { + await chunker.chunk("const x = 1;", "typescript"); + // If tree-sitter IS installed, we skip this assertion + } catch (err: unknown) { + expect(err).toBeInstanceOf(ValidationError); + expect((err as ValidationError).message).toMatch(/tree-sitter/i); + expect((err as ValidationError).message).toMatch(/npm install/i); + } + }); + }); + + describe("chunk() — with mocked tree-sitter", () => { + interface MockNode { + type: string; + text: string; + startPosition: { row: number; column: number }; + endPosition: { row: number; column: number }; + childCount: number; + child: (i: number) => MockNode | null; + namedChildCount: number; + namedChild: (i: number) => MockNode | null; + } + + /** + * Helper: create a mock TSNode that simulates tree-sitter node shape. + */ + function makeMockNode( + type: string, + text: string, + startRow: number, + endRow: number, + children: MockNode[] = [], + ): MockNode { + return { + type, + text, + startPosition: { row: startRow, column: 0 }, + endPosition: { row: endRow, column: 0 }, + childCount: children.length, + child: (i: number) => children[i] ?? null, + namedChildCount: children.length, + namedChild: (i: number) => children[i] ?? 
null, + }; + } + + /** + * Create a chunker with mocked tree-sitter internals for testing + * the algorithm without requiring tree-sitter to be installed. + */ + function createMockedChunker( + rootChildren: ReturnType<typeof makeMockNode>[], + ): TreeSitterChunker { + const instance = new TreeSitterChunker(); + + const rootNode = makeMockNode("program", "", 0, 100, rootChildren); + + // Mock the private getParser and loadGrammar methods + // @ts-expect-error — accessing private method for testing + instance.getParser = vi.fn().mockResolvedValue({ + setLanguage: vi.fn(), + parse: vi.fn().mockReturnValue({ rootNode }), + }); + // @ts-expect-error — accessing private method for testing + instance.loadGrammar = vi.fn().mockResolvedValue({}); + + return instance; + } + + it("should chunk TypeScript code at function boundaries", async () => { + const importNode = makeMockNode("import_statement", 'import { foo } from "bar";', 0, 0); + const fn1 = makeMockNode( + "function_declaration", + "function greet() {\n return 'hi';\n}", + 2, + 4, + ); + const fn2 = makeMockNode( + "function_declaration", + "function farewell() {\n return 'bye';\n}", + 6, + 8, + ); + + const chunker = createMockedChunker([importNode, fn1, fn2]); + const chunks = await chunker.chunk("unused — mocked", "typescript"); + + expect(chunks.length).toBe(2); + + // First function should include the preamble (import) + expect(chunks[0]?.content).toContain('import { foo } from "bar"'); + expect(chunks[0]?.content).toContain("function greet()"); + expect(chunks[0]?.nodeType).toBe("function_declaration"); + + // Second function standalone + expect(chunks[1]?.content).toContain("function farewell()"); + expect(chunks[1]?.nodeType).toBe("function_declaration"); + }); + + it("should chunk at class declaration boundaries", async () => { + const cls = makeMockNode("class_declaration", "class Foo {\n bar() {}\n}", 0, 2); + + const chunker = createMockedChunker([cls]); + const chunks = await chunker.chunk("unused", "typescript"); + + 
expect(chunks.length).toBe(1); + expect(chunks[0]?.nodeType).toBe("class_declaration"); + expect(chunks[0]?.content).toContain("class Foo"); + }); + + it("should return whole source as single chunk when no declarations found", async () => { + // Empty program with no children — extractChunks returns [] + const instance = new TreeSitterChunker(); + const source = "// just a comment\n"; + + const rootNode = makeMockNode("program", source, 0, 1, []); + + // @ts-expect-error — accessing private method for testing + instance.getParser = vi.fn().mockResolvedValue({ + setLanguage: vi.fn(), + parse: vi.fn().mockReturnValue({ rootNode }), + }); + // @ts-expect-error — accessing private method for testing + instance.loadGrammar = vi.fn().mockResolvedValue({}); + + const chunks = await instance.chunk(source, "typescript"); + + expect(chunks.length).toBe(1); + expect(chunks[0]?.nodeType).toBe("module"); + expect(chunks[0]?.startLine).toBe(1); + }); + + it("should accumulate preamble (imports/comments) into first declaration", async () => { + const imp1 = makeMockNode("import_statement", 'import a from "a";', 0, 0); + const imp2 = makeMockNode("import_statement", 'import b from "b";', 1, 1); + const fn = makeMockNode("function_declaration", "function main() {}", 3, 3); + + const chunker = createMockedChunker([imp1, imp2, fn]); + const chunks = await chunker.chunk("unused", "ts"); + + expect(chunks.length).toBe(1); + expect(chunks[0]?.content).toContain('import a from "a"'); + expect(chunks[0]?.content).toContain('import b from "b"'); + expect(chunks[0]?.content).toContain("function main()"); + }); + + it("should handle trailing non-declaration content", async () => { + const fn = makeMockNode("function_declaration", "function foo() {}", 0, 0); + const trailing = makeMockNode("expression_statement", "console.log('done');", 2, 2); + + const chunker = createMockedChunker([fn, trailing]); + const chunks = await chunker.chunk("unused", "js"); + + expect(chunks.length).toBe(2); + 
expect(chunks[0]?.nodeType).toBe("function_declaration"); + expect(chunks[1]?.nodeType).toBe("trailing"); + expect(chunks[1]?.content).toContain("console.log"); + }); + + it("should split oversized nodes by recursing into children", async () => { + const method1 = makeMockNode("method_definition", "a".repeat(100), 1, 3); + const method2 = makeMockNode("method_definition", "b".repeat(100), 4, 6); + + const bigClass = makeMockNode( + "class_declaration", + "a".repeat(100) + "\n\n" + "b".repeat(100), + 0, + 6, + [method1, method2], + ); + + // Use a small maxChunkSize to trigger splitting + const chunker = createMockedChunker([bigClass]); + const chunks = await chunker.chunk("unused", "typescript", 150); + + // Should have been split into multiple chunks + expect(chunks.length).toBeGreaterThan(1); + }); + + it("should produce correct startLine and endLine (1-based)", async () => { + const fn = makeMockNode("function_declaration", "function test() {}", 5, 10); + + const chunker = createMockedChunker([fn]); + const chunks = await chunker.chunk("unused", "typescript"); + + expect(chunks[0]?.startLine).toBe(6); // 0-based row 5 → 1-based line 6 + expect(chunks[0]?.endLine).toBe(11); // 0-based row 10 → 1-based line 11 + }); + + it("should support Python function_definition nodes", async () => { + const fn = makeMockNode("function_definition", "def hello():\n pass", 0, 1); + + const chunker = createMockedChunker([fn]); + const chunks = await chunker.chunk("unused", "python"); + + expect(chunks.length).toBe(1); + expect(chunks[0]?.content).toContain("def hello()"); + }); + + it("should support Python class_definition nodes", async () => { + const cls = makeMockNode( + "class_definition", + "class MyClass:\n def __init__(self):\n pass", + 0, + 2, + ); + + const chunker = createMockedChunker([cls]); + const chunks = await chunker.chunk("unused", "py"); + + expect(chunks.length).toBe(1); + expect(chunks[0]?.nodeType).toBe("class_definition"); + }); + + it("should handle empty 
source returning single module chunk", async () => { + // No children means extractChunks returns empty → falls back to whole source + const instance = new TreeSitterChunker(); + const rootNode = makeMockNode("program", "", 0, 0, []); + + // @ts-expect-error — accessing private method for testing + instance.getParser = vi.fn().mockResolvedValue({ + setLanguage: vi.fn(), + parse: vi.fn().mockReturnValue({ rootNode }), + }); + // @ts-expect-error — accessing private method for testing + instance.loadGrammar = vi.fn().mockResolvedValue({}); + + const chunks = await instance.chunk("", "typescript"); + + expect(chunks.length).toBe(1); + expect(chunks[0]?.nodeType).toBe("module"); + }); + + it("should handle parse failure with ValidationError", async () => { + const instance = new TreeSitterChunker(); + + // @ts-expect-error — accessing private method for testing + instance.getParser = vi.fn().mockResolvedValue({ + setLanguage: vi.fn(), + parse: vi.fn().mockImplementation(() => { + throw new Error("Parse error"); + }), + }); + // @ts-expect-error — accessing private method for testing + instance.loadGrammar = vi.fn().mockResolvedValue({}); + + await expect(instance.chunk("bad code", "typescript")).rejects.toThrow(ValidationError); + await expect(instance.chunk("bad code", "typescript")).rejects.toThrow( + "Failed to parse typescript source", + ); + }); + }); +}); diff --git a/tests/unit/lite.test.ts b/tests/unit/lite.test.ts new file mode 100644 index 0000000..a427ea2 --- /dev/null +++ b/tests/unit/lite.test.ts @@ -0,0 +1,272 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from "vitest"; +import { LibScopeLite } from "../../src/lite/index.js"; +import { MockEmbeddingProvider } from "../fixtures/mock-provider.js"; +import type { LlmProvider } from "../../src/core/rag.js"; + +function* fakeStream(): Generator<string> { + yield "Hello"; + yield " world"; +} + +describe("LibScopeLite", () => { + let lite: LibScopeLite; + let provider: MockEmbeddingProvider; + + 
beforeEach(() => { + provider = new MockEmbeddingProvider(); + lite = new LibScopeLite({ dbPath: ":memory:", provider }); + }); + + afterEach(() => { + lite.close(); + }); + + describe("constructor", () => { + it("should create an instance with in-memory DB", () => { + expect(lite).toBeInstanceOf(LibScopeLite); + }); + + it("should accept custom embedding provider", () => { + const custom = new MockEmbeddingProvider(); + const instance = new LibScopeLite({ dbPath: ":memory:", provider: custom }); + expect(instance).toBeInstanceOf(LibScopeLite); + instance.close(); + }); + }); + + describe("index()", () => { + it("should index a single document", async () => { + await lite.index([{ title: "Test Doc", content: "This is test content for indexing." }]); + + expect(provider.embedBatchCallCount).toBeGreaterThan(0); + }); + + it("should index multiple documents", async () => { + await lite.index([ + { title: "Doc A", content: "Content of document A about TypeScript." }, + { title: "Doc B", content: "Content of document B about Python." 
}, + ]); + + // Both docs should have been processed + expect(provider.embedBatchCallCount).toBeGreaterThanOrEqual(2); + }); + + it("should index with optional metadata fields", async () => { + await lite.index([ + { + title: "Library Doc", + content: "React documentation content here.", + library: "react", + sourceType: "library", + version: "18.0.0", + url: "https://react.dev", + }, + ]); + + // Should succeed without errors + expect(provider.embedBatchCallCount).toBeGreaterThan(0); + }); + }); + + describe("indexBatch()", () => { + it("should index documents with concurrency control", async () => { + const docs = Array.from({ length: 5 }, (_, i) => ({ + title: `Batch Doc ${i}`, + content: `Batch content number ${i} with enough text to be meaningful.`, + })); + + await lite.indexBatch(docs, { concurrency: 2 }); + + expect(provider.embedBatchCallCount).toBe(5); + }); + + it("should handle empty array", async () => { + await lite.indexBatch([], { concurrency: 2 }); + expect(provider.embedBatchCallCount).toBe(0); + }); + + it("should handle concurrency of 1 (sequential)", async () => { + const docs = [ + { title: "A", content: "Content A for sequential test." }, + { title: "B", content: "Content B for sequential test." }, + ]; + + await lite.indexBatch(docs, { concurrency: 1 }); + expect(provider.embedBatchCallCount).toBe(2); + }); + }); + + describe("search()", () => { + beforeEach(async () => { + await lite.index([ + { + title: "React Hooks", + content: "useState and useEffect are the most common React hooks.", + }, + { title: "Vue Composition", content: "Vue 3 composition API uses setup function." }, + { title: "Angular DI", content: "Angular uses dependency injection pattern extensively." 
}, + ]); + }); + + it("should return search results", async () => { + const results = await lite.search("React hooks"); + expect(results.length).toBeGreaterThan(0); + }); + + it("should return results with expected shape", async () => { + const results = await lite.search("React"); + const first = results[0]; + expect(first).toBeDefined(); + expect(first).toHaveProperty("docId"); + expect(first).toHaveProperty("chunkId"); + expect(first).toHaveProperty("title"); + expect(first).toHaveProperty("content"); + expect(first).toHaveProperty("score"); + expect(first).toHaveProperty("url"); + expect(typeof first?.score).toBe("number"); + }); + + it("should respect limit option", async () => { + const results = await lite.search("API", { limit: 1 }); + expect(results.length).toBeLessThanOrEqual(1); + }); + }); + + describe("getContext()", () => { + beforeEach(async () => { + await lite.index([ + { + title: "Node.js Streams", + content: "Readable streams in Node.js are a fundamental pattern.", + }, + ]); + }); + + it("should return a context string", async () => { + const context = await lite.getContext("How do Node.js streams work?"); + expect(typeof context).toBe("string"); + expect(context.length).toBeGreaterThan(0); + }); + }); + + describe("ask()", () => { + it("should throw when no LlmProvider is configured", async () => { + await lite.index([{ title: "Test", content: "Some content for testing ask." 
}]); + await expect(lite.ask("What is this about?")).rejects.toThrow("No LlmProvider configured"); + }); + + it("should call LlmProvider.complete with context", async () => { + // Declare the spy separately so we never reference it as an object method + const completeSpy = vi.fn().mockResolvedValue({ text: "Mocked LLM response" }); + const mockLlm: LlmProvider = { model: "test-model", complete: completeSpy }; + + const liteWithLlm = new LibScopeLite({ + dbPath: ":memory:", + provider, + llmProvider: mockLlm, + }); + + await liteWithLlm.index([ + { title: "Test Doc", content: "Information about testing patterns." }, + ]); + + const answer = await liteWithLlm.ask("What are testing patterns?"); + expect(answer).toBe("Mocked LLM response"); + expect(completeSpy).toHaveBeenCalledOnce(); + expect(completeSpy.mock.calls[0]?.[0]).toContain("testing patterns"); + + liteWithLlm.close(); + }); + + it("should allow llmProvider override in ask() opts", async () => { + const mockLlm: LlmProvider = { + model: "override-model", + complete: vi.fn().mockResolvedValue({ text: "Override response" }), + }; + + await lite.index([{ title: "Test", content: "Some content for LLM." }]); + + const answer = await lite.ask("Question?", { llmProvider: mockLlm }); + expect(answer).toBe("Override response"); + }); + }); + + describe("askStream()", () => { + it("should throw when no LlmProvider is configured", async () => { + await lite.index([{ title: "Test", content: "Content here." 
}]); + const gen = lite.askStream("Question?"); + await expect(gen.next()).rejects.toThrow("No LlmProvider configured"); + }); + + it("should throw when LlmProvider does not support streaming", async () => { + const mockLlm: LlmProvider = { + model: "no-stream", + complete: vi.fn().mockResolvedValue({ text: "done" }), + // No completeStream method + }; + + const liteWithLlm = new LibScopeLite({ + dbPath: ":memory:", + provider, + llmProvider: mockLlm, + }); + + await liteWithLlm.index([{ title: "Test", content: "Content." }]); + + const gen = liteWithLlm.askStream("Question?"); + await expect(gen.next()).rejects.toThrow("does not support streaming"); + + liteWithLlm.close(); + }); + + it("should stream tokens from LlmProvider", async () => { + const mockLlm: LlmProvider = { + model: "stream-model", + complete: vi.fn().mockResolvedValue({ text: "done" }), + completeStream: vi.fn().mockReturnValue(fakeStream()), + }; + + const liteWithLlm = new LibScopeLite({ + dbPath: ":memory:", + provider, + llmProvider: mockLlm, + }); + + await liteWithLlm.index([{ title: "Test", content: "Test content." }]); + + const tokens: string[] = []; + for await (const token of liteWithLlm.askStream("Question?")) { + tokens.push(token); + } + + expect(tokens).toEqual(["Hello", " world"]); + liteWithLlm.close(); + }); + }); + + describe("rate()", () => { + it("should rate an indexed document", async () => { + await lite.index([{ title: "Rate Me", content: "Content to rate." 
}]); + + // Find the doc ID via search + const results = await lite.search("rate"); + expect(results.length).toBeGreaterThan(0); + const docId = results[0]?.docId; + expect(docId).toBeDefined(); + + // Should not throw + lite.rate(docId!, 5); + }); + + it("should throw for nonexistent document", () => { + expect(() => lite.rate("nonexistent-doc", 3)).toThrow(); + }); + }); + + describe("close()", () => { + it("should close the database without error", () => { + const instance = new LibScopeLite({ dbPath: ":memory:", provider }); + expect(() => instance.close()).not.toThrow(); + }); + }); +});