Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion eslint.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ import prettierConfig from "eslint-config-prettier";

export default [
{
files: ["src/**/*.ts", "tests/**/*.ts"],
ignores: ["**/dist/**"],
},
{
files: ["src/**/*.ts", "tests/**/*.ts", "packages/parsers/src/**/*.ts", "packages/parsers/tests/**/*.ts"],
languageOptions: {
parser: tsParser,
parserOptions: {
Expand Down
44 changes: 34 additions & 10 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 12 additions & 18 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,20 @@
"files": [
"dist/"
],
"workspaces": [
"packages/*"
],
"scripts": {
"build": "tsc",
"build": "npm run build --workspace=packages/parsers && tsc",
"dev": "tsc --watch",
"lint": "eslint src/ tests/",
"lint:fix": "eslint src/ tests/ --fix",
"format": "prettier --write 'src/**/*.ts' 'tests/**/*.ts'",
"format:check": "prettier --check 'src/**/*.ts' 'tests/**/*.ts'",
"typecheck": "tsc --noEmit",
"test": "vitest run",
"lint": "eslint src/ tests/ packages/",
"lint:fix": "eslint src/ tests/ packages/ --fix",
"format": "prettier --write 'src/**/*.ts' 'tests/**/*.ts' 'packages/parsers/src/**/*.ts' 'packages/parsers/tests/**/*.ts'",
"format:check": "prettier --check 'src/**/*.ts' 'tests/**/*.ts' 'packages/parsers/src/**/*.ts' 'packages/parsers/tests/**/*.ts'",
"typecheck": "tsc -p tsconfig.typecheck.json",
"test": "npm run test --workspace=packages/parsers && vitest run",
"test:watch": "vitest",
"test:coverage": "vitest run --coverage",
"test:coverage": "npm run test --workspace=packages/parsers && vitest run --coverage",
"prepare": "husky",
"serve": "node dist/mcp/server.js",
"docs:dev": "vitepress dev docs",
Expand Down Expand Up @@ -57,29 +60,20 @@
},
"dependencies": {
"@anthropic-ai/sdk": "^0.78.0",
"@libscope/parsers": "*",
"@modelcontextprotocol/sdk": "^1.0.0",
"@xenova/transformers": "^2.17.2",
"better-sqlite3": "^12.6.2",
"commander": "^14.0.3",
"csv-parse": "^6.1.0",
"epub2": "^3.0.2",
"js-yaml": "^4.1.1",
"node-cron": "^4.2.1",
"node-html-markdown": "^2.0.0",
"openai": "^6.25.0",
"pino": "^10.3.1",
"pizzip": "^3.2.0",
"sqlite-vec": "^0.1.0",
"undici": "^7.24.5",
"zod": "^4.3.6"
},
"optionalDependencies": {
"mammoth": "^1.11.0",
"pdf-parse": "^2.4.5"
},
"devDependencies": {
"@types/better-sqlite3": "^7.6.0",
"@types/js-yaml": "^4.0.9",
"@types/node": "^25.3.3",
"@types/node-cron": "^3.0.11",
"@types/pdf-parse": "^1.1.5",
Expand Down
42 changes: 42 additions & 0 deletions packages/parsers/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"name": "@libscope/parsers",
"version": "1.0.0",
"description": "Format parsers (PDF, DOCX, EPUB, PPTX, CSV, JSON, YAML, HTML → text/markdown) for libscope",
"type": "module",
"main": "./dist/index.js",
"types": "./dist/index.d.ts",
"exports": {
".": "./dist/index.js"
},
"files": [
"dist/"
],
"scripts": {
"build": "tsc",
"test": "vitest run",
"test:coverage": "vitest run --coverage",
"typecheck": "tsc --noEmit"
},
"engines": {
"node": ">=20"
},
"dependencies": {
"csv-parse": "^6.1.0",
"epub2": "^3.0.2",
"js-yaml": "^4.1.1",
"node-html-markdown": "^2.0.0",
"pizzip": "^3.2.0"
},
"optionalDependencies": {
"mammoth": "^1.11.0",
"pdf-parse": "^2.4.5"
},
"devDependencies": {
"@types/js-yaml": "^4.0.9",
"@types/node": "^25.3.3",
"@types/pdf-parse": "^1.1.5",
"@vitest/coverage-v8": "^4.0.18",
"typescript": "^5.6.0",
"vitest": "^4.0.18"
}
}
7 changes: 2 additions & 5 deletions src/core/parsers/csv.ts → packages/parsers/src/csv.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import type { DocumentParser } from "./index.js";
import { ValidationError } from "../../errors.js";
import { ParseError } from "./errors.js";
import { parse } from "csv-parse/sync";

/** Parses CSV files, converting to a Markdown table. */
Expand Down Expand Up @@ -36,10 +36,7 @@ export class CsvParser implements DocumentParser {
return Promise.resolve(lines.join("\n"));
} catch (err) {
return Promise.reject(
new ValidationError(
`Invalid CSV: ${err instanceof Error ? err.message : String(err)}`,
err,
),
new ParseError(`Invalid CSV: ${err instanceof Error ? err.message : String(err)}`, err),
);
}
}
Expand Down
30 changes: 22 additions & 8 deletions src/core/parsers/epub.ts → packages/parsers/src/epub.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,24 @@ import { join } from "node:path";
import { tmpdir } from "node:os";
import { randomUUID } from "node:crypto";
import type { DocumentParser } from "./index.js";
import { ValidationError } from "../../errors.js";
import { ParseError } from "./errors.js";

/**
 * Replace every HTML tag in `input` with a single space.
 *
 * A tag is a `<`, followed by one or more characters other than `>`, followed
 * by the closing `>`. Text that merely contains angle brackets is preserved:
 * a `>` with no preceding `<`, a `<` that is never closed, and the empty pair
 * `<>` are all copied through unchanged, so prose such as "1 < 2" or "a > b"
 * is not lost. Character entities (e.g. `&amp;`) are not decoded.
 *
 * The scan only ever moves forward through the string, so it runs in linear
 * time and has no regular-expression backtracking.
 *
 * @param input - Markup to strip.
 * @returns The input with each tag replaced by one space; whitespace is not
 *   otherwise collapsed or trimmed.
 */
function stripHtmlTags(input: string): string {
  const pieces: string[] = [];
  let cursor = 0;

  while (cursor < input.length) {
    const open = input.indexOf("<", cursor);
    if (open === -1) break;

    const close = input.indexOf(">", open + 1);
    // An unterminated "<" is literal text, not a tag: keep the remainder as-is.
    if (close === -1) break;

    if (close === open + 1) {
      // "<>" has no tag content, so it is kept verbatim rather than stripped.
      pieces.push(input.slice(cursor, close + 1));
    } else {
      // Keep the text before the tag and stand a single space in for the tag.
      pieces.push(input.slice(cursor, open), " ");
    }
    cursor = close + 1;
  }

  // Whatever follows the last tag (or the whole input when there were none).
  pieces.push(input.slice(cursor));
  return pieces.join("");
}

/** Parses EPUB files using epub2. */
export class EpubParser implements DocumentParser {
Expand All @@ -15,7 +32,7 @@ export class EpubParser implements DocumentParser {
const mod = await import("epub2");
EPub = mod.EPub;
} catch (err) {
throw new ValidationError(
throw new ParseError(
'EPUB parsing requires the "epub2" package. Install it with: npm install epub2',
err,
);
Expand All @@ -36,11 +53,8 @@ export class EpubParser implements DocumentParser {
.getChapterAsync;
if (!getChapter) continue;
const html: string = await getChapter.call(epub, item.id);
// Strip HTML tags to get plain text
const text = html
.replaceAll(/<[^>]+>/g, " ")
.replaceAll(/\s+/g, " ")
.trim();
// Strip HTML tags and collapse whitespace
const text = stripHtmlTags(html).replaceAll(/\s+/g, " ").trim();
if (text.length > 0) {
chapters.push(text);
}
Expand All @@ -50,7 +64,7 @@ export class EpubParser implements DocumentParser {
}

if (chapters.length === 0) {
throw new ValidationError("EPUB file contains no readable chapters");
throw new ParseError("EPUB file contains no readable chapters");
}

return chapters.join("\n\n");
Expand Down
8 changes: 8 additions & 0 deletions packages/parsers/src/errors.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
/**
 * Standalone error class for @libscope/parsers. No cross-package dependencies.
 *
 * Thrown (or used as a rejection reason) when a document cannot be parsed.
 * The optional `cause` carries the underlying error or thrown value so callers
 * can inspect the original failure.
 */
export class ParseError extends Error {
  /**
   * @param message - Human-readable description of the parse failure.
   * @param cause - The underlying error or thrown value, if any.
   */
  constructor(message: string, cause?: unknown) {
    super(message);
    this.name = "ParseError";
    // Only attach `cause` when one was supplied; otherwise every ParseError
    // would carry an own `cause: undefined` property that shows up in
    // `"cause" in err` checks and in inspected/logged output.
    if (cause !== undefined) {
      this.cause = cause;
    }
  }
}
4 changes: 2 additions & 2 deletions src/core/parsers/html.ts → packages/parsers/src/html.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { NodeHtmlMarkdown } from "node-html-markdown";
import { ValidationError } from "../../errors.js";
import { ParseError } from "./errors.js";
import type { DocumentParser } from "./index.js";

const nhm = new NodeHtmlMarkdown({ ignore: ["script", "style", "nav"] });
Expand All @@ -17,7 +17,7 @@ export class HtmlParser implements DocumentParser {
return Promise.resolve(markdown.replaceAll(/\n{3,}/g, "\n\n").trimEnd());
} catch (err: unknown) {
const message = err instanceof Error ? err.message : "Unknown HTML parsing error";
throw new ValidationError(`Failed to parse HTML: ${message}`);
throw new ParseError(`Failed to parse HTML: ${message}`);
}
}
}
50 changes: 50 additions & 0 deletions packages/parsers/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import { extname } from "node:path";
import { MarkdownParser } from "./markdown.js";
import { PlainTextParser } from "./text.js";
import { JsonParser } from "./json-parser.js";
import { YamlParser } from "./yaml.js";
import { CsvParser } from "./csv.js";
import { PdfParser } from "./pdf.js";
import { WordParser } from "./word.js";
import { HtmlParser } from "./html.js";
import { EpubParser } from "./epub.js";
import { PptxParser } from "./pptx.js";

/**
 * Interface for document format parsers.
 *
 * Each implementation declares the file extensions it handles and converts a
 * raw file buffer into text. Parsers are looked up by extension through
 * `getParserForFile`.
 */
export interface DocumentParser {
/**
 * File extensions this parser handles (e.g. [".pdf", ".docx"]).
 * Extensions are lower-cased when the parser is registered, and lookups use
 * the lower-cased result of `path.extname`, so matching is case-insensitive.
 * NOTE(review): entries presumably need the leading dot to ever match, since
 * `extname` returns it — confirm against the implementations.
 */
readonly extensions: string[];
/**
 * Parse a file buffer into plain text or markdown suitable for indexing.
 *
 * @param content - Raw bytes of the file.
 * @returns A promise resolving to the extracted text or markdown.
 */
parse(content: Buffer): Promise<string>;
}

/** One instance of every built-in parser, listed in registration order. */
const parsers: DocumentParser[] = [
  new MarkdownParser(),
  new PlainTextParser(),
  new JsonParser(),
  new YamlParser(),
  new CsvParser(),
  new PdfParser(),
  new WordParser(),
  new HtmlParser(),
  new EpubParser(),
  new PptxParser(),
];

/**
 * Lookup table from lower-cased file extension to the parser that handles it.
 * Entries are inserted in registration order, so if two parsers declare the
 * same extension the one registered later is the one stored.
 */
const extensionMap = new Map<string, DocumentParser>(
  parsers.flatMap((parser) =>
    parser.extensions.map((ext): [string, DocumentParser] => [ext.toLowerCase(), parser]),
  ),
);

/**
 * Look up the parser registered for a file, based on the file's extension.
 * The extension is compared case-insensitively.
 *
 * @param filename - File name or path whose extension selects the parser.
 * @returns The matching parser, or `null` when the extension is unsupported.
 */
export function getParserForFile(filename: string): DocumentParser | null {
  const key = extname(filename).toLowerCase();
  const match = extensionMap.get(key);
  return match === undefined ? null : match;
}

/**
 * List every file extension that has a registered parser.
 *
 * @returns A new array of the registered (lower-cased) extensions, ordered
 *   with `String.prototype.localeCompare`.
 */
export function getSupportedExtensions(): string[] {
  const extensions = Array.from(extensionMap.keys());
  extensions.sort((left, right) => left.localeCompare(right));
  return extensions;
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import type { DocumentParser } from "./index.js";
import { ValidationError } from "../../errors.js";
import { ParseError } from "./errors.js";

/** Parses JSON files, outputting a fenced code block. */
export class JsonParser implements DocumentParser {
Expand All @@ -13,10 +13,7 @@ export class JsonParser implements DocumentParser {
return Promise.resolve("```json\n" + formatted + "\n```");
} catch (err) {
return Promise.reject(
new ValidationError(
`Invalid JSON: ${err instanceof Error ? err.message : String(err)}`,
err,
),
new ParseError(`Invalid JSON: ${err instanceof Error ? err.message : String(err)}`, err),
);
}
}
Expand Down
File renamed without changes.
6 changes: 3 additions & 3 deletions src/core/parsers/pdf.ts → packages/parsers/src/pdf.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import type { DocumentParser } from "./index.js";
import { ValidationError } from "../../errors.js";
import { ParseError } from "./errors.js";

/** Parses PDF files using pdf-parse. */
export class PdfParser implements DocumentParser {
Expand All @@ -11,7 +11,7 @@ export class PdfParser implements DocumentParser {
const mod = await import("pdf-parse");
PDFParse = mod.PDFParse;
} catch (err) {
throw new ValidationError(
throw new ParseError(
'PDF parsing requires the "pdf-parse" package. Install it with: npm install pdf-parse',
err,
);
Expand All @@ -22,7 +22,7 @@ export class PdfParser implements DocumentParser {
const result = await parser.getText();
return result.text;
} catch (err) {
throw new ValidationError(
throw new ParseError(
`Failed to parse PDF: ${err instanceof Error ? err.message : String(err)}`,
err,
);
Expand Down
Loading
Loading