From 0995c74c4c192e002c37ad17ca8648da129e929d Mon Sep 17 00:00:00 2001
From: "aaron.wwj"
Date: Tue, 26 May 2026 10:53:22 +0800
Subject: [PATCH] feat: add Objective-C language support for code extraction

Add ObjC extractor with tree-sitter-objc grammar support:
- Register .m and .mm file extensions
- Handle message_expression, class_interface, class_implementation,
  protocol_declaration, and method nodes
- Add extractBareCall for message_expression capture during
  visitFunctionBody traversal (critical fix for call edges)
- Build method signatures from parameter selectors

This enables proper callers/callees relationship extraction for
Objective-C codebases.
---
 src/extraction/grammars.ts        |   4 +
 src/extraction/languages/index.ts |   2 +
 src/extraction/languages/objc.ts  | 262 ++++++++++++++++++++++++++++++
 src/types.ts                      |   1 +
 4 files changed, 269 insertions(+)
 create mode 100644 src/extraction/languages/objc.ts

diff --git a/src/extraction/grammars.ts b/src/extraction/grammars.ts
index c167d28b3..bb7304a8b 100644
--- a/src/extraction/grammars.ts
+++ b/src/extraction/grammars.ts
@@ -33,6 +33,7 @@ const WASM_GRAMMAR_FILES: Record = {
   swift: 'tree-sitter-swift.wasm',
   kotlin: 'tree-sitter-kotlin.wasm',
   dart: 'tree-sitter-dart.wasm',
+  objc: 'tree-sitter-objc.wasm',
   pascal: 'tree-sitter-pascal.wasm',
   scala: 'tree-sitter-scala.wasm',
   lua: 'tree-sitter-lua.wasm',
@@ -56,6 +57,8 @@ export const EXTENSION_MAP: Record = {
   '.java': 'java',
   '.c': 'c',
   '.h': 'c', // Could also be C++, defaulting to C
+  '.m': 'objc',
+  '.mm': 'objc',
   '.cpp': 'cpp',
   '.cc': 'cpp',
   '.cxx': 'cpp',
@@ -330,6 +333,7 @@ export function getLanguageDisplayName(language: Language): string {
     c: 'C',
     cpp: 'C++',
     csharp: 'C#',
+    objc: 'Objective-C',
     php: 'PHP',
     ruby: 'Ruby',
     swift: 'Swift',
diff --git a/src/extraction/languages/index.ts b/src/extraction/languages/index.ts
index a289f0289..543598b8e 100644
--- a/src/extraction/languages/index.ts
+++ b/src/extraction/languages/index.ts
@@ -25,6 +25,7 @@ import { pascalExtractor } from './pascal';
 import { scalaExtractor } from './scala';
 import { luaExtractor } from './lua';
 import { luauExtractor } from './luau';
+import { objcExtractor } from './objc';
 
 export const EXTRACTORS: Partial> = {
   typescript: typescriptExtractor,
@@ -47,4 +48,5 @@ export const EXTRACTORS: Partial> = {
   scala: scalaExtractor,
   lua: luaExtractor,
   luau: luauExtractor,
+  objc: objcExtractor,
 };
diff --git a/src/extraction/languages/objc.ts b/src/extraction/languages/objc.ts
new file mode 100644
index 000000000..e561a0779
--- /dev/null
+++ b/src/extraction/languages/objc.ts
@@ -0,0 +1,262 @@
+/**
+ * Objective-C language extractor for CodeGraph.
+ *
+ * Extracts classes (@interface / @implementation), protocols (@protocol),
+ * methods (±method), properties (@property), imports (#import / #include),
+ * call expressions (message sends and C calls), and top-level variable
+ * declarations.
+ *
+ * Key design choices driven by tree-sitter-objc's AST:
+ * - `message_expression` (e.g. `[receiver selector:arg]`) is NOT a `call_expression` —
+ *   it must be handled via `visitNode` (class-level) and `extractBareCall` (method-body).
+ * - `class_interface` and `class_implementation` both produce 'class' nodes;
+ *   `class_implementation` pairs with its interface via name matching.
+ * - Method names use ONLY the first keyword (no colons), matching how tree-sitter-objc
+ *   names method symbols — this alignment is critical for `callers`/`callees` resolution.
+ */
+import type { Node as SyntaxNode } from 'web-tree-sitter';
+import type { LanguageExtractor, ExtractorContext } from '../tree-sitter-types';
+import { getNodeText } from '../tree-sitter-helpers';
+
+// ─── Helpers ────────────────────────────────────────────────────────────────
+
+/**
+ * Extract the method selector name from a selector_expression or method field node.
+ *
+ * tree-sitter-objc names method symbols using only the first keyword (no colon).
+ * For resolution to work, call edges must use the same naming convention.
+ *
+ * The text comes from the node's first named child when it has one, otherwise
+ * from the node itself; it is then cut at the first `:` and trimmed, so the
+ * result never carries a colon or argument text.
+ *
+ * Examples:
+ *   No-arg:     `compressionQueue` → "compressionQueue"
+ *   Single-arg: `shouldUseReferenceCompressionFlowForScene:scene` → "shouldUseReferenceCompressionFlowForScene"
+ *   Multi-arg:  `buildMetaInfoWithImageData:asset:scene:completion:` → "buildMetaInfoWithImageData"
+ *
+ * @param selectorNode - Selector (or `method` field) node to read.
+ * @param source - Source text the node was parsed from.
+ * @returns The first selector keyword (possibly empty if the node text is empty).
+ */
+function getSelectorName(selectorNode: SyntaxNode, source: string): string {
+  const keywordNode = selectorNode.namedChild(0) || selectorNode;
+  const text = getNodeText(keywordNode, source);
+  const colonAt = text.indexOf(':');
+  return (colonAt === -1 ? text : text.slice(0, colonAt)).trim();
+}
+
+/**
+ * Resolve the selector name of a `message_expression` from its `method` field.
+ * Shared by `handleMessageExpression` and `extractBareCall` so both call paths
+ * name their targets identically.
+ *
+ * @returns The selector name, or `undefined` when the node has no `method`
+ *          field or the selector text is empty.
+ */
+function getMessageSelector(node: SyntaxNode, source: string): string | undefined {
+  const methodField = node.childForFieldName('method');
+  if (!methodField) return undefined;
+  return getSelectorName(methodField, source) || undefined;
+}
+
+/**
+ * Handle an ObjC property declaration (@property).
+ *
+ * Reads the `name` field and the first `property_type` named child, then creates
+ * a 'property' node whose signature is `@property TYPE NAME` ('unknown' is
+ * used as the type when no `property_type` child is present).
+ *
+ * @returns `true` when a property node was created; `false` when the declaration
+ *          has no `name` field or `ctx.nodeStack` has no enclosing node.
+ */
+function handlePropertyDeclaration(node: SyntaxNode, ctx: ExtractorContext): boolean {
+  const nameNode = node.childForFieldName('name');
+  if (!nameNode) return false;
+
+  // Only record the property when an enclosing node is on the stack.
+  const parentId = ctx.nodeStack[ctx.nodeStack.length - 1];
+  if (!parentId) return false;
+
+  const name = getNodeText(nameNode, ctx.source);
+
+  let propType = 'unknown';
+  for (let i = 0; i < node.namedChildCount; i++) {
+    const child = node.namedChild(i);
+    if (child && child.type === 'property_type') {
+      propType = getNodeText(child, ctx.source);
+      break;
+    }
+  }
+
+  ctx.createNode('property', name, node, {
+    signature: `@property ${propType} ${name}`,
+  });
+  return true; // handled
+}
+
+/**
+ * Handle a top-level (class-body) ObjC message expression: `[receiver methodName]`.
+ * Records an unresolved call reference from the current scope.
+ *
+ * NOTE: This hook only fires for class-level message expressions (e.g. in ivar
+ * initializers or static blocks). Method-body message expressions are handled
+ * by `extractBareCall` because `visitFunctionBody` uses a separate inner walker
+ * that bypasses the main `visitNode` hook. Without `extractBareCall`, ObjC method
+ * calls inside method bodies would never produce call edges.
+ *
+ * @returns Always `false`, so the walker still descends into children.
+ */
+function handleMessageExpression(node: SyntaxNode, ctx: ExtractorContext): boolean {
+  const callerId = ctx.nodeStack[ctx.nodeStack.length - 1];
+  if (!callerId) return false;
+
+  const methodName = getMessageSelector(node, ctx.source);
+  if (!methodName) return false;
+
+  ctx.addUnresolvedReference({
+    fromNodeId: callerId,
+    referenceName: methodName,
+    referenceKind: 'calls',
+    line: node.startPosition.row + 1,
+    column: node.startPosition.column,
+  });
+
+  return false; // let walker descend into children (for nested message expressions)
+}
+
+// ─── Extractor ──────────────────────────────────────────────────────────────
+
+export const objcExtractor: LanguageExtractor = {
+  functionTypes: [],
+  classTypes: ['class_interface', 'class_implementation'],
+  methodTypes: ['method_definition', 'method_declaration'],
+  interfaceTypes: ['protocol_declaration'], // @protocol … @end (TODO confirm node type against grammar)
+  structTypes: [],
+  enumTypes: [],
+  enumMemberTypes: [],
+  typeAliasTypes: [],
+  importTypes: ['preproc_include'],
+  callTypes: ['call_expression'], // NOTE: message_expression handled by visitNode + extractBareCall
+  variableTypes: ['declaration'],
+  propertyTypes: ['property_declaration'],
+  fieldTypes: [],
+
+  nameField: 'name',
+  bodyField: 'body',
+  paramsField: 'parameters',
+
+  methodsAreTopLevel: false,
+
+  /**
+   * For `method_definition`, the body is a `compound_statement` named child,
+   * not a named field. For `method_declaration` (@interface), no body exists.
+   *
+   * @returns The first `compound_statement` named child of a `method_definition`,
+   *          or `null` for any other node type or when no such child exists.
+   */
+  resolveBody(node: SyntaxNode): SyntaxNode | null {
+    if (node.type !== 'method_definition') return null;
+    for (let i = 0; i < node.namedChildCount; i++) {
+      const child = node.namedChild(i);
+      if (child && child.type === 'compound_statement') return child;
+    }
+    return null;
+  },
+
+  /**
+   * CRITICAL FIX: Called by visitFunctionBody's inner walker for every
+   * non-call_expression node inside method/function bodies. We handle
+   * `message_expression` here because visitFunctionBody bypasses the
+   * main visitNode dispatch hook.
+   *
+   * Without this, ObjC method calls inside method bodies never create `calls`
+   * edges, and `callers`/`callees` return empty results.
+   *
+   * @returns The called selector's first keyword for a `message_expression`;
+   *          `undefined` for any other node or when no selector can be read.
+   */
+  extractBareCall(node: SyntaxNode, source: string): string | undefined {
+    if (node.type !== 'message_expression') return undefined;
+    return getMessageSelector(node, source);
+  },
+
+  /**
+   * ObjC class methods start with `+`, instance methods with `-`.
+   *
+   * @returns `false` for non-method nodes and for methods whose first child is
+   *          `-`; `true` otherwise (a `+` marker, or anything unexpected).
+   */
+  isStatic(node: SyntaxNode): boolean {
+    if (node.type !== 'method_definition' && node.type !== 'method_declaration') {
+      return false;
+    }
+    const punctNode = node.child(0);
+    return !(punctNode && punctNode.type === '-'); // '+' or unexpected → static
+  },
+
+  /**
+   * Custom visitor for top-level nodes (class-body walking).
+   * - property_declaration → creates property node under current class
+   * - message_expression → creates unresolved call ref (class-level only;
+   *   method-body message expressions use extractBareCall instead)
+   *
+   * @returns `true` when the node was fully handled here; `false` to fall
+   *          through to the default walker.
+   */
+  visitNode(node: SyntaxNode, ctx: ExtractorContext): boolean {
+    if (node.type === 'property_declaration') {
+      return handlePropertyDeclaration(node, ctx);
+    }
+    if (node.type === 'message_expression') {
+      return handleMessageExpression(node, ctx);
+    }
+    return false; // not handled, fall through to default walker
+  },
+
+  /**
+   * Build a readable method signature for display.
+   *
+   * Shape: `(returnType)selector` followed by one `:(type)name` group per
+   * parameter, e.g. `(void)setX:(int)x :(int)y`. Only the first selector
+   * keyword is known here, so later parameters are shown with a bare `:`.
+   * A missing return type is rendered as `id`, the Objective-C default;
+   * a missing parameter type or name is rendered as `?`.
+   *
+   * NOTE(review): assumes the grammar exposes `declaration`, `selector`,
+   * `return_type`, `type` and `name` fields on these nodes — confirm
+   * against tree-sitter-objc's node-types.json.
+   *
+   * @returns The signature, or `undefined` for non-method nodes and for
+   *          methods without a `declaration` field.
+   */
+  getSignature(node: SyntaxNode, source: string): string | undefined {
+    if (node.type !== 'method_definition' && node.type !== 'method_declaration') {
+      return undefined;
+    }
+
+    const declNode = node.childForFieldName('declaration');
+    if (!declNode) return undefined;
+
+    // One `:(type)name` group per method_parameter; '?' marks a missing field.
+    const params: string[] = [];
+    for (let i = 0; i < declNode.namedChildCount; i++) {
+      const child = declNode.namedChild(i);
+      if (child && child.type === 'method_parameter') {
+        const paramType = child.childForFieldName('type');
+        const paramName = child.childForFieldName('name');
+        const typeStr = paramType ? getNodeText(paramType, source) : '?';
+        const nameStr = paramName ? getNodeText(paramName, source) : '?';
+        params.push(`:(${typeStr})${nameStr}`);
+      }
+    }
+
+    const returnTypeNode = declNode.childForFieldName('return_type');
+    const returnType = returnTypeNode ? getNodeText(returnTypeNode, source) : 'id';
+
+    const selectorNode = declNode.childForFieldName('selector');
+    const selector = selectorNode ? getSelectorName(selectorNode, source) : '';
+
+    return `(${returnType})${selector}${params.join(' ')}`;
+  },
+};
diff --git a/src/types.ts b/src/types.ts
index 0168665d2..7676bd8ca 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -75,6 +75,7 @@ export const LANGUAGES = [
   'c',
   'cpp',
   'csharp',
+  'objc',
   'php',
   'ruby',
   'swift',