Skip to content

Commit 1ca6b47

Browse files
jahooma and claude authored
evalbuff: Codebuff SDK integration, direct LLM API, and quality improvements (#486)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d6a3db3 commit 1ca6b47

File tree

14 files changed

+431
-328
lines changed

14 files changed

+431
-328
lines changed

agents/base2/base2-free-evals.ts

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,8 @@
1+
import { createBase2 } from './base2'
2+
3+
const definition = {
4+
...createBase2('free', { noAskUser: true }),
5+
id: 'base2-free-evals',
6+
displayName: 'Buffy the Free Evals Orchestrator',
7+
}
8+
export default definition

bun.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cli/src/hooks/helpers/__tests__/send-message.test.ts

Lines changed: 17 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ const { createBatchedMessageUpdater } = await import(
3535
'../../../utils/message-updater'
3636
)
3737
import { createPaymentRequiredError } from '@codebuff/sdk'
38+
import type { RunState } from '@codebuff/sdk'
3839

3940
const createMockTimerController = (): SendMessageTimerController & {
4041
startCalls: string[]
@@ -348,7 +349,7 @@ describe('handleRunCompletion', () => {
348349
let hasReceivedPlanResponse = false
349350

350351
const runState = {
351-
sessionState: null,
352+
sessionState: undefined,
352353
output: { type: 'lastMessage' as const, value: [] },
353354
}
354355

@@ -372,7 +373,7 @@ describe('handleRunCompletion', () => {
372373
expect(chainInProgress).toBe(false)
373374
expect(canProcessQueue).toBe(true)
374375
expect(isProcessingQueueRef.current).toBe(false)
375-
expect(streamStatus).toBe('idle')
376+
expect(streamStatus as StreamStatus).toBe('idle')
376377
})
377378

378379
test('does not process server response when wasAbortedByUser is true', () => {
@@ -388,7 +389,7 @@ describe('handleRunCompletion', () => {
388389
let hasReceivedPlanResponse = false
389390

390391
const runState = {
391-
sessionState: null,
392+
sessionState: undefined,
392393
output: {
393394
type: 'lastMessage' as const,
394395
value: [{ type: 'text' as const, text: 'Server response that should be ignored' }],
@@ -431,7 +432,7 @@ describe('handleRunCompletion', () => {
431432
let canProcessQueueCalled = false
432433

433434
const runState = {
434-
sessionState: null,
435+
sessionState: undefined,
435436
output: { type: 'lastMessage' as const, value: [] },
436437
}
437438

@@ -929,7 +930,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves
929930

930931
// Abort handler fires synchronously: UI is updated, but chain lock stays held
931932
expect(streamRefsA.state.wasAbortedByUser).toBe(true)
932-
expect(streamStatus).toBe('idle') // UI shows idle
933+
expect(streamStatus as StreamStatus).toBe('idle') // UI shows idle
933934
expect(chainInProgress).toBe(true) // But chain lock is still held!
934935

935936
// --- PHASE 3: User types run B — verify it's BLOCKED ---
@@ -952,8 +953,8 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves
952953
// Simulate what happens in useSendMessage after `await client.run(runConfig)`:
953954
// 1. previousRunStateRef.current = runState (state saved)
954955
// 2. handleRunCompletion is called
955-
const runStateFromA = {
956-
sessionState: { conversationId: 'conv-123', history: ['user msg A', 'partial assistant response'] },
956+
const runStateFromA: RunState = {
957+
sessionState: { conversationId: 'conv-123', history: ['user msg A', 'partial assistant response'] } as any,
957958
output: { type: 'lastMessage' as const, value: [{ type: 'text' as const, text: 'partial' }] },
958959
}
959960

@@ -991,11 +992,11 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves
991992
expect(chainInProgress).toBe(false)
992993
expect(canProcessQueue).toBe(true)
993994
expect(isProcessingQueueRef.current).toBe(false)
994-
expect(streamStatus).toBe('idle')
995+
expect(streamStatus as StreamStatus).toBe('idle')
995996

996997
// The crucial state continuity: previousRunState from A is available for B
997998
expect(previousRunState).toBe(runStateFromA)
998-
expect(previousRunState.sessionState).toEqual({
999+
expect(previousRunState.sessionState as any).toEqual({
9991000
conversationId: 'conv-123',
10001001
history: ['user msg A', 'partial assistant response'],
10011002
})
@@ -1049,7 +1050,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves
10491050
let chainInProgress = true
10501051
const isProcessingQueueRef = { current: false }
10511052
const isQueuePausedRef = { current: false }
1052-
let previousRunState: { sessionState: unknown; output: unknown } | null = null
1053+
let previousRunState: RunState | null = null
10531054

10541055
const setStreamStatus = (status: StreamStatus) => { streamStatus = status }
10551056
const setCanProcessQueue = (can: boolean) => { canProcessQueue = can }
@@ -1083,14 +1084,14 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves
10831084
expect(chainInProgress).toBe(true) // Lock held
10841085

10851086
// client.run() resolves for run A
1086-
const runStateA = {
1087+
const runStateA: RunState = {
10871088
sessionState: {
10881089
id: 'session-abc',
10891090
messages: [
10901091
{ role: 'user', content: 'first message' },
10911092
{ role: 'assistant', content: 'partial response before cancel' },
10921093
],
1093-
},
1094+
} as any,
10941095
output: { type: 'lastMessage' as const, value: [] },
10951096
}
10961097
previousRunState = runStateA
@@ -1146,7 +1147,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves
11461147
// In the real code, this is: previousRunState: previousRunStateRef.current
11471148
// passed to createRunConfig
11481149
expect(previousRunState).toBe(runStateA)
1149-
expect(previousRunState!.sessionState).toEqual({
1150+
expect(previousRunState!.sessionState as any).toEqual({
11501151
id: 'session-abc',
11511152
messages: [
11521153
{ role: 'user', content: 'first message' },
@@ -1155,7 +1156,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves
11551156
})
11561157

11571158
// Simulate run B completing normally
1158-
const runStateB = {
1159+
const runStateB: RunState = {
11591160
sessionState: {
11601161
id: 'session-abc',
11611162
messages: [
@@ -1164,7 +1165,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves
11641165
{ role: 'user', content: 'second message' },
11651166
{ role: 'assistant', content: 'full response to second message' },
11661167
],
1167-
},
1168+
} as any,
11681169
output: { type: 'lastMessage' as const, value: [{ type: 'text' as const, text: 'full response' }] },
11691170
}
11701171
previousRunState = runStateB
@@ -1186,7 +1187,7 @@ describe('CLI-level race condition: abort run A, attempt run B before A resolves
11861187
})
11871188

11881189
// Final state: both runs' messages are preserved in session history
1189-
expect(previousRunState!.sessionState).toEqual({
1190+
expect(previousRunState!.sessionState as any).toEqual({
11901191
id: 'session-abc',
11911192
messages: [
11921193
{ role: 'user', content: 'first message' },

docs/patterns/handle-steps-generators.md

Lines changed: 0 additions & 180 deletions
This file was deleted.

evalbuff/package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,10 @@
1414
"run": "bun run src/run-evalbuff.ts"
1515
},
1616
"dependencies": {
17+
"@ai-sdk/anthropic": "^2.0.50",
1718
"@codebuff/common": "workspace:*",
1819
"@codebuff/sdk": "workspace:*",
20+
"ai": "^5.0.0",
1921
"zod": "^4.2.1"
2022
}
2123
}

evalbuff/src/__tests__/e2e.test.ts

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,25 @@ mock.module('../test-repo-utils', () => ({
4040
},
4141
}))
4242

43-
mock.module('../cli-runner', () => ({
44-
runCliAgent: async () => ({
45-
diff: 'mock diff content',
46-
durationMs: 1000,
47-
exitCode: 0,
48-
stdout: 'mock stdout',
49-
stderr: '',
50-
}),
43+
mock.module('../runners/codebuff', () => ({
44+
CodebuffRunner: class {
45+
constructor() {}
46+
async run() {
47+
return {
48+
steps: [{ type: 'text', content: 'mock trace' }],
49+
totalCostUsd: 0.01,
50+
diff: 'mock diff content',
51+
}
52+
}
53+
},
54+
}))
55+
56+
mock.module('@codebuff/sdk', () => ({
57+
CodebuffClient: class {
58+
constructor() {}
59+
async run() { return { output: { type: 'success' }, sessionState: null } }
60+
},
61+
loadLocalAgents: async () => ({}),
5162
}))
5263

5364
// Judge returns alternating scores: low (triggers doc edit), then higher (confirms improvement)
@@ -126,7 +137,7 @@ describe('evalbuff E2E', () => {
126137
await runLearnMode({
127138
mode: 'learn',
128139
repoPath: repoDir,
129-
agentCommand: 'echo',
140+
agentId: 'base2-free-evals',
130141
parallelism: 1,
131142
maxCostUsd: 50,
132143
agentTimeoutMs: 10_000,

0 commit comments

Comments
 (0)