Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Release workflow: builds the Python distribution and publishes it to PyPI.
name: release

# Triggered only by pushing a tag whose name starts with "v".
on:
push:
tags: ["v*"]

permissions:
contents: read
id-token: write # required for PyPI Trusted Publishing (no API token needed)

jobs:
build-and-publish:
runs-on: ubuntu-latest
# The job is bound to the "pypi" GitHub environment.
environment: pypi
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.12"
# Installs the build frontend and the hatchling backend, then runs the build.
- name: Build distribution
run: |
python -m pip install --upgrade build hatchling
python -m build
# No credentials are passed here; authentication relies on the id-token permission above.
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
33 changes: 33 additions & 0 deletions .github/workflows/sin-verify.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Pull-request gate: installs the bundle, runs the test suite, then checks the audit chain.
name: sin-verify

# Runs for pull requests that target the main branch.
on:
pull_request:
branches: ["main"]

jobs:
verify:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
# 0 = fetch the complete history instead of a shallow clone.
fetch-depth: 0
- uses: actions/setup-python@v5
with:
python-version: "3.12"

# NOTE(review): this installs the published "sin-code-bundle" distribution from the
# package index, not the code checked out from the pull request. If this repository
# is the bundle itself, an install of the checkout (e.g. pip install -e ".[dev]")
# would be needed for the steps below to exercise the PR's changes; confirm intent.
- name: Install SIN-Code Bundle
run: pip install "sin-code-bundle[dev]"

- name: Run test suite
run: pytest -q

# Exits non-zero (failing the job) when AuditLog.verify_chain() returns a falsy value.
- name: Audit chain integrity
run: |
# Passes if no audit log exists yet (clean repo).
python -c "
from pathlib import Path
from sin_code_bundle.policy import AuditLog
ok = AuditLog(Path('.')).verify_chain()
print('Audit chain:', 'intact' if ok else 'TAMPERED')
raise SystemExit(0 if ok else 1)
"
9 changes: 9 additions & 0 deletions .opencode/plugin/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"name": "sin-opencode-plugin",
"version": "0.1.0",
"private": true,
"type": "module",
"dependencies": {
"@opencode-ai/plugin": "^0.4.0"
}
}
236 changes: 236 additions & 0 deletions .opencode/plugin/sin.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
/**
* SIN-Code Bundle — opencode plugin
*
* Turns the AGENTS.md doctrine into an *enforced* protocol:
* - after every file edit -> run semantic_diff + architectural_debt
* - before a session ends -> require a GREEN Oracle verification
* - on a tripped ADW breaker -> hard-stop the agent
*
* Docs: https://opencode.ai/docs/plugins
*
* The plugin talks to the SIN MCP tools that opencode already loaded via
* `opencode.json` (mcp.sin). It does not shell out to `sin` itself; instead it
* reads/writes a small session ledger under `.sin/session/` so the gate state
* survives across tool calls.
*/

import type { Plugin } from "@opencode-ai/plugin"
import { mkdir, readFile, writeFile } from "node:fs/promises"
import { join } from "node:path"

// --------------------------------------------------------------------------- //
// Config (overridable via env)
// --------------------------------------------------------------------------- //
/** Root directory for SIN state, relative to the process working directory. */
const SIN_DIR = ".sin"
/** Directory holding per-session gate state. */
const SESSION_DIR = join(SIN_DIR, "session")
/** JSON file in which the gate ledger is persisted between hook invocations. */
const LEDGER = join(SESSION_DIR, "gate.json")

/**
 * Normalise the SIN_RISK_BLOCK setting to a known risk level.
 *
 * An unset or unrecognised value falls back to "high". Without this, a typo
 * such as "critical" would later index the risk ranking with an unknown key,
 * yield `undefined`, and silently disable the risk warning.
 *
 * @param raw raw environment value, if any
 * @returns "low", "medium" or "high"
 */
function riskBlockFromEnv(raw: string | undefined): "low" | "medium" | "high" {
  const level = (raw ?? "high").trim().toLowerCase()
  return level === "low" || level === "medium" || level === "high" ? level : "high"
}

/**
 * Normalise the SIN_DEBT_BREAKER setting to a finite number.
 *
 * Unset, blank, or non-numeric values fall back to 85. Without this, a
 * non-numeric value becomes NaN (the breaker can never trip) and a blank value
 * becomes 0 (the breaker trips on every edit).
 *
 * @param raw raw environment value, if any
 * @returns the debt score at or above which the breaker trips
 */
function debtBreakerFromEnv(raw: string | undefined): number {
  const fallback = 85
  if (raw === undefined || raw.trim() === "") return fallback
  const value = Number(raw)
  return Number.isFinite(value) ? value : fallback
}

/** Risk level at or above which an edit triggers the risk warning (env: SIN_RISK_BLOCK). */
const RISK_BLOCK_LEVEL = riskBlockFromEnv(process.env.SIN_RISK_BLOCK)
/** Debt score (0-100) at or above which the ADW breaker trips (env: SIN_DEBT_BREAKER). */
const DEBT_BREAKER = debtBreakerFromEnv(process.env.SIN_DEBT_BREAKER)
/** Enforcement is on unless SIN_ENFORCE is exactly "0". */
const ENFORCE = (process.env.SIN_ENFORCE ?? "1") !== "0"

/** Severity of a change, listed from least to most severe (ranked numerically by RISK_ORDER). */
type RiskLevel = "low" | "medium" | "high"

/**
* Gate state for the current task. It is serialized as JSON to the LEDGER
* file by writeLedger and loaded again by readLedger, so it carries over
* between separate hook invocations.
*/
interface Ledger {
/** files edited but not yet verified green */
dirty: string[]
/** last Oracle verdict: "pass" | "fail" | "unknown" */
oracle: "pass" | "fail" | "unknown"
/** last architectural debt score 0-100 */
debt: number
/** highest risk seen since last green verification */
risk: RiskLevel
/** human-readable reasons accumulated for the current gate */
notes: string[]
/** ISO-8601 timestamp; writeLedger overwrites it on every write */
updatedAt: string
}

/**
* Default ledger: no dirty files, no Oracle verdict yet, zero debt, low risk,
* and the Unix epoch as the timestamp.
*
* NOTE: object spread (`{ ...EMPTY_LEDGER }`) copies only the top level, so the
* copy's `dirty` and `notes` are the SAME array objects as the ones below.
* Pushing into a spread copy therefore mutates this template as well.
*/
const EMPTY_LEDGER: Ledger = {
dirty: [],
oracle: "unknown",
debt: 0,
risk: "low",
notes: [],
updatedAt: new Date(0).toISOString(),
}

// --------------------------------------------------------------------------- //
// Ledger persistence
// --------------------------------------------------------------------------- //
/**
 * Load the persisted gate ledger, filling any missing field from the defaults.
 *
 * A missing, unreadable, or malformed ledger file yields the default ledger
 * instead of an error, so a fresh repository starts with a clean gate.
 *
 * The returned object always owns its `dirty` and `notes` arrays. A plain
 * `{ ...EMPTY_LEDGER }` would hand out the template's own arrays, and the
 * first `push` by a caller would then leak into every later default ledger.
 *
 * @returns a ledger that is safe for the caller to mutate
 */
async function readLedger(): Promise<Ledger> {
  let stored: Partial<Ledger> = {}
  try {
    const parsed: unknown = JSON.parse(await readFile(LEDGER, "utf8"))
    // Only a plain JSON object can be a ledger; ignore null, arrays and scalars.
    if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
      stored = parsed as Partial<Ledger>
    }
  } catch {
    // No ledger yet, or it could not be read/parsed: fall back to the defaults.
  }

  const merged: Ledger = { ...EMPTY_LEDGER, ...stored }
  return {
    ...merged,
    // Copy the arrays so callers never mutate EMPTY_LEDGER; a non-array value
    // from a hand-edited file is replaced by an empty list.
    dirty: Array.isArray(merged.dirty) ? [...merged.dirty] : [],
    notes: Array.isArray(merged.notes) ? [...merged.notes] : [],
  }
}

/**
 * Persist the ledger to the LEDGER file as pretty-printed JSON.
 *
 * Stamps `ledger.updatedAt` on the object passed in (the caller's object is
 * modified), creates the session directory when it does not exist, and then
 * overwrites the ledger file.
 *
 * @param ledger gate state to store; its `updatedAt` field is overwritten
 */
async function writeLedger(ledger: Ledger): Promise<void> {
  const stampedAt = new Date()
  ledger.updatedAt = stampedAt.toISOString()

  await mkdir(SESSION_DIR, { recursive: true })

  const payload = JSON.stringify(ledger, null, 2)
  await writeFile(LEDGER, payload, "utf8")
}

/** Numeric rank for each risk level, so that levels can be compared. */
const RISK_ORDER: Record<RiskLevel, number> = { low: 0, medium: 1, high: 2 }

/**
 * Pick the more severe of two risk levels.
 *
 * @param a first level; returned when both rank equally
 * @param b second level
 * @returns whichever level has the higher rank in RISK_ORDER
 */
function maxRisk(a: RiskLevel, b: RiskLevel): RiskLevel {
  const rankOfA = RISK_ORDER[a]
  const rankOfB = RISK_ORDER[b]
  if (rankOfA >= rankOfB) {
    return a
  }
  return b
}

// --------------------------------------------------------------------------- //
// Helpers to call the SIN MCP tools through the opencode client
// --------------------------------------------------------------------------- //
/**
 * Invoke one tool on the "sin" MCP server through the opencode client.
 *
 * Any failure (a rejected call or a synchronous throw while issuing it) is
 * converted into `{ ok: false, error }` so that an unavailable subsystem
 * degrades gracefully and never crashes the agent.
 *
 * @param client opencode client exposing `tool.call`
 * @param tool name of the SIN tool to run
 * @param args arguments forwarded to the tool
 * @returns the tool's result, or `{ ok: false, error: string }` on failure
 */
async function callSin(
  client: any,
  tool: string,
  args: Record<string, unknown>,
): Promise<any> {
  const request = { server: "sin", tool, arguments: args }
  try {
    const response = await client.tool.call(request)
    return response
  } catch (failure) {
    return { ok: false, error: String(failure) }
  }
}

/**
 * Map a tool result onto the three-level risk scale.
 *
 * Reads `risk`, then `risk_level`, case-insensitively. "critical" is folded
 * into "high" and "moderate" into "medium"; anything else, including a
 * missing field, counts as "low".
 *
 * @param result raw tool result (may be null or undefined)
 * @returns the normalised risk level
 */
function parseRisk(result: any): RiskLevel {
  const label = String(result?.risk ?? result?.risk_level ?? "low").toLowerCase()
  switch (label) {
    case "high":
    case "critical":
      return "high"
    case "medium":
    case "moderate":
      return "medium"
    default:
      return "low"
  }
}

/**
 * Extract a numeric debt score from a tool result.
 *
 * Uses the first of `score`, `debt`, `complexity` that is neither null nor
 * undefined, converts it with `Number`, and returns 0 when nothing is present
 * or the value is not a finite number.
 *
 * @param result raw tool result (may be null or undefined)
 * @returns the debt score, or 0 when none can be read
 */
function parseDebt(result: any): number {
  const raw = result?.score ?? result?.debt ?? result?.complexity ?? 0
  const value = Number(raw)
  if (!Number.isFinite(value)) {
    return 0
  }
  return value
}

/**
 * Reduce a verification tool result to an Oracle verdict.
 *
 * An explicit verdict (`verdict`, then `status`; case-insensitive) always
 * wins: "pass"/"passed"/"green" mean pass, "fail"/"failed"/"red" mean fail.
 * The boolean `ok` flag is consulted only when no recognised verdict is
 * present. This ordering matters: a result such as
 * `{ ok: true, verdict: "fail" }` (the call succeeded, the checks did not)
 * must not be read as a pass, or the gate would fail open.
 *
 * @param result raw tool result (may be null or undefined)
 * @returns "pass", "fail", or "unknown" when nothing conclusive is found
 */
function parseOracle(result: any): "pass" | "fail" | "unknown" {
  const verdict = String(result?.verdict ?? result?.status ?? "")
    .trim()
    .toLowerCase()

  // 1) An explicit verdict string takes precedence over the generic ok flag.
  if (verdict === "pass" || verdict === "passed" || verdict === "green") return "pass"
  if (verdict === "fail" || verdict === "failed" || verdict === "red") return "fail"

  // 2) No recognised verdict: fall back to the boolean ok flag.
  if (result?.ok === true) return "pass"
  if (result?.ok === false) return "fail"

  return "unknown"
}

// --------------------------------------------------------------------------- //
// Plugin
// --------------------------------------------------------------------------- //
/**
 * opencode plugin that enforces the SIN gate through four hooks:
 *
 * - `file.edited`         assess the edit, record it as dirty, trip the debt breaker
 * - `tool.execute.before` block "finish"-like tools while dirty files lack a pass
 * - `tool.execute.after`  record the verdict of verification tools
 * - `session.idle`        remind about unverified edits
 *
 * State is kept in the ledger file, so each hook reads it, updates it and
 * writes it back.
 */
export const SinPlugin: Plugin = async ({ client }) => {
  // Resolve the warning threshold once. RISK_BLOCK_LEVEL comes from the
  // environment; if it is not a known level the lookup yields undefined and
  // every `>=` comparison against it would be false, silently disabling the
  // warning. Fall back to "high" in that case.
  const configuredRank = RISK_ORDER[RISK_BLOCK_LEVEL as RiskLevel]
  const riskBlockRank =
    typeof configuredRank === "number" ? configuredRank : RISK_ORDER.high

  return {
    /**
     * After any file edit: assess the change semantically and update debt.
     * This is the "review" + "guard debt" steps of the SIN loop, automated.
     *
     * Throws when enforcement is on and the debt score reaches DEBT_BREAKER.
     */
    "file.edited": async ({ file }) => {
      if (!file) return
      const ledger = await readLedger()

      // 1) semantic diff against git HEAD for this file
      const diff = await callSin(client, "semantic_diff", {
        file_a: `git:HEAD:${file}`,
        file_b: file,
      })
      const risk = parseRisk(diff)
      ledger.risk = maxRisk(ledger.risk, risk)

      // 2) architectural debt snapshot
      const debt = await callSin(client, "architectural_debt", {})
      ledger.debt = parseDebt(debt)

      // any edit invalidates the previous green verification
      ledger.oracle = "unknown"
      if (!ledger.dirty.includes(file)) ledger.dirty.push(file)

      const note = `edited ${file} (risk=${risk}, debt=${ledger.debt})`
      ledger.notes.push(note)
      // Persist before the breaker check so the edit is recorded even if we throw.
      await writeLedger(ledger)

      // 3) ADW breaker: hard stop
      if (ENFORCE && ledger.debt >= DEBT_BREAKER) {
        throw new Error(
          `[SIN] ADW breaker tripped: debt ${ledger.debt} >= ${DEBT_BREAKER}. ` +
            `Stop adding code and refactor. Re-run architectural_debt after refactor.`,
        )
      }

      // 4) risk gate: warn loudly (does not stop the edit, stops "done")
      if (RISK_ORDER[risk] >= riskBlockRank) {
        await client.session.log?.({
          level: "warn",
          message:
            `[SIN] High-risk change in ${file}. Justify it and run ` +
            `verify_tests before reporting done.`,
        })
      }
    },

    /**
     * Before a tool runs: if the agent tries to "finish" while the gate is not
     * green, intercept and force a verification first.
     *
     * A tool counts as a finish signal when its lower-cased name contains
     * "done", "finish" or "complete". Does nothing when enforcement is off.
     */
    "tool.execute.before": async ({ tool }) => {
      if (!ENFORCE) return
      const name = (tool ?? "").toLowerCase()
      const isFinishSignal =
        name.includes("done") ||
        name.includes("finish") ||
        name.includes("complete")
      if (!isFinishSignal) return

      const ledger = await readLedger()
      if (ledger.dirty.length === 0) return

      if (ledger.oracle !== "pass") {
        throw new Error(
          `[SIN] Cannot report done: Oracle verification is "${ledger.oracle}". ` +
            `Files awaiting green verification: ${ledger.dirty.join(", ")}. ` +
            `Run the SIN "verify_tests" tool until it returns pass.`,
        )
      }
      // gate is green -> reset ledger for next task. Use fresh arrays: spreading
      // EMPTY_LEDGER alone would reuse the template's array objects, so the
      // reset would carry over anything previously pushed into them.
      await writeLedger({ ...EMPTY_LEDGER, dirty: [], notes: [] })
    },

    /**
     * After a verification tool runs: record the Oracle verdict so the finish
     * gate can open. We watch for verify_tests / prove / verify_change results.
     *
     * A tool counts as a verification when its lower-cased name contains
     * "verify", "prove" or "oracle". A pass clears the dirty list and resets
     * the risk level to "low".
     */
    "tool.execute.after": async ({ tool }, output) => {
      const name = (tool ?? "").toLowerCase()
      const isVerify =
        name.includes("verify") || name.includes("prove") || name.includes("oracle")
      if (!isVerify) return

      const ledger = await readLedger()
      const verdict = parseOracle(output?.result ?? output)
      ledger.oracle = verdict
      if (verdict === "pass") {
        ledger.dirty = []
        ledger.risk = "low"
        ledger.notes.push("oracle: PASS")
      } else if (verdict === "fail") {
        ledger.notes.push("oracle: FAIL")
      }
      await writeLedger(ledger)
    },

    /**
     * Session idle: gentle reminder if there is unverified work on the table.
     */
    "session.idle": async () => {
      const ledger = await readLedger()
      if (ledger.dirty.length > 0 && ledger.oracle !== "pass") {
        await client.session.log?.({
          level: "info",
          message:
            `[SIN] ${ledger.dirty.length} file(s) edited without a green ` +
            `verification. Run verify_tests before finishing.`,
        })
      }
    },
  }
}

export default SinPlugin
43 changes: 43 additions & 0 deletions BENCHMARKS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# SIN-Code Benchmarks

We measure one thing: **does exposing the SIN tools improve an agent's
resolved-rate?** The harness (`sin bench`) runs the same task set twice — once
with SIN tools disabled (`control`) and once enabled (`sin`) — and reports the
delta in percentage points.

## Reproduce

```bash
pip install "sin-code-bundle[bench]"

# Smoke test (no LLM cost — validates the clone/apply/test pipeline)
sin bench --runner dry --limit 5

# Full A/B on SWE-bench Lite with opencode
sin bench --runner opencode --limit 100 --out report.json
```

## Methodology

- **Dataset:** SWE-bench Lite (`princeton-nlp/SWE-bench_Lite`, test split).
- **Arms:** `control` (SIN_ENFORCE=0) vs `sin` (SIN_ENFORCE=1, MCP tools loaded).
- **Resolved:** patch applies cleanly AND all FAIL_TO_PASS tests pass.
- **Isolation:** each task runs in a fresh git clone at `base_commit`.

## Results

| Arm | Resolved | Rate | Mean time |
|-----|----------|------|-----------|
| control | *TBD* | *TBD* | *TBD* |
| sin | *TBD* | *TBD* | *TBD* |
| **delta** | | ***TBD* pp** | |

> Fill this table from `report.json` after a full run and commit the
> `report.json` alongside the version tag so results are auditable.

## Interpretation

A positive delta means the agent resolved more tasks with the SIN tools (impact
analysis, semantic diff, Oracle verification) enabled than without them. Treat
it as evidence, not proof, that the tools helped: each arm is a single run over
the same task set. The harness is runner-agnostic — the same JSON report can
compare opencode, codex, and hermes on identical tasks.
Loading
Loading