Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,28 @@ linters:
- name: error-naming
- name: if-return

exclusions:
# Path-scoped exclusions consolidate repeated per-site nolint
# waivers into one place, so reviewers can grep the config instead
# of chasing ~22 scattered comments. Issue #499 lane A: gosec G304
# (potential file inclusion via variable) is the canonical noise in
# test files — every detector test reads "testdata/<x>.schema.json"
# through filepath.Join, which is by definition a variable path.
# The path is checked into the repo, the working dir is set by
# `go test`, and the file is not operator-controlled at runtime.
# Waiving G304 on `_test.go` files collapses ~22 inline waivers
# across module/pkg/patterns/** and the integration / SDK test
# trees with no loss of coverage on production code: gosec still
# runs G304 on non-test files (see module/pkg/replay/runner.go and
# module/receiver/ncclfrreceiver/nccl_fr.go, which keep their
# inline rationale comments because they read paths derived from
# operator config).
rules:
- linters:
- gosec
path: _test\.go
text: "G304"

issues:
max-issues-per-linter: 0
max-same-issues: 0
Expand Down
4 changes: 2 additions & 2 deletions module/pkg/nccl/fr_parser/fixtures_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,15 @@ func TestFixtures_MatchGoldens(t *testing.T) {
if err != nil {
t.Fatalf("Bytes: %v", err)
}
gotBytes, err := os.ReadFile(pklPath) //nolint:gosec // testdata path is package-controlled
gotBytes, err := os.ReadFile(pklPath)
if err != nil {
t.Fatalf("read %s: %v (run `make generate-fixtures`)", pklPath, err)
}
if !reflect.DeepEqual(wantBytes, gotBytes) {
t.Fatalf("%s out of date: run `make generate-fixtures`", pklPath)
}

gotJSON, err := os.ReadFile(jsonPath) //nolint:gosec // testdata path is package-controlled
gotJSON, err := os.ReadFile(jsonPath)
if err != nil {
t.Fatalf("read %s: %v (run `make generate-fixtures`)", jsonPath, err)
}
Expand Down
25 changes: 3 additions & 22 deletions module/pkg/patterns/checkpointer_hang_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,14 @@ package patterns_test

import (
"encoding/json"
"os"
"path/filepath"
"testing"
"time"

"github.com/santhosh-tekuri/jsonschema/v6"
"github.com/stretchr/testify/require"

"github.com/tracecoreai/tracecore/module/pkg/patterns"
"github.com/tracecoreai/tracecore/module/pkg/testutil/jsonschematest"
)

// checkpointer_hang detector test suite (NORTHSTAR pattern #11).
Expand Down Expand Up @@ -582,16 +581,7 @@ func TestCheckpointerHangDetector_AsymmetricWindowTightensBackwardLeg(t *testing
func TestCheckpointerHangVerdict_SchemaConformance(t *testing.T) {
t.Parallel()

schemaPath := filepath.Join("testdata", "checkpointer_hang_verdict.schema.json")
schemaBytes, err := os.ReadFile(schemaPath) //nolint:gosec // schemaPath is a test-local relative path
require.NoError(t, err)

compiler := jsonschema.NewCompiler()
var schemaDoc any
require.NoError(t, json.Unmarshal(schemaBytes, &schemaDoc))
require.NoError(t, compiler.AddResource(schemaPath, schemaDoc))
schema, err := compiler.Compile(schemaPath)
require.NoError(t, err)
schema := jsonschematest.Compile(t, filepath.Join("testdata", "checkpointer_hang_verdict.schema.json"))

t0 := time.Unix(1_700_000_000, 0).UTC()
stalls := []patterns.TrainingStepStallRecord{
Expand Down Expand Up @@ -636,16 +626,7 @@ func TestCheckpointerHangVerdict_SchemaConformance(t *testing.T) {
func TestCheckpointerHangVerdict_SchemaRejectsDrift(t *testing.T) {
t.Parallel()

schemaPath := filepath.Join("testdata", "checkpointer_hang_verdict.schema.json")
schemaBytes, err := os.ReadFile(schemaPath) //nolint:gosec // schemaPath is a test-local relative path
require.NoError(t, err)

compiler := jsonschema.NewCompiler()
var schemaDoc any
require.NoError(t, json.Unmarshal(schemaBytes, &schemaDoc))
require.NoError(t, compiler.AddResource(schemaPath, schemaDoc))
schema, err := compiler.Compile(schemaPath)
require.NoError(t, err)
schema := jsonschematest.Compile(t, filepath.Join("testdata", "checkpointer_hang_verdict.schema.json"))

validEvidence := []any{
map[string]any{"kind": "checkpoint_phase", "uid": "u1", "timestamp": "2026-05-18T10:00:00Z", "description": "d"},
Expand Down
25 changes: 3 additions & 22 deletions module/pkg/patterns/cuda_oom_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,14 @@ package patterns_test

import (
"encoding/json"
"os"
"path/filepath"
"testing"
"time"

"github.com/santhosh-tekuri/jsonschema/v6"
"github.com/stretchr/testify/require"

"github.com/tracecoreai/tracecore/module/pkg/patterns"
"github.com/tracecoreai/tracecore/module/pkg/testutil/jsonschematest"
)

// cuda_oom detector test suite (NORTHSTAR pattern #10). The detector
Expand Down Expand Up @@ -466,16 +465,7 @@ func TestCUDAOOMDetector_ThresholdConfigurable(t *testing.T) {
func TestCUDAOOMVerdict_SchemaConformance(t *testing.T) {
t.Parallel()

schemaPath := filepath.Join("testdata", "cuda_oom_verdict.schema.json")
schemaBytes, err := os.ReadFile(schemaPath) //nolint:gosec // schemaPath is a test-local relative path
require.NoError(t, err)

compiler := jsonschema.NewCompiler()
var schemaDoc any
require.NoError(t, json.Unmarshal(schemaBytes, &schemaDoc))
require.NoError(t, compiler.AddResource(schemaPath, schemaDoc))
schema, err := compiler.Compile(schemaPath)
require.NoError(t, err)
schema := jsonschematest.Compile(t, filepath.Join("testdata", "cuda_oom_verdict.schema.json"))

fbAt := time.Unix(1_700_000_000, 0).UTC()
oomAt := fbAt.Add(30 * time.Second)
Expand Down Expand Up @@ -506,16 +496,7 @@ func TestCUDAOOMVerdict_SchemaConformance(t *testing.T) {
func TestCUDAOOMVerdict_SchemaRejectsDrift(t *testing.T) {
t.Parallel()

schemaPath := filepath.Join("testdata", "cuda_oom_verdict.schema.json")
schemaBytes, err := os.ReadFile(schemaPath) //nolint:gosec // schemaPath is a test-local relative path
require.NoError(t, err)

compiler := jsonschema.NewCompiler()
var schemaDoc any
require.NoError(t, json.Unmarshal(schemaBytes, &schemaDoc))
require.NoError(t, compiler.AddResource(schemaPath, schemaDoc))
schema, err := compiler.Compile(schemaPath)
require.NoError(t, err)
schema := jsonschematest.Compile(t, filepath.Join("testdata", "cuda_oom_verdict.schema.json"))

validEvidence := []any{
map[string]any{"kind": "hw_fb", "uid": "u1", "timestamp": "2026-05-18T10:00:00Z", "description": "d"},
Expand Down
25 changes: 3 additions & 22 deletions module/pkg/patterns/dataloader_hang_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,14 @@ package patterns_test

import (
"encoding/json"
"os"
"path/filepath"
"testing"
"time"

"github.com/santhosh-tekuri/jsonschema/v6"
"github.com/stretchr/testify/require"

"github.com/tracecoreai/tracecore/module/pkg/patterns"
"github.com/tracecoreai/tracecore/module/pkg/testutil/jsonschematest"
)

// TestDataLoaderHangDetector_WorkerKilledDiscriminatorFires pins the
Expand Down Expand Up @@ -434,16 +433,7 @@ func TestDataLoaderHangDetector_ThresholdConfigurable(t *testing.T) {
func TestDataLoaderHangVerdict_SchemaConformance(t *testing.T) {
t.Parallel()

schemaPath := filepath.Join("testdata", "dataloader_hang_verdict.schema.json")
schemaBytes, err := os.ReadFile(schemaPath) //nolint:gosec // schemaPath is a test-local relative path
require.NoError(t, err)

compiler := jsonschema.NewCompiler()
var schemaDoc any
require.NoError(t, json.Unmarshal(schemaBytes, &schemaDoc))
require.NoError(t, compiler.AddResource(schemaPath, schemaDoc))
schema, err := compiler.Compile(schemaPath)
require.NoError(t, err)
schema := jsonschematest.Compile(t, filepath.Join("testdata", "dataloader_hang_verdict.schema.json"))

now := mustParseDLTime(t, "2026-06-01T10:00:00Z")
stalls := []patterns.TrainingStepStallRecord{
Expand Down Expand Up @@ -483,16 +473,7 @@ func TestDataLoaderHangVerdict_SchemaConformance(t *testing.T) {
func TestDataLoaderHangVerdict_SchemaRejectsDrift(t *testing.T) {
t.Parallel()

schemaPath := filepath.Join("testdata", "dataloader_hang_verdict.schema.json")
schemaBytes, err := os.ReadFile(schemaPath) //nolint:gosec // schemaPath is a test-local relative path
require.NoError(t, err)

compiler := jsonschema.NewCompiler()
var schemaDoc any
require.NoError(t, json.Unmarshal(schemaBytes, &schemaDoc))
require.NoError(t, compiler.AddResource(schemaPath, schemaDoc))
schema, err := compiler.Compile(schemaPath)
require.NoError(t, err)
schema := jsonschematest.Compile(t, filepath.Join("testdata", "dataloader_hang_verdict.schema.json"))

validEvidence := []any{
map[string]any{"kind": "training_step_stall", "uid": "u1", "timestamp": "2026-06-01T10:00:00Z", "description": "d"},
Expand Down
25 changes: 3 additions & 22 deletions module/pkg/patterns/hbm_ecc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,15 @@ package patterns_test

import (
"encoding/json"
"os"
"path/filepath"
"regexp"
"testing"
"time"

"github.com/santhosh-tekuri/jsonschema/v6"
"github.com/stretchr/testify/require"

"github.com/tracecoreai/tracecore/module/pkg/patterns"
"github.com/tracecoreai/tracecore/module/pkg/testutil/jsonschematest"
)

// hbm_ecc detector test suite. The detector reads DCGM-derived HBM
Expand Down Expand Up @@ -342,16 +341,7 @@ func TestHBMECCDetector_MostRecentECCWins(t *testing.T) {
func TestHBMECCVerdict_SchemaConformance(t *testing.T) {
t.Parallel()

schemaPath := filepath.Join("testdata", "hbm_ecc_verdict.schema.json")
schemaBytes, err := os.ReadFile(schemaPath) //nolint:gosec // schemaPath is a test-local relative path
require.NoError(t, err)

compiler := jsonschema.NewCompiler()
var schemaDoc any
require.NoError(t, json.Unmarshal(schemaBytes, &schemaDoc))
require.NoError(t, compiler.AddResource(schemaPath, schemaDoc))
schema, err := compiler.Compile(schemaPath)
require.NoError(t, err)
schema := jsonschematest.Compile(t, filepath.Join("testdata", "hbm_ecc_verdict.schema.json"))

eccAt := time.Unix(1_700_000_000, 0).UTC()
eccs := []patterns.HBMECCRecord{
Expand Down Expand Up @@ -379,16 +369,7 @@ func TestHBMECCVerdict_SchemaConformance(t *testing.T) {
func TestHBMECCVerdict_SchemaRejectsDrift(t *testing.T) {
t.Parallel()

schemaPath := filepath.Join("testdata", "hbm_ecc_verdict.schema.json")
schemaBytes, err := os.ReadFile(schemaPath) //nolint:gosec // schemaPath is a test-local relative path
require.NoError(t, err)

compiler := jsonschema.NewCompiler()
var schemaDoc any
require.NoError(t, json.Unmarshal(schemaBytes, &schemaDoc))
require.NoError(t, compiler.AddResource(schemaPath, schemaDoc))
schema, err := compiler.Compile(schemaPath)
require.NoError(t, err)
schema := jsonschematest.Compile(t, filepath.Join("testdata", "hbm_ecc_verdict.schema.json"))

validEvidence := []any{
map[string]any{"kind": "hw_error", "uid": "u1", "timestamp": "2026-05-18T10:00:00Z", "description": "d"},
Expand Down
25 changes: 3 additions & 22 deletions module/pkg/patterns/nccl_bootstrap_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,14 @@ package patterns_test

import (
"encoding/json"
"os"
"path/filepath"
"testing"
"time"

"github.com/santhosh-tekuri/jsonschema/v6"
"github.com/stretchr/testify/require"

"github.com/tracecoreai/tracecore/module/pkg/patterns"
"github.com/tracecoreai/tracecore/module/pkg/testutil/jsonschematest"
)

// nccl_bootstrap detector test suite (NORTHSTAR pattern #9). The
Expand Down Expand Up @@ -402,16 +401,7 @@ func TestNCCLBootstrapDetector_MaxPodReadyAnchorsEvidence(t *testing.T) {
func TestNCCLBootstrapVerdict_SchemaConformance(t *testing.T) {
t.Parallel()

schemaPath := filepath.Join("testdata", "nccl_bootstrap_verdict.schema.json")
schemaBytes, err := os.ReadFile(schemaPath) //nolint:gosec // schemaPath is a test-local relative path
require.NoError(t, err)

compiler := jsonschema.NewCompiler()
var schemaDoc any
require.NoError(t, json.Unmarshal(schemaBytes, &schemaDoc))
require.NoError(t, compiler.AddResource(schemaPath, schemaDoc))
schema, err := compiler.Compile(schemaPath)
require.NoError(t, err)
schema := jsonschematest.Compile(t, filepath.Join("testdata", "nccl_bootstrap_verdict.schema.json"))

now := mustParseTime(t, "2026-06-01T10:00:00Z")
readyAt := now.Add(-10 * time.Minute)
Expand Down Expand Up @@ -439,16 +429,7 @@ func TestNCCLBootstrapVerdict_SchemaConformance(t *testing.T) {
func TestNCCLBootstrapVerdict_SchemaRejectsDrift(t *testing.T) {
t.Parallel()

schemaPath := filepath.Join("testdata", "nccl_bootstrap_verdict.schema.json")
schemaBytes, err := os.ReadFile(schemaPath) //nolint:gosec // schemaPath is a test-local relative path
require.NoError(t, err)

compiler := jsonschema.NewCompiler()
var schemaDoc any
require.NoError(t, json.Unmarshal(schemaBytes, &schemaDoc))
require.NoError(t, compiler.AddResource(schemaPath, schemaDoc))
schema, err := compiler.Compile(schemaPath)
require.NoError(t, err)
schema := jsonschematest.Compile(t, filepath.Join("testdata", "nccl_bootstrap_verdict.schema.json"))

validEvidence := []any{
map[string]any{"kind": "training_pod", "uid": "u1", "timestamp": "2026-06-01T10:00:00Z", "description": "d"},
Expand Down
25 changes: 3 additions & 22 deletions module/pkg/patterns/nccl_hang_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,15 @@ package patterns_test

import (
"encoding/json"
"os"
"path/filepath"
"regexp"
"testing"
"time"

"github.com/santhosh-tekuri/jsonschema/v6"
"github.com/stretchr/testify/require"

"github.com/tracecoreai/tracecore/module/pkg/patterns"
"github.com/tracecoreai/tracecore/module/pkg/testutil/jsonschematest"
)

// nccl_hang detector test suite. The detector reads cross-rank NCCL
Expand Down Expand Up @@ -215,16 +214,7 @@ func TestNCCLHangDetector_LaterCompletedRecordSupersedes(t *testing.T) {
func TestNCCLHangVerdict_SchemaConformance(t *testing.T) {
t.Parallel()

schemaPath := filepath.Join("testdata", "nccl_hang_verdict.schema.json")
schemaBytes, err := os.ReadFile(schemaPath) //nolint:gosec // schemaPath is a test-local relative path
require.NoError(t, err)

compiler := jsonschema.NewCompiler()
var schemaDoc any
require.NoError(t, json.Unmarshal(schemaBytes, &schemaDoc))
require.NoError(t, compiler.AddResource(schemaPath, schemaDoc))
schema, err := compiler.Compile(schemaPath)
require.NoError(t, err)
schema := jsonschematest.Compile(t, filepath.Join("testdata", "nccl_hang_verdict.schema.json"))

now := time.Unix(1_700_000_600, 0).UTC()
stuckNs := now.Add(-10 * time.Minute).UnixNano()
Expand All @@ -249,16 +239,7 @@ func TestNCCLHangVerdict_SchemaConformance(t *testing.T) {
func TestNCCLHangVerdict_SchemaRejectsDrift(t *testing.T) {
t.Parallel()

schemaPath := filepath.Join("testdata", "nccl_hang_verdict.schema.json")
schemaBytes, err := os.ReadFile(schemaPath) //nolint:gosec // schemaPath is a test-local relative path
require.NoError(t, err)

compiler := jsonschema.NewCompiler()
var schemaDoc any
require.NoError(t, json.Unmarshal(schemaBytes, &schemaDoc))
require.NoError(t, compiler.AddResource(schemaPath, schemaDoc))
schema, err := compiler.Compile(schemaPath)
require.NoError(t, err)
schema := jsonschematest.Compile(t, filepath.Join("testdata", "nccl_hang_verdict.schema.json"))

validEvidence := []any{
map[string]any{"kind": "nccl_fr", "uid": "u1", "timestamp": "2026-05-18T10:00:00Z", "description": "d"},
Expand Down
Loading
Loading