diff --git a/module/pkg/patterns/checkpointer_hang.go b/module/pkg/patterns/checkpointer_hang.go index a8284087..4fb20477 100644 --- a/module/pkg/patterns/checkpointer_hang.go +++ b/module/pkg/patterns/checkpointer_hang.go @@ -7,6 +7,8 @@ import ( "sort" "strings" "time" + + "go.opentelemetry.io/collector/pdata/pcommon" ) // Design spec: docs/patterns/11-checkpointer-hang.md. The detector @@ -266,6 +268,32 @@ type CheckpointerHangVerdict struct { MissingLayers []string `json:"missing_layers,omitempty"` } +// Common returns the shared verdict scalars; satisfies the +// patterndetectorprocessor's verdictAttrer interface so the generic +// appendVerdict helper emits checkpointer_hang verdicts without +// per-pattern boilerplate. +func (v CheckpointerHangVerdict) Common() VerdictCommon { + return VerdictCommon{ + PatternID: v.PatternID, + Headline: v.Headline, + Remediation: v.Remediation, + EvidenceTrail: v.EvidenceTrail, + Kind: "checkpointer_hang", + } +} + +// PutAttrs stamps the checkpointer_hang-specific promoted scalars +// onto the verdict log record's attribute map (issue #270 contract). +func (v CheckpointerHangVerdict) PutAttrs(attrs pcommon.Map) { + attrs.PutStr(verdictAttrConfidence, string(v.Confidence)) + attrs.PutInt(verdictAttrCheckpointerStallSeconds, v.StallSeconds) + attrs.PutStr(verdictAttrCheckpointerPhase, string(v.Phase)) + attrs.PutStr(verdictAttrCheckpointerStorageBackend, v.StorageBackend) + putStrIfSet(attrs, verdictAttrK8sPodName, v.PodName) + putStrIfSet(attrs, verdictAttrK8sPodNamespace, v.PodNamespace) + putStrIfSet(attrs, verdictAttrK8sNodeName, v.Node) +} + // CheckpointerHangDetector is the checkpointer_hang pattern // detector (NORTHSTAR pattern #11). Zero-value usage is permitted // — BackwardWindow defaults to diff --git a/module/pkg/patterns/cuda_oom.go b/module/pkg/patterns/cuda_oom.go index 126994c5..0413770f 100644 --- a/module/pkg/patterns/cuda_oom.go +++ b/module/pkg/patterns/cuda_oom.go @@ -6,6 +6,8 @@ import ( "fmt" "sort" "time" + + "go.opentelemetry.io/collector/pdata/pcommon" ) // DefaultCUDAOOMCorrelationWindow is the maximum gap between a CUDA @@ -191,6 +193,37 @@ type CUDAOOMVerdict struct { MissingLayers []string `json:"missing_layers,omitempty"` } +// Common returns the shared verdict scalars; satisfies the +// patterndetectorprocessor's verdictAttrer interface so the generic +// appendVerdict helper emits cuda_oom verdicts without per-pattern +// boilerplate. +func (v CUDAOOMVerdict) Common() VerdictCommon { + return VerdictCommon{ + PatternID: v.PatternID, + Headline: v.Headline, + Remediation: v.Remediation, + EvidenceTrail: v.EvidenceTrail, + Kind: "cuda_oom", + } +} + +// PutAttrs stamps the cuda_oom-specific promoted scalars onto the +// verdict log record's attribute map (issue #270 contract). Empty- +// string scalars (GPUID/Node/Pod*) skip via putStrIfSet so dashboards +// filtering by attribute presence don't silently match empty-filter +// queries. +func (v CUDAOOMVerdict) PutAttrs(attrs pcommon.Map) { + attrs.PutStr(verdictAttrConfidence, string(v.Confidence)) + attrs.PutStr(verdictAttrCUDAOOMKind, string(v.Kind)) + attrs.PutInt(verdictAttrCUDAOOMTriedAllocBytes, v.TriedAllocBytes) + attrs.PutInt(verdictAttrCUDAOOMFBFreeBytes, v.FBFreeBytes) + attrs.PutDouble(verdictAttrCUDAOOMFBFreeRatio, v.FBFreeRatio) + putStrIfSet(attrs, verdictAttrPCIeAERGPUID, v.GPUID) + putStrIfSet(attrs, verdictAttrK8sNodeName, v.Node) + putStrIfSet(attrs, verdictAttrK8sPodName, v.PodName) + putStrIfSet(attrs, verdictAttrK8sPodNamespace, v.PodNamespace) +} + // CUDAOOMDetector is the cuda_oom pattern detector (NORTHSTAR pattern // #10). Zero-value usage is permitted — CorrelationWindow defaults to // DefaultCUDAOOMCorrelationWindow; FBFreeFragmentationThreshold diff --git a/module/pkg/patterns/dataloader_hang.go b/module/pkg/patterns/dataloader_hang.go index 91d879ee..f2f54bf7 100644 --- a/module/pkg/patterns/dataloader_hang.go +++ b/module/pkg/patterns/dataloader_hang.go @@ -6,6 +6,8 @@ import ( "fmt" "sort" "time" + + "go.opentelemetry.io/collector/pdata/pcommon" ) // Design spec: docs/patterns/07-dataloader-hang.md. The detector joins @@ -180,6 +182,45 @@ type DataLoaderHangVerdict struct { EventReason string `json:"k8s.event.reason,omitempty"` } +// Common returns the shared verdict scalars; satisfies the +// patterndetectorprocessor's verdictAttrer interface so the generic +// appendVerdict helper emits dataloader_hang verdicts without +// per-pattern boilerplate. +func (v DataLoaderHangVerdict) Common() VerdictCommon { + return VerdictCommon{ + PatternID: v.PatternID, + Headline: v.Headline, + Remediation: v.Remediation, + EvidenceTrail: v.EvidenceTrail, + Kind: "dataloader_hang", + } +} + +// PutAttrs stamps the dataloader_hang-specific promoted scalars onto +// the verdict log record's attribute map (issue #270 contract). The +// discriminator-conditional fields (worker_pid / error_class / +// event.reason) emit only on the matching discriminator branch so the +// wire-format contract documented in +// docs/patterns/07-dataloader-hang.md §"Verdict attributes" is +// honored. +func (v DataLoaderHangVerdict) PutAttrs(attrs pcommon.Map) { + attrs.PutStr(verdictAttrConfidence, string(v.Confidence)) + attrs.PutStr(verdictAttrDataLoaderHangDiscriminator, string(v.Discriminator)) + attrs.PutInt(verdictAttrDataLoaderHangStallSeconds, v.StallSeconds) + putStrIfSet(attrs, verdictAttrK8sPodName, v.PodName) + putStrIfSet(attrs, verdictAttrK8sPodNamespace, v.PodNamespace) + putStrIfSet(attrs, verdictAttrK8sNodeName, v.NodeName) + switch v.Discriminator { + case DataLoaderHangDiscriminatorWorkerKilled: + if v.WorkerPID > 0 { + attrs.PutInt(verdictAttrDataLoaderWorkerPID, v.WorkerPID) + } + putStrIfSet(attrs, verdictAttrDataLoaderErrorClass, v.ErrorClass) + case DataLoaderHangDiscriminatorStorageEvent: + putStrIfSet(attrs, verdictAttrK8sEventReason, v.EventReason) + } +} + // DataLoaderHangDetector is the dataloader_hang pattern detector // (NORTHSTAR pattern #7 per docs/patterns/07-dataloader-hang.md). // Zero-value usage is permitted — StallThreshold defaults to diff --git a/module/pkg/patterns/nccl_bootstrap.go b/module/pkg/patterns/nccl_bootstrap.go index 3aa763d8..74130ee5 100644 --- a/module/pkg/patterns/nccl_bootstrap.go +++ b/module/pkg/patterns/nccl_bootstrap.go @@ -6,6 +6,8 @@ import ( "fmt" "sort" "time" + + "go.opentelemetry.io/collector/pdata/pcommon" ) // Design spec: docs/patterns/09-nccl-bootstrap-timeout.md. The detector @@ -219,6 +221,33 @@ type NCCLBootstrapTimeoutVerdict struct { MissingLayers []string `json:"missing_layers,omitempty"` } +// Common returns the shared verdict scalars; satisfies the +// patterndetectorprocessor's verdictAttrer interface so the generic +// appendVerdict helper emits nccl_bootstrap verdicts without +// per-pattern boilerplate. +func (v NCCLBootstrapTimeoutVerdict) Common() VerdictCommon { + return VerdictCommon{ + PatternID: v.PatternID, + Headline: v.Headline, + Remediation: v.Remediation, + EvidenceTrail: v.EvidenceTrail, + Kind: "nccl_bootstrap", + } +} + +// PutAttrs stamps the nccl_bootstrap-specific promoted scalars onto +// the verdict log record's attribute map (issue #270 contract). +// JobID skips via putStrIfSet because the namespace-only fallback +// path leaves it empty. +func (v NCCLBootstrapTimeoutVerdict) PutAttrs(attrs pcommon.Map) { + attrs.PutStr(verdictAttrConfidence, string(v.Confidence)) + putStrIfSet(attrs, verdictAttrK8sNamespaceName, v.Namespace) + putStrIfSet(attrs, verdictAttrGenAITrainingJobID, v.JobID) + attrs.PutInt(verdictAttrNCCLBootstrapCohortSize, int64(v.CohortSize)) + attrs.PutInt(verdictAttrNCCLBootstrapFailedRankCount, int64(v.FailedRankCount)) + attrs.PutStr(verdictAttrNCCLBootstrapDiscriminator, string(v.Discriminator)) +} + // NCCLBootstrapDetector is the nccl_bootstrap_timeout pattern detector // (NORTHSTAR pattern #9, per docs/patterns/09-nccl-bootstrap-timeout.md). // Zero-value usage is permitted — BootstrapDeadline defaults to diff --git a/module/pkg/patterns/silent_data_corruption.go b/module/pkg/patterns/silent_data_corruption.go index a7cba9c6..1e8c0cfe 100644 --- a/module/pkg/patterns/silent_data_corruption.go +++ b/module/pkg/patterns/silent_data_corruption.go @@ -6,6 +6,8 @@ import ( "fmt" "sort" "time" + + "go.opentelemetry.io/collector/pdata/pcommon" ) // DefaultSDCAccuracyDropThreshold is the absolute accuracy regression @@ -212,6 +214,34 @@ type SilentDataCorruptionVerdict struct { MissingLayers []string `json:"missing_layers,omitempty"` } +// Common returns the shared verdict scalars; satisfies the +// patterndetectorprocessor's verdictAttrer interface so the generic +// appendVerdict helper emits silent_data_corruption verdicts without +// per-pattern boilerplate. +func (v SilentDataCorruptionVerdict) Common() VerdictCommon { + return VerdictCommon{ + PatternID: v.PatternID, + Headline: v.Headline, + Remediation: v.Remediation, + EvidenceTrail: v.EvidenceTrail, + Kind: "silent_data_corruption", + } +} + +// PutAttrs stamps the silent_data_corruption-specific promoted +// scalars onto the verdict log record's attribute map (issue #270 +// contract). SuspectGPUID/SuspectNode skip via putStrIfSet because +// the partial-confidence (Kind=accuracy_only) verdict carries no +// suspect hardware. +func (v SilentDataCorruptionVerdict) PutAttrs(attrs pcommon.Map) { + attrs.PutStr(verdictAttrConfidence, string(v.Confidence)) + attrs.PutStr(verdictAttrSDCKind, string(v.Kind)) + attrs.PutDouble(verdictAttrSDCAccuracyDrop, v.AccuracyDrop) + putStrIfSet(attrs, verdictAttrSDCSuspectGPUID, v.SuspectGPUID) + putStrIfSet(attrs, verdictAttrSDCSuspectNode, v.SuspectNode) + putStrIfSet(attrs, verdictAttrGenAITrainingJobID, v.JobID) +} + // SilentDataCorruptionDetector is the silent_data_corruption pattern // detector (pattern #13). Zero-value usage is permitted — // AccuracyDropThreshold defaults to DefaultSDCAccuracyDropThreshold; diff --git a/module/pkg/patterns/verdict.go b/module/pkg/patterns/verdict.go index 1516fe35..f21a3761 100644 --- a/module/pkg/patterns/verdict.go +++ b/module/pkg/patterns/verdict.go @@ -49,18 +49,17 @@ type VerdictCommon struct { Kind string } -// Verdict attribute keys shared across the seven patterns whose -// PutAttrs methods live in this package. Co-located with the verdict -// types so the wire-format key vocabulary is owned by the producer -// (patterns) rather than the consumer (patterndetectorprocessor) — -// the processor's existing private constants intentionally mirror -// these for the five other detectors that still build the attribute -// map inline. Keep the two sets in sync when the wire contract -// changes (see docs/ATTRIBUTES.md for the rename / removal policy). +// Verdict attribute keys owned by the producer (patterns package) so +// the wire-format vocabulary lives next to the verdict type that emits +// it. Every verdict's PutAttrs method stamps from this block; the +// patterndetectorprocessor's generic appendVerdict[V verdictAttrer] +// helper drives PutAttrs without needing to know the keys itself. See +// docs/ATTRIBUTES.md for the rename / removal policy. const ( verdictAttrConfidence = "pattern.confidence" verdictAttrK8sPodName = "k8s.pod.name" verdictAttrK8sPodNamespace = "k8s.pod.namespace" + verdictAttrK8sNamespaceName = "k8s.namespace.name" verdictAttrK8sNodeName = "k8s.node.name" verdictAttrK8sEventReason = "k8s.event.reason" verdictAttrNCCLPgID = "nccl.fr.pg_id" @@ -75,6 +74,36 @@ const ( verdictAttrIBHCADevice = "hw.network.ib.device" verdictAttrIBPortNum = "hw.network.ib.port.num" verdictAttrIBTransitionCount = "tracecore.alert.ib_link_flap.transition_count" + verdictAttrGenAITrainingJobID = "gen_ai.training.job_id" + + // CUDA OOM (pattern #10) promoted scalars. Customer-stable per + // docs/patterns/10-cuda-oom-deceptive.md §"Verdict attributes". + verdictAttrCUDAOOMKind = "cuda_oom.kind" + verdictAttrCUDAOOMTriedAllocBytes = "cuda_oom.tried_alloc_bytes" + verdictAttrCUDAOOMFBFreeBytes = "cuda_oom.fb_free_bytes" + verdictAttrCUDAOOMFBFreeRatio = "cuda_oom.fb_free_ratio" + + // Checkpointer hang promoted scalars. + verdictAttrCheckpointerStallSeconds = "tracecore.alert.checkpointer_hang.stall_seconds" + verdictAttrCheckpointerPhase = "tracecore.alert.checkpointer_hang.phase" + verdictAttrCheckpointerStorageBackend = "tracecore.alert.checkpointer_hang.storage_backend" + + // DataLoader hang promoted scalars. + verdictAttrDataLoaderHangDiscriminator = "tracecore.alert.dataloader_hang.discriminator" + verdictAttrDataLoaderHangStallSeconds = "tracecore.alert.dataloader_hang.stall_seconds" + verdictAttrDataLoaderWorkerPID = "dataloader.worker_pid" + verdictAttrDataLoaderErrorClass = "dataloader.error_class" + + // NCCL bootstrap promoted scalars. + verdictAttrNCCLBootstrapCohortSize = "tracecore.alert.nccl_bootstrap_timeout.cohort_size" + verdictAttrNCCLBootstrapFailedRankCount = "tracecore.alert.nccl_bootstrap_timeout.failed_rank_count" + verdictAttrNCCLBootstrapDiscriminator = "tracecore.alert.nccl_bootstrap_timeout.discriminator" + + // Silent data corruption promoted scalars. + verdictAttrSDCKind = "tracecore.alert.silent_data_corruption.kind" + verdictAttrSDCAccuracyDrop = "tracecore.alert.silent_data_corruption.accuracy_drop" + verdictAttrSDCSuspectGPUID = "tracecore.alert.silent_data_corruption.suspect_gpu_id" + verdictAttrSDCSuspectNode = "tracecore.alert.silent_data_corruption.suspect_node" ) // putStrIfSet skips empty-string scalars so dashboards filtering by diff --git a/module/processor/patterndetectorprocessor/checkpointer_hang.go b/module/processor/patterndetectorprocessor/checkpointer_hang.go index ef0eb534..e1e39c3b 100644 --- a/module/processor/patterndetectorprocessor/checkpointer_hang.go +++ b/module/processor/patterndetectorprocessor/checkpointer_hang.go @@ -12,26 +12,12 @@ import ( "github.com/tracecoreai/tracecore/module/pkg/patterns" ) -// Checkpointer hang (pattern #11) verdict-attribute namespace. The -// attribute keys mirror the docs/patterns/11-checkpointer-hang.md -// §"Verdict attributes" table. Promoted onto the verdict log record -// per the issue #270 scalar-promotion contract so dashboards -// table-aggregate on stall_seconds / phase / storage_backend -// without server-side JSON parsing of pattern.verdict_json. -const ( - // verdictAttrCheckpointerStallSeconds promotes the wall-clock - // stall duration so dashboards graph hang severity. - verdictAttrCheckpointerStallSeconds = "tracecore.alert.checkpointer_hang.stall_seconds" - - // verdictAttrCheckpointerPhase promotes the checkpoint phase - // (plan / write / barrier) so operators triage by phase - // (write ⇒ storage; barrier ⇒ slow rank; plan ⇒ metadata). - verdictAttrCheckpointerPhase = "tracecore.alert.checkpointer_hang.phase" - - // verdictAttrCheckpointerStorageBackend promotes the inferred - // storage backend (lustre / fsx / weka / nfs / s3 / unknown). - verdictAttrCheckpointerStorageBackend = "tracecore.alert.checkpointer_hang.storage_backend" -) +// Checkpointer hang (pattern #11) verdict-attribute keys live next +// to the verdict type — see module/pkg/patterns/verdict.go for the +// canonical constant block and CheckpointerHangVerdict.PutAttrs for +// the stamper. Co-locating keys + stamper with the producer side +// ensures the wire-format vocabulary is owned by the patterns +// package, not duplicated here. // Customer-stable attribute family the OTTL recipes (sibling to // future filelogreceiver + prometheusreceiver + journaldreceiver @@ -126,27 +112,6 @@ func projectStoragePressureRecord(lr plog.LogRecord, resAttrs pcommon.Map) (patt return r, true } -// appendCheckpointerHangVerdict emits a checkpointer_hang verdict -// log record. Promoted attrs: k8s.pod.name, k8s.pod.namespace, -// k8s.node.name, the stall_seconds / phase / storage_backend -// scalars, and pattern.confidence (issue #270 contract). -func appendCheckpointerHangVerdict(ld plog.Logs, v patterns.CheckpointerHangVerdict, logger *zap.Logger) { - appendVerdictRecord(ld, logger, verdictCommon{ - PatternID: v.PatternID, - Headline: v.Headline, - Remediation: v.Remediation, - EvidenceTrail: v.EvidenceTrail, - }, v, "checkpointer_hang", func(attrs pcommon.Map) { - attrs.PutStr(verdictAttrConfidence, string(v.Confidence)) - attrs.PutInt(verdictAttrCheckpointerStallSeconds, v.StallSeconds) - attrs.PutStr(verdictAttrCheckpointerPhase, string(v.Phase)) - attrs.PutStr(verdictAttrCheckpointerStorageBackend, v.StorageBackend) - putStrIfSet(attrs, verdictAttrK8sPodName, v.PodName) - putStrIfSet(attrs, verdictAttrK8sPodNamespace, v.PodNamespace) - putStrIfSet(attrs, verdictAttrK8sNodeName, v.Node) - }) -} - // collectCheckpointerHangInputs walks the incoming plog.Logs and // projects the three input shapes the detector consumes. Hoisted // into the checkpointer-hang-specific file so the cross-cutting @@ -217,7 +182,7 @@ func runCheckpointerHangDetector(ld plog.Logs, cfg *Config, emitPartial bool, lo if v.Confidence == patterns.ConfidencePartial && !emitPartial { continue } - appendCheckpointerHangVerdict(ld, v, logger) + appendVerdict(ld, v, logger) } } diff --git a/module/processor/patterndetectorprocessor/cuda_oom.go b/module/processor/patterndetectorprocessor/cuda_oom.go index 9fa48639..f9c7e5d5 100644 --- a/module/processor/patterndetectorprocessor/cuda_oom.go +++ b/module/processor/patterndetectorprocessor/cuda_oom.go @@ -12,31 +12,12 @@ import ( "github.com/tracecoreai/tracecore/module/pkg/patterns" ) -// CUDA OOM (pattern #10) verdict-attribute namespace. Promoted onto -// the verdict log record per issue #270 scalar-promotion contract so -// downstream LogQL / Grafana queries can table-aggregate by kind + -// alloc-size buckets without server-side parsing of -// pattern.verdict_json. The attribute keys mirror the customer-stable -// `cuda_oom.*` family the docs/patterns/10-cuda-oom-deceptive.md -// §"Verdict attributes" table pins. -const ( - // verdictAttrCUDAOOMKind promotes the cuda_oom discriminator - // branch (`fragmentation` / `true_oom` / `unknown`). - verdictAttrCUDAOOMKind = "cuda_oom.kind" - - // verdictAttrCUDAOOMTriedAllocBytes promotes the requested-bytes - // scalar so dashboards can branch on alloc-size buckets without - // parsing JSON. - verdictAttrCUDAOOMTriedAllocBytes = "cuda_oom.tried_alloc_bytes" - - // verdictAttrCUDAOOMFBFreeBytes promotes the free-bytes scalar - // observed via DCGM at the proximate FB sample. - verdictAttrCUDAOOMFBFreeBytes = "cuda_oom.fb_free_bytes" - - // verdictAttrCUDAOOMFBFreeRatio promotes the free-ratio scalar - // (FreeBytes / TotalBytes). Range [0, 1]. - verdictAttrCUDAOOMFBFreeRatio = "cuda_oom.fb_free_ratio" -) +// CUDA OOM (pattern #10) verdict-attribute keys live next to the +// verdict type — see module/pkg/patterns/verdict.go for the +// canonical constant block and CUDAOOMVerdict.PutAttrs for the +// stamper. Co-locating keys + stamper with the producer side ensures +// the wire-format vocabulary is owned by the patterns package, not +// duplicated here. // projectCUDAOOMLogRecord reads OTel attributes off a log record and // builds a patterns.CUDAOOMLogRecord. The projection's gate is the @@ -126,32 +107,6 @@ func projectFBMemoryRecord(lr plog.LogRecord, resAttrs pcommon.Map) (patterns.FB return r, true } -// appendCUDAOOMVerdict emits a cuda_oom verdict log record. Promoted -// attrs: gpu.id, k8s.pod.name, k8s.pod.namespace, k8s.node.name, -// cuda_oom.kind, cuda_oom.tried_alloc_bytes, cuda_oom.fb_free_bytes, -// cuda_oom.fb_free_ratio, pattern.confidence (issue #270 contract). -// -// putStrIfSet guards optional scalars so an empty-string upstream -// stamp doesn't silently match empty-filter dashboard queries. -func appendCUDAOOMVerdict(ld plog.Logs, v patterns.CUDAOOMVerdict, logger *zap.Logger) { - appendVerdictRecord(ld, logger, verdictCommon{ - PatternID: v.PatternID, - Headline: v.Headline, - Remediation: v.Remediation, - EvidenceTrail: v.EvidenceTrail, - }, v, "cuda_oom", func(attrs pcommon.Map) { - attrs.PutStr(verdictAttrConfidence, string(v.Confidence)) - attrs.PutStr(verdictAttrCUDAOOMKind, string(v.Kind)) - attrs.PutInt(verdictAttrCUDAOOMTriedAllocBytes, v.TriedAllocBytes) - attrs.PutInt(verdictAttrCUDAOOMFBFreeBytes, v.FBFreeBytes) - attrs.PutDouble(verdictAttrCUDAOOMFBFreeRatio, v.FBFreeRatio) - putStrIfSet(attrs, verdictAttrPCIeAERGPUID, v.GPUID) - putStrIfSet(attrs, verdictAttrK8sNodeName, v.Node) - putStrIfSet(attrs, verdictAttrK8sPodName, v.PodName) - putStrIfSet(attrs, verdictAttrK8sPodNamespace, v.PodNamespace) - }) -} - // collectCUDAOOMInputs walks the incoming plog.Logs and projects // cuda_oom + FB memory log records out. Hoisted into the // cuda_oom-specific file so the cross-cutting `collectInputs` doesn't @@ -222,7 +177,7 @@ func runCUDAOOMDetector(ld plog.Logs, cfg *Config, emitPartial bool, buffer *fbB if v.Confidence == patterns.ConfidencePartial && !emitPartial { continue } - appendCUDAOOMVerdict(ld, v, logger) + appendVerdict(ld, v, logger) } } diff --git a/module/processor/patterndetectorprocessor/dataloader_hang.go b/module/processor/patterndetectorprocessor/dataloader_hang.go index adf52c68..13d96011 100644 --- a/module/processor/patterndetectorprocessor/dataloader_hang.go +++ b/module/processor/patterndetectorprocessor/dataloader_hang.go @@ -12,33 +12,12 @@ import ( "github.com/tracecoreai/tracecore/module/pkg/patterns" ) -// DataLoader hang (pattern #7) verdict-attribute namespace. Promoted -// onto the verdict per issue #270 scalar-promotion contract so -// downstream LogQL / Grafana queries can table-aggregate by -// discriminator + stall-duration buckets without parsing -// pattern.verdict_json. Attribute keys mirror the docs/patterns/ -// 07-dataloader-hang.md §"Verdict attributes" table. -const ( - // verdictAttrDataLoaderHangDiscriminator promotes the - // discriminator branch (`worker_killed` | `storage_event`). - verdictAttrDataLoaderHangDiscriminator = "tracecore.alert.dataloader_hang.discriminator" - - // verdictAttrDataLoaderHangStallSeconds promotes the wall-clock - // stall duration scalar so dashboards can render a histogram - // without parsing JSON. - verdictAttrDataLoaderHangStallSeconds = "tracecore.alert.dataloader_hang.stall_seconds" - - // verdictAttrDataLoaderWorkerPID promotes the killed DataLoader - // worker PID when the discriminator is worker_killed AND the - // OTTL stanza extracted the pid. Mirrors the customer-stable - // `dataloader.worker_pid` attribute. - verdictAttrDataLoaderWorkerPID = "dataloader.worker_pid" - - // verdictAttrDataLoaderErrorClass promotes the OTTL-extracted - // error class string when the discriminator is worker_killed. - // Mirrors `dataloader.error_class`. - verdictAttrDataLoaderErrorClass = "dataloader.error_class" -) +// DataLoader hang (pattern #7) verdict-attribute keys live next to +// the verdict type — see module/pkg/patterns/verdict.go for the +// canonical constant block and DataLoaderHangVerdict.PutAttrs for +// the stamper. Co-locating keys + stamper with the producer side +// ensures the wire-format vocabulary is owned by the patterns +// package, not duplicated here. // Customer-stable input attribute keys the dataloader_hang // projections gate on. Hoisted into one place so a rename in the @@ -177,36 +156,6 @@ func projectStorageEventRecord(lr plog.LogRecord, resAttrs pcommon.Map) (pattern return r, true } -// appendDataLoaderHangVerdict emits a dataloader_hang verdict log -// record. Promoted attrs: pattern.confidence, the discriminator + -// stall-seconds scalars, and the discriminator-conditional -// (worker_pid / error_class / event.reason) fields per issue #270 -// scalar-promotion contract. -func appendDataLoaderHangVerdict(ld plog.Logs, v patterns.DataLoaderHangVerdict, logger *zap.Logger) { - appendVerdictRecord(ld, logger, verdictCommon{ - PatternID: v.PatternID, - Headline: v.Headline, - Remediation: v.Remediation, - EvidenceTrail: v.EvidenceTrail, - }, v, "dataloader_hang", func(attrs pcommon.Map) { - attrs.PutStr(verdictAttrConfidence, string(v.Confidence)) - attrs.PutStr(verdictAttrDataLoaderHangDiscriminator, string(v.Discriminator)) - attrs.PutInt(verdictAttrDataLoaderHangStallSeconds, v.StallSeconds) - putStrIfSet(attrs, verdictAttrK8sPodName, v.PodName) - putStrIfSet(attrs, verdictAttrK8sPodNamespace, v.PodNamespace) - putStrIfSet(attrs, verdictAttrK8sNodeName, v.NodeName) - switch v.Discriminator { - case patterns.DataLoaderHangDiscriminatorWorkerKilled: - if v.WorkerPID > 0 { - attrs.PutInt(verdictAttrDataLoaderWorkerPID, v.WorkerPID) - } - putStrIfSet(attrs, verdictAttrDataLoaderErrorClass, v.ErrorClass) - case patterns.DataLoaderHangDiscriminatorStorageEvent: - putStrIfSet(attrs, verdictAttrK8sEventReason, v.EventReason) - } - }) -} - // collectDataLoaderHangInputs walks the incoming plog.Logs and // projects the three dataloader_hang input shapes out. Hoisted into // the dataloader_hang-specific file so the cross-cutting @@ -271,7 +220,7 @@ func runDataLoaderHangDetector(ld plog.Logs, cfg *Config, logger *zap.Logger) { CorrelationWindow: dataloaderHangWindow(cfg), } for _, v := range det.Evaluate(stalls, errs, storage) { - appendDataLoaderHangVerdict(ld, v, logger) + appendVerdict(ld, v, logger) } } diff --git a/module/processor/patterndetectorprocessor/nccl_bootstrap.go b/module/processor/patterndetectorprocessor/nccl_bootstrap.go index d692e60f..2c88d538 100644 --- a/module/processor/patterndetectorprocessor/nccl_bootstrap.go +++ b/module/processor/patterndetectorprocessor/nccl_bootstrap.go @@ -14,51 +14,19 @@ import ( // NCCL bootstrap-timeout (pattern #9) processor surface. // -// Two input projections + one writer + one runner — same shape as the -// cuda_oom file (also a multi-input pattern): +// Two input projections + one runner — the writer is now the generic +// appendVerdict in patterndetector.go, driven by +// NCCLBootstrapTimeoutVerdict.Common() + PutAttrs(): // - projectTrainingPodRecord: k8sobjectsreceiver pod-Ready log lines // → patterns.TrainingPodRecord // - projectCNINetworkEventRecord: k8sobjectsreceiver Event log lines // with reason in {FailedCreatePodSandBox, NetworkNotReady, CNIError} // → patterns.CNINetworkEventRecord -// - appendNCCLBootstrapVerdict: writes one verdict log record with -// the promoted-scalar attrs per issue #270 -// - runNCCLBootstrapDetector: wires the detector against Config - -// NCCL bootstrap (pattern #9) verdict-attribute namespace. Promoted -// onto the verdict log record per issue #270 scalar-promotion contract -// so dashboards table-aggregate by namespace + discriminator + cohort -// size without server-side parsing of pattern.verdict_json. -const ( - // verdictAttrK8sNamespaceName promotes the k8s namespace onto the - // verdict log record. Reused for pod_evicted / xid_correlation - // would have benefited from the same vocabulary; today the field - // is pattern-#9-only because no other detector needed namespace - // disambiguation (pod-evicted promotes k8s.pod.namespace which is - // the same value via a different OTel-semconv name). - verdictAttrK8sNamespaceName = "k8s.namespace.name" - - // verdictAttrGenAITrainingJobID promotes the cohort's training- - // job id (the alpha gen_ai.training.job_id per ATTRIBUTES.md) onto - // the verdict. Empty (the namespace-only fallback) suppresses the - // stamp via putStrIfSet. - verdictAttrGenAITrainingJobID = "gen_ai.training.job_id" - - // verdictAttrNCCLBootstrapCohortSize promotes the cohort size - // scalar (number of distinct ranks the detector observed Ready) - // so dashboards can graph cohort fanout without parsing JSON. - verdictAttrNCCLBootstrapCohortSize = "tracecore.alert.nccl_bootstrap_timeout.cohort_size" - - // verdictAttrNCCLBootstrapFailedRankCount promotes the failed- - // rank-count scalar so dashboards can show "3 of 8 failed" without - // parsing JSON. - verdictAttrNCCLBootstrapFailedRankCount = "tracecore.alert.nccl_bootstrap_timeout.failed_rank_count" - - // verdictAttrNCCLBootstrapDiscriminator promotes the discriminator - // branch so dashboards can fan out by cause family (cni_error / - // socket_ifname_mismatch / rendezvous_unreachable / unknown). - verdictAttrNCCLBootstrapDiscriminator = "tracecore.alert.nccl_bootstrap_timeout.discriminator" -) +// - runNCCLBootstrapDetector: wires the detector against Config and +// emits via appendVerdict +// +// Wire-format attribute keys live next to the verdict type — see +// module/pkg/patterns/verdict.go for the canonical constant block. // projectTrainingPodRecord reads OTel attributes off a log record and // builds a patterns.TrainingPodRecord. The projection's gate is the @@ -222,28 +190,6 @@ func collectNCCLBootstrapInputs(ld plog.Logs) ([]patterns.TrainingPodRecord, []p return pods, cnis } -// appendNCCLBootstrapVerdict emits a nccl_bootstrap verdict log record. -// Promoted attrs: pattern.confidence, k8s.namespace.name, -// gen_ai.training.job_id, cohort_size, failed_rank_count, discriminator. -// -// putStrIfSet guards optional scalars (gen_ai.training.job_id is empty -// on the namespace-only fallback path). -func appendNCCLBootstrapVerdict(ld plog.Logs, v patterns.NCCLBootstrapTimeoutVerdict, logger *zap.Logger) { - appendVerdictRecord(ld, logger, verdictCommon{ - PatternID: v.PatternID, - Headline: v.Headline, - Remediation: v.Remediation, - EvidenceTrail: v.EvidenceTrail, - }, v, "nccl_bootstrap", func(attrs pcommon.Map) { - attrs.PutStr(verdictAttrConfidence, string(v.Confidence)) - putStrIfSet(attrs, verdictAttrK8sNamespaceName, v.Namespace) - putStrIfSet(attrs, verdictAttrGenAITrainingJobID, v.JobID) - attrs.PutInt(verdictAttrNCCLBootstrapCohortSize, int64(v.CohortSize)) - attrs.PutInt(verdictAttrNCCLBootstrapFailedRankCount, int64(v.FailedRankCount)) - attrs.PutStr(verdictAttrNCCLBootstrapDiscriminator, string(v.Discriminator)) - }) -} - // runNCCLBootstrapDetector wires the patterns.NCCLBootstrapDetector // against the processor config and emits one nccl_bootstrap verdict // per match. Consumes NCCL FR records from the cross-cutting @@ -269,7 +215,7 @@ func runNCCLBootstrapDetector( if v.Confidence == patterns.ConfidencePartial && !emitPartial { continue } - appendNCCLBootstrapVerdict(ld, v, logger) + appendVerdict(ld, v, logger) } } diff --git a/module/processor/patterndetectorprocessor/patterndetector.go b/module/processor/patterndetectorprocessor/patterndetector.go index 56ed39e1..f8d51c8c 100644 --- a/module/processor/patterndetectorprocessor/patterndetector.go +++ b/module/processor/patterndetectorprocessor/patterndetector.go @@ -114,10 +114,6 @@ const instrumentationScope = "github.com/tracecoreai/tracecore/module/processor/ // pattern library's PodEvictedVerdict shape. const verdictAttrPatternID = "pattern.id" -// verdictAttrConfidence is the canonical confidence attribute on an -// emitted verdict log record. -const verdictAttrConfidence = "pattern.confidence" - // verdictAttrHeadline is the operator-facing one-line summary. const verdictAttrHeadline = "pattern.headline" @@ -181,22 +177,16 @@ const ( // verdicts. verdictAttrThermalGPUCount = "hw.gpu.throttle.cascade_size" - // verdictAttrPCIeAERGPUID promotes the verdict's PCI BDF onto - // the emitted log record so dashboards can table-aggregate by - // GPU without parsing pattern.verdict_json (issue #270 - // scalar-promotion contract). Mirrors the customer-stable - // `gpu.id` contract from RFC-0013 §3 so a downstream LogQL - // query joins this verdict to raw metric/kernel telemetry on - // the same attribute name. Emitted on pcie_aer verdicts. - verdictAttrPCIeAERGPUID = "gpu.id" - - // PCIeAER and IBLinkFlap attribute keys (severity, type, drop_ratio, - // hca_device, port.num, transition_count) are now owned by the - // patterns package — see module/pkg/patterns/verdict.go for the - // canonical constant block. The per-pattern PutAttrs methods on - // patterns.PCIeAERVerdict and patterns.IBLinkFlapVerdict stamp them - // directly so the wire-format vocabulary lives next to the verdict - // type that owns it. + // Per-pattern verdict-attribute keys (PCIeAER, IBLinkFlap, + // CUDAOOM, CheckpointerHang, DataLoaderHang, NCCLBootstrap, + // SilentDataCorruption) are owned by the patterns package — see + // module/pkg/patterns/verdict.go for the canonical constant + // block. Each verdict's PutAttrs method stamps them directly so + // the wire-format vocabulary lives next to the verdict type that + // owns it. The keys above are the cross-pattern boilerplate + // (pattern.id / headline / remediation / verdict_json) plus the + // vocabulary the seven inline runners in this file consume via + // their typed verdict structs. ) // patterndetectorProcessor implements processor.Logs. Pure data-path @@ -527,14 +517,6 @@ func projectNodeCondition(lr plog.LogRecord, resAttrs pcommon.Map) (patterns.Nod return r, true } -// verdictCommon is a package-local alias for patterns.VerdictCommon -// so the five inline-stamp call sites in this package (cuda_oom, -// checkpointer_hang, sdc, nccl_bootstrap, dataloader_hang) keep their -// existing `verdictCommon{...}` literal shape. The canonical type -// lives in module/pkg/patterns/verdict.go alongside the verdict types -// whose Common() methods return it. -type verdictCommon = patterns.VerdictCommon - // verdictAttrer is the minimal interface a patterns.XxxVerdict must // satisfy for the generic appendVerdict helper to emit it. Common() // supplies the shared boilerplate scalars; PutAttrs stamps the @@ -546,47 +528,27 @@ type verdictAttrer interface { PutAttrs(pcommon.Map) } -// appendVerdict is the generic verdict emitter. Consolidates the -// seven near-identical per-pattern wrappers (PodEvicted, NCCLHang, -// XidCorrelation, HBMECC, ThermalThrottle, IBLinkFlap, PCIeAER) by -// delegating Common() + PutAttrs to the verdict type. Each verdict's -// JSON-tag honoring is preserved because V is the concrete struct -// type at the call site — json.Marshal sees the real fields, not the -// interface. +// appendVerdict is the generic verdict emitter. Consolidates all +// twelve detectors (PodEvicted, NCCLHang, XidCorrelation, HBMECC, +// ThermalThrottle, IBLinkFlap, PCIeAER, CUDAOOM, CheckpointerHang, +// DataLoaderHang, NCCLBootstrap, SilentDataCorruption) by delegating +// Common() + PutAttrs to the verdict type. Each verdict's JSON-tag +// honoring is preserved because V is the concrete struct type at the +// call site — json.Marshal sees the real fields, not the interface. +// +// Writes the shared verdict log-record boilerplate (rl/sl/lr alloc, +// scope, body, observed + event timestamps, common scalar attrs, +// JSON marshal + warn-on-fail). The diagnostic kind label for the +// warn-on-marshal-failure log is sourced from VerdictCommon.Kind (set +// by each verdict's Common() method) so the processor doesn't +// duplicate the verdict-type → label mapping. Per-pattern scalar +// promotion is the only per-detector knob — it's owned by the verdict +// type's PutAttrs method in module/pkg/patterns/. // -// The diagnostic kind label for the warn-on-marshal-failure log is -// sourced from VerdictCommon.Kind (set by each verdict's Common() -// method) so the processor doesn't duplicate the verdict-type → -// label mapping. +// A failure to marshal logs at warn and skips the JSON attribute; the +// broken-out scalars still emit. func appendVerdict[V verdictAttrer](ld plog.Logs, v V, logger *zap.Logger) { c := v.Common() - appendVerdictRecord(ld, logger, c, v, c.Kind, v.PutAttrs) -} - -// appendVerdictRecord writes the shared verdict log-record boilerplate -// (rl/sl/lr alloc, scope, body, observed + event timestamps, common -// scalar attrs, JSON marshal + warn-on-fail) and delegates the -// per-pattern scalar promotion to stampAttrs. -// -// The closure is what makes this NOT a leaky abstraction: each -// detector's promoted attrs are intentionally different (PodEvicted -// promotes Confidence + 4 k8s.* strings; NCCLHang promotes 3 ints; -// PCIeAER promotes 4 strings + 1 double + uses a different attribute -// vocabulary). Stamping is the irreducible per-pattern decision — -// the boilerplate around it isn't. -// -// vAny is the json.Marshal target so the generic stays single-purpose -// (boilerplate writer) and the marshaller sees the concrete struct -// for json-tag honoring. A failure to marshal logs at warn and skips -// the JSON attribute; the broken-out scalars still emit. -func appendVerdictRecord( - ld plog.Logs, - logger *zap.Logger, - c verdictCommon, - vAny any, - patternKindForLog string, - stampAttrs func(pcommon.Map), -) { rl := ld.ResourceLogs().AppendEmpty() sl := rl.ScopeLogs().AppendEmpty() sl.Scope().SetName(instrumentationScope) @@ -603,12 +565,12 @@ func appendVerdictRecord( attrs.PutStr(verdictAttrPatternID, c.PatternID) attrs.PutStr(verdictAttrHeadline, c.Headline) attrs.PutStr(verdictAttrRemediation, c.Remediation) - stampAttrs(attrs) - if b, err := json.Marshal(vAny); err == nil { + v.PutAttrs(attrs) + if b, err := json.Marshal(v); err == nil { attrs.PutStr(verdictAttrVerdictJSON, string(b)) } else if logger != nil { logger.Warn("patterndetector: failed to marshal verdict JSON; broken-out attrs still emit", - zap.String("pattern", patternKindForLog), zap.Error(err)) + zap.String("pattern", c.Kind), zap.Error(err)) } } @@ -621,15 +583,3 @@ func firstEvidenceTimestamp(trail []patterns.EvidenceRef) time.Time { } return time.Time{} } - -// putStrIfSet is a small guard so an unpopulated optional scalar -// (e.g. an evicted pod with no namespace on a malformed event) -// doesn't stamp an empty-string attribute on the verdict record. -// Dashboards filter by attribute presence; the empty-string variant -// would silently match empty-filter queries. -func putStrIfSet(attrs pcommon.Map, key, value string) { - if value == "" { - return - } - attrs.PutStr(key, value) -} diff --git a/module/processor/patterndetectorprocessor/silent_data_corruption.go b/module/processor/patterndetectorprocessor/silent_data_corruption.go index d9e844f0..d3310e1e 100644 --- a/module/processor/patterndetectorprocessor/silent_data_corruption.go +++ b/module/processor/patterndetectorprocessor/silent_data_corruption.go @@ -12,43 +12,12 @@ import ( "github.com/tracecoreai/tracecore/module/pkg/patterns" ) -// Silent data corruption (pattern #13) verdict-attribute namespace. -// Promoted onto the verdict log record per issue #270 scalar-promotion -// contract so downstream LogQL / Grafana queries can table-aggregate -// by job + suspect-GPU + kind without server-side parsing of -// pattern.verdict_json. Attribute keys mirror the customer-stable -// `tracecore.alert.silent_data_corruption.*` family the -// docs/patterns/13-silent-data-corruption.md §"Verdict attributes" -// table pins. -const ( - // verdictAttrSDCKind promotes the silent_data_corruption - // discriminator branch (`vendor_signaled` / `accuracy_only`). - verdictAttrSDCKind = "tracecore.alert.silent_data_corruption.kind" - - // verdictAttrSDCAccuracyDrop promotes the absolute accuracy drop - // (baseline - observed) so dashboards can branch on - // regression-magnitude buckets without parsing JSON. Range - // [0, 1]. - verdictAttrSDCAccuracyDrop = "tracecore.alert.silent_data_corruption.accuracy_drop" - - // verdictAttrSDCSuspectGPUID promotes the PCI BDF of the GPU - // whose vendor SDC counter rose during the job window. Empty - // when kind=accuracy_only — guarded by putStrIfSet so the - // attribute is omitted (no empty-string false-match on dashboard - // filters). - verdictAttrSDCSuspectGPUID = "tracecore.alert.silent_data_corruption.suspect_gpu_id" - - // verdictAttrSDCSuspectNode promotes the Kubernetes node name - // carrying the suspect GPU. Empty when kind=accuracy_only. - verdictAttrSDCSuspectNode = "tracecore.alert.silent_data_corruption.suspect_node" - - // verdictAttrSDCJobID promotes the affected training job id. - // Mirrors the customer-stable `gen_ai.training.job_id` attribute - // the eval-pipeline OTTL recipe emits — same name on the verdict - // as on the underlying input record so a LogQL query joins - // without renaming. - verdictAttrSDCJobID = "gen_ai.training.job_id" -) +// Silent data corruption (pattern #13) verdict-attribute keys live +// next to the verdict type — see module/pkg/patterns/verdict.go for +// the canonical constant block and SilentDataCorruptionVerdict.PutAttrs +// for the stamper. Co-locating keys + stamper with the producer side +// ensures the wire-format vocabulary is owned by the patterns +// package, not duplicated here. // projectEvalAccuracyRecord reads OTel attributes off a log record // and builds a patterns.EvalAccuracyRecord. The projection's gate is @@ -158,33 +127,6 @@ func projectSDCCounterRecord(lr plog.LogRecord, resAttrs pcommon.Map) (patterns. return r, true } -// appendSilentDataCorruptionVerdict emits a silent_data_corruption -// verdict log record. Promoted attrs: pattern.confidence, -// tracecore.alert.silent_data_corruption.{kind,accuracy_drop, -// suspect_gpu_id,suspect_node}, gen_ai.training.job_id (issue #270 -// scalar-promotion contract). -// -// putStrIfSet guards optional scalars (suspect_gpu_id, suspect_node) -// so the partial-confidence (kind=accuracy_only) verdict — which -// carries no suspect hardware — does not stamp empty-string -// attributes that would silently match empty-filter dashboard -// queries. -func appendSilentDataCorruptionVerdict(ld plog.Logs, v patterns.SilentDataCorruptionVerdict, logger *zap.Logger) { - appendVerdictRecord(ld, logger, verdictCommon{ - PatternID: v.PatternID, - Headline: v.Headline, - Remediation: v.Remediation, - EvidenceTrail: v.EvidenceTrail, - }, v, "silent_data_corruption", func(attrs pcommon.Map) { - attrs.PutStr(verdictAttrConfidence, string(v.Confidence)) - attrs.PutStr(verdictAttrSDCKind, string(v.Kind)) - attrs.PutDouble(verdictAttrSDCAccuracyDrop, v.AccuracyDrop) - putStrIfSet(attrs, verdictAttrSDCSuspectGPUID, v.SuspectGPUID) - putStrIfSet(attrs, verdictAttrSDCSuspectNode, v.SuspectNode) - putStrIfSet(attrs, verdictAttrSDCJobID, v.JobID) - }) -} - // collectSDCInputs walks the incoming plog.Logs and projects // eval-accuracy + SDC-counter log records out. Hoisted into the // silent_data_corruption-specific file (mirroring cuda_oom's @@ -238,7 +180,7 @@ func runSDCDetector(ld plog.Logs, cfg *Config, emitPartial bool, logger *zap.Log if v.Confidence == patterns.ConfidencePartial && !emitPartial { continue } - appendSilentDataCorruptionVerdict(ld, v, logger) + appendVerdict(ld, v, logger) } }