diff --git a/module/receiver/ncclfrreceiver/README.md b/module/receiver/ncclfrreceiver/README.md
index 80d2d1d3..8ba16ad6 100644
--- a/module/receiver/ncclfrreceiver/README.md
+++ b/module/receiver/ncclfrreceiver/README.md
@@ -44,9 +44,14 @@ you can exercise the full receiver pipeline on a laptop:
 ```bash
 mkdir -p /tmp/nccl-fr
 tracecore failure-inject nccl-hang --out /tmp/nccl-fr/rank-0.pkl
-tracecore collect --config module/receiver/ncclfrreceiver/example_config.yaml
+./_build/tracecore --config=module/receiver/ncclfrreceiver/example_config.yaml
 ```
 
+> The legacy `tracecore collect` subcommand was removed in
+> [RFC-0013 PR-A2](../../../docs/rfcs/0013-distro-first-pivot.md);
+> the OCB-assembled binary takes `--config=` directly. Build via
+> `make build` from the repo root.
+
 The receiver picks up the synthesized hang dump within `poll_interval`
 and emits two records (a completed all-reduce + a started one that
 never completes) via the configured exporter.
diff --git a/module/receiver/ncclfrreceiver/example_config.yaml b/module/receiver/ncclfrreceiver/example_config.yaml
index a97af0c8..7bc3bffa 100644
--- a/module/receiver/ncclfrreceiver/example_config.yaml
+++ b/module/receiver/ncclfrreceiver/example_config.yaml
@@ -26,10 +26,14 @@ receivers:
     hw_id: gpu-0
 
 exporters:
-  stdoutexporter:
+  # RFC-0013 PR-A2 retired the in-tree `stdoutexporter`; the upstream
+  # `debug` exporter is the post-pivot successor. `verbosity: detailed`
+  # prints every attribute, matching the old stdoutexporter output.
+  debug:
+    verbosity: detailed
 
 service:
   pipelines:
     logs:
       receivers: [nccl_fr]
-      exporters: [stdoutexporter]
+      exporters: [debug]