From 18312be1ade70e5f91fc648d4b086332bea28889 Mon Sep 17 00:00:00 2001 From: Yan Cui Date: Sat, 7 Mar 2026 15:39:41 -0800 Subject: [PATCH] Expose comms_id in traces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: # Context Add a comms_id to PyTorch profiler traces that uniquely identifies each collective/P2P communication operation across all ranks. This enables trace analysis tools to correlate the same operation across different ranks for debugging distributed training performance. How comms_id is computed comms_id = hash(pg_name, seqNumber, isP2P, globalRankStart, globalRankStride, worldSize) - pg_name — identifies the process group - seqNumber — per-PG operation counter, identifies which operation within the PG - isP2P — distinguishes P2P ops (send/recv) from collectives (allreduce, etc.), since they use separate sequence number counters - globalRankStart, globalRankStride, worldSize — encodes the communicator topology, disambiguating cases where one PG creates multiple communicators (e.g., comm splits) Changes by layer 1. Data model (ParamCommsUtils.hpp/.cpp) — Added seqNumber, isP2P fields to ParamCommsDebugInfo, the class that carries communication metadata through the profiling stack. 2. Hash computation (profiler/util.cpp/.h) — In saveNcclMeta(), computes comms_id from the 6 fields above and emits it as "Comms Id" in the profiler metadata map. 3. Trace output (output_json.cpp) — Kineto reads "Comms Id" from the metadata and writes it into the Chrome trace JSON, making it visible in trace viewers. 4. Tests (comms_id.cpp, CuptiActivityProfilerTest.cpp) — 9 unit tests covering: - Storage/retrieval of seqNumber and isP2P - Default values - End-to-end: comms_id appears in saveNcclMeta() output with correct hash - Determinism across instances - Uniqueness across different PG names, sequence numbers, P2P vs collective, and communicator topologies Differential Revision: D95659539 --- libkineto/src/output_json.cpp | 10 ++++++++++ libkineto/test/CuptiActivityProfilerTest.cpp | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/libkineto/src/output_json.cpp b/libkineto/src/output_json.cpp index cf26b5e8d..d7d9a250f 100644 --- a/libkineto/src/output_json.cpp +++ b/libkineto/src/output_json.cpp @@ -49,6 +49,7 @@ static constexpr const std::string_view kOutTensorsStart = static constexpr const std::string_view kRank = "Rank"; static constexpr const std::string_view kP2pSrc = "Src Rank"; static constexpr const std::string_view kP2pDst = "Dst Rank"; +static constexpr const std::string_view kCommsId = "Comms Id"; #ifdef __linux__ static constexpr std::string_view kDefaultLogFileFmt = @@ -555,6 +556,15 @@ void ChromeTraceLogger::handleActivity(const libkineto::ITraceActivity& op) { arg_values.append(fmt::format(", \"{}\": {}", kP2pSrc, srcRank)); } + const auto& commsId = + collectiveRecord->getMetadataValue(std::string(kCommsId)); + if (!commsId.empty()) { + if (!arg_values.empty()) { + arg_values.append(","); + } + arg_values.append(fmt::format(" \"{}\": {}", kCommsId, commsId)); + } + if (distInfo_.backend.empty() && processGroupDesc == "\"default_pg\"") { distInfo_.backend = "nccl"; distInfo_.rank = collectiveRecord->getMetadataValue(std::string(kRank)); diff --git a/libkineto/test/CuptiActivityProfilerTest.cpp b/libkineto/test/CuptiActivityProfilerTest.cpp index bd886dcd2..dabe29a37 100644 --- a/libkineto/test/CuptiActivityProfilerTest.cpp +++ b/libkineto/test/CuptiActivityProfilerTest.cpp @@ -52,6 +52,7 @@ static constexpr auto kGroupSize = "Group size"; static constexpr const char* kProcessGroupName = "Process Group Name"; static constexpr const char* kProcessGroupDesc = "Process Group Description"; static constexpr const char* kGroupRanks = "Process Group Ranks"; +static constexpr const char* kCommsId = "Comms Id"; static constexpr int32_t kTruncatLength = 30; #define CUDA_LAUNCH_KERNEL CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000 @@ -663,6 +664,7 @@ TEST_F(CuptiActivityProfilerTest, GpuNCCLCollectiveTest) { metadataMap.emplace(kGroupSize, "2"); metadataMap.emplace(kProcessGroupName, fmt::format("\"{}\"", "12341234")); metadataMap.emplace(kProcessGroupDesc, fmt::format("\"{}\"", "test_purpose")); + metadataMap.emplace(kCommsId, "12345678"); std::vector inSplitSizes(50, 0); std::string inSplitSizesStr; @@ -804,6 +806,8 @@ TEST_F(CuptiActivityProfilerTest, GpuNCCLCollectiveTest) { EXPECT_EQ(2, countSubstrings(jsonString, "test_purpose")); EXPECT_EQ(2, countSubstrings(jsonString, kGroupRanks)); EXPECT_EQ(2, countSubstrings(jsonString, expectedGroupRanksStr)); + EXPECT_EQ(2, countSubstrings(jsonString, kCommsId)); + EXPECT_EQ(2, countSubstrings(jsonString, "12345678")); #endif }