Skip to content
Open
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
33f1850
Fix logging of XPUPTI_BUILD_FLAG
tsocha Jan 27, 2026
8fd8eff
Add support for collective comunication events.
tsocha Mar 30, 2026
f4b4c93
Add support for synchronization events
tsocha Mar 17, 2026
4b7d3de
🤖 Add integration tests for communication and synchronization handlers
tsocha Mar 30, 2026
32fb645
🤖 Use gtest_add_tests to fix MSVC test discovery
tsocha Mar 20, 2026
6871698
🤖 Log warning instead of throwing when PTI_VIEW_COMMUNICATION is unsu…
tsocha Mar 20, 2026
4e12909
🤖 Add XPU_SYNC ActivityType for device synchronization events
tsocha Mar 20, 2026
51a83ee
🤖 Fix Windows linking for XpuptiProfilerTestLib
tsocha Mar 30, 2026
4cde5a1
Fix compilation on MSVC
tsocha Mar 23, 2026
38efe31
Apply suggestions from 🤖 code review
tsocha Mar 24, 2026
ff15033
Self-review I
tsocha Mar 24, 2026
477dea0
🤖 AI review I
tsocha Mar 24, 2026
7d36e18
AI Review II
tsocha Mar 25, 2026
52d795a
remove else as clang-tidy suggests.
tsocha Mar 25, 2026
057ef30
Add information about env flag for the user
tsocha Mar 30, 2026
8d173de
Merge branch 'pytorch:main' into dev/tsocha/oneccl
tsocha Apr 7, 2026
7ce741e
Merge remote-tracking branch 'origin/main' into upstream-oneccl
tsocha Apr 9, 2026
afcb0ed
🤖 Fix Windows build: Guard Linux-only linker flag
tsocha Apr 9, 2026
274a559
Don't change order of Activity types
tsocha Apr 9, 2026
7aee399
Merge remote-tracking branch 'staging/dev/tsocha/oneccl' into upstrea…
tsocha Apr 9, 2026
66ea51c
Remove duplicated list
tsocha Apr 9, 2026
1037218
Merge branch 'pytorch:main' into dev/tsocha/oneccl
tsocha Apr 13, 2026
e9c88e4
Fix order of activities
tsocha Apr 14, 2026
3bbe1ac
Merge branch 'pytorch:main' into dev/tsocha/oneccl
tsocha Apr 20, 2026
014433b
Merge remote-tracking branch 'origin/main' into upstream-oneccl
tsocha May 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions libkineto/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ For more information on how to run on-demand profiling, please refer to the Dyno

The default trace output is a JSON file that can be visualized in Chrome Trace Viewer or Perfetto. The trace output is generated by the `ChromeTraceLogger` instance. The `ChromeTraceLogger` writes to a JSON file using `std::ofstream` in `output_json.cpp` to maximize performance during export. This instance is created by the `ActivityProfilerController` and is stored in the `ActivityLoggerFactory` alongside its protocol. Using this schema, Kineto supports multiple trace output formats.

- Intel XCCL: to enable collection of oneCCL host events, the `INTEL_LIBITTNOTIFY64` environment variable must be set to the path of `pti_view.so`.

## Full documentation
We strive to keep our source files readable. The best and up-to-date
documentation for implementation specifics is available in the source files.
Expand Down
5 changes: 4 additions & 1 deletion libkineto/include/ActivityType.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,9 @@ enum class ActivityType {
PRIVATEUSE1_RUNTIME = 24, // host side privateUse1 runtime events
PRIVATEUSE1_DRIVER = 25, // host side privateUse1 driver events

ENUM_COUNT = 26, // This is to add buffer and not used for any profiling logic. Add
XPU_SYNC = 26, // XPU synchronization events

ENUM_COUNT = 27, // This is to add buffer and not used for any profiling logic. Add
// your new type before it.
OPTIONAL_ACTIVITY_TYPE_START = GLOW_RUNTIME,
};
Expand Down Expand Up @@ -94,6 +96,7 @@ inline constexpr std::array<_ActivityTypeName, activityTypeCount + 1> _activityT
{"collective_comm", ActivityType::COLLECTIVE_COMM},
{"privateuse1_runtime", ActivityType::PRIVATEUSE1_RUNTIME},
{"privateuse1_driver", ActivityType::PRIVATEUSE1_DRIVER},
{"xpu_sync", ActivityType::XPU_SYNC},
{"ENUM_COUNT", ActivityType::ENUM_COUNT},
}};

Expand Down
3 changes: 2 additions & 1 deletion libkineto/src/plugin/xpupti/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ if(TARGET Pti::pti_view)
list(APPEND XPUPTI_INCLUDE_DIR ${PTI_INCLUDE_DIR})
set(XPUPTI_INCLUDE_DIR ${XPUPTI_INCLUDE_DIR} PARENT_SCOPE)

set(XPUPTI_BUILD_FLAG "-DHAS_XPUPTI" PARENT_SCOPE)
set(XPUPTI_BUILD_FLAG "-DHAS_XPUPTI")
Copy link
Copy Markdown

@gujinghui gujinghui May 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tsocha with this PR #1337 , do we still need to add the flag here?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#1337 removes only part of the HAS_XPUPTI usage.
My change fixes the logging of this variable.
Currently it always produces an empty string in the logs.

set(XPUPTI_BUILD_FLAG ${XPUPTI_BUILD_FLAG} PARENT_SCOPE)

message(STATUS " XPU_xpupti_LIBRARY = ${XPU_xpupti_LIBRARY}")
message(STATUS " XPUPTI_INCLUDE_DIR = ${XPUPTI_INCLUDE_DIR}")
Expand Down
29 changes: 29 additions & 0 deletions libkineto/src/plugin/xpupti/XpuptiActivityApi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
*/

#include "XpuptiActivityApi.h"
#include "Logger.h"

#include <chrono>
#include <stdexcept>
Expand Down Expand Up @@ -249,6 +250,20 @@ void XpuptiActivityApi::enableXpuptiActivities(
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_COLLECTION_OVERHEAD));
break;

#if PTI_VERSION_AT_LEAST(0, 17)
Copy link
Copy Markdown

@gujinghui gujinghui Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we follow the similar design in #1174 , instead of introducing many #if here? @moksiuc is able to give more details.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you be more precise here?
This change is consistent with this function: example in line 233.

This PR has 7 occurrences of #if PTI_VERSION_AT_LEAST(0, 17):

  • 2x ptiViewEnable and ptiViewDisable. It's consistent within the function handling PTI versions.
  • 3x in event handlers .cpp and .h for XpuptiActivityProfilerSession::handleCommunicationActivity.
    I could move .cpp part to another file but it's a single function.
  • 2x in tests.

I think that I can't use @moksiuc design here.
Most of these #ifs are here because it's the new feature in PTI.

@moksiuc could you confirm that?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At first glance I don't see how new code could be extracted to separate files or classes except separate Handlers file.

case ActivityType::COLLECTIVE_COMM: {
auto rc = ptiViewEnable(PTI_VIEW_COMMUNICATION);
if (rc != PTI_SUCCESS) {
LOG(WARNING) << "Failed to enable PTI_VIEW_COMMUNICATION: "
<< ptiResultTypeToString(rc);
}
break;
}
#endif
case ActivityType::XPU_SYNC:
XPUPTI_CALL(ptiViewEnable(PTI_VIEW_DEVICE_SYNCHRONIZATION));
break;

default:
break;
}
Expand Down Expand Up @@ -293,6 +308,20 @@ void XpuptiActivityApi::disablePtiActivities(
XPUPTI_CALL(ptiViewDisable(PTI_VIEW_COLLECTION_OVERHEAD));
break;

#if PTI_VERSION_AT_LEAST(0, 17)
case ActivityType::COLLECTIVE_COMM: {
auto rc = ptiViewDisable(PTI_VIEW_COMMUNICATION);
if (rc != PTI_SUCCESS) {
LOG(WARNING) << "Failed to disable PTI_VIEW_COMMUNICATION: "
<< ptiResultTypeToString(rc);
}
break;
}
#endif
case ActivityType::XPU_SYNC:
XPUPTI_CALL(ptiViewDisable(PTI_VIEW_DEVICE_SYNCHRONIZATION));
break;

default:
break;
}
Expand Down
144 changes: 126 additions & 18 deletions libkineto/src/plugin/xpupti/XpuptiActivityHandlers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,18 +95,6 @@ inline void XpuptiActivityProfilerSession::handleCorrelationActivity(
}
}

// Resolves the human-readable API name for a PTI API record.
//
// With PTI >= 0.11 the name is looked up via ptiViewGetApiIdName() from the
// record's API group and API id; for older PTI versions the name is read
// directly from the record's _name field.
//
// @param activity PTI API record to name; must not be null.
// @return The API name, or an empty string if the PTI lookup left the name
//         pointer null (constructing std::string from a null pointer is
//         undefined behavior, so that case is handled explicitly).
std::string XpuptiActivityProfilerSession::getApiName(
    const pti_view_record_api_t* activity) {
#if PTI_VERSION_AT_LEAST(0, 11)
  const char* api_name = nullptr;
  XPUPTI_CALL(
      ptiViewGetApiIdName(activity->_api_group, activity->_api_id, &api_name));
  // api_name starts out null; only build a string once PTI has filled it in.
  return api_name != nullptr ? std::string(api_name) : std::string();
#else
  return std::string(activity->_name);
#endif
}

inline std::string memcpyName(
pti_view_memcpy_type kind,
pti_view_memory_type src,
Expand Down Expand Up @@ -183,8 +171,6 @@ void XpuptiActivityProfilerSession::handleRuntimeKernelMemcpyMemsetActivities(

traceBuffer_.span.opCount += 1;
traceBuffer_.gpuOpCount += 1;
const ITraceActivity* linked =
linkedActivity(activity->_correlation_id, cpuCorrelationMap_);

if constexpr (handleRuntimeActivities) {
traceBuffer_.emplace_activity(
Expand All @@ -209,11 +195,14 @@ void XpuptiActivityProfilerSession::handleRuntimeKernelMemcpyMemsetActivities(

trace_activity->startTime = activity->_start_timestamp;
trace_activity->endTime = activity->_end_timestamp;
trace_activity->id = activity->_correlation_id;
trace_activity->threadId = activity->_thread_id;
trace_activity->flow.id = activity->_correlation_id;
trace_activity->flow.type = libkineto::kLinkAsyncCpuGpu;
trace_activity->linked = linked;

trace_activity->id = activity->_correlation_id;
trace_activity->linked =
linkedActivity(activity->_correlation_id, cpuCorrelationMap_);
trace_activity->addMetadata("correlation", activity->_correlation_id);

if constexpr (handleRuntimeActivities) {
trace_activity->device = activity->_process_id;
Expand Down Expand Up @@ -263,8 +252,6 @@ void XpuptiActivityProfilerSession::handleRuntimeKernelMemcpyMemsetActivities(
"l0 queue", handleToHexString(activity->_queue_handle));
}

trace_activity->addMetadata("correlation", activity->_correlation_id);

if constexpr (handleKernelActivities) {
if (activity->_source_file_name) {
trace_activity->addMetadataQuoted(
Expand Down Expand Up @@ -293,6 +280,116 @@ void XpuptiActivityProfilerSession::handleRuntimeKernelMemcpyMemsetActivities(
trace_activity->log(logger);
}

namespace {
std::string getStringFromSynchronizationType(
const pti_view_synchronization_type& synchronization_type) {
using pv_st = pti_view_synchronization_type;
static const std::unordered_map<pv_st, std::string> name_map{
{pv_st::PTI_VIEW_SYNCHRONIZATION_TYPE_UNKNOWN, "UNKNOWN"},
{pv_st::PTI_VIEW_SYNCHRONIZATION_TYPE_GPU_BARRIER_EXECUTION,
"GPU_BARRIER_EXECUTION"},
{pv_st::PTI_VIEW_SYNCHRONIZATION_TYPE_GPU_BARRIER_MEMORY,
"GPU_BARRIER_MEMORY"},
{pv_st::PTI_VIEW_SYNCHRONIZATION_TYPE_HOST_FENCE, "HOST_FENCE"},
{pv_st::PTI_VIEW_SYNCHRONIZATION_TYPE_HOST_EVENT, "HOST_EVENT"},
{pv_st::PTI_VIEW_SYNCHRONIZATION_TYPE_HOST_COMMAND_LIST,
"HOST_COMMAND_LIST"},
{pv_st::PTI_VIEW_SYNCHRONIZATION_TYPE_HOST_COMMAND_QUEUE,
"HOST_COMMAND_QUEUE"},
};

const auto& name_string = name_map.find(synchronization_type);
if (name_string == name_map.end()) {
const std::string error_message =
"404: Not found string literal for this synchronization type: " +
std::to_string(synchronization_type);
return error_message;
}
return name_string->second;
}
} // namespace

void XpuptiActivityProfilerSession::handleSynchronizationActivity(
const pti_view_record_synchronization* activity,
ActivityLogger& logger) {
const auto& activity_record = *activity;
const auto record_name = getApiName(activity);

const bool isGpuSync =
activity_record._synch_type == PTI_VIEW_SYNCHRONIZATION_TYPE_GPU_BARRIER_EXECUTION ||
activity_record._synch_type == PTI_VIEW_SYNCHRONIZATION_TYPE_GPU_BARRIER_MEMORY;

traceBuffer_.span.opCount += 1;
if (isGpuSync) {
traceBuffer_.gpuOpCount += 1;
}
traceBuffer_.emplace_activity(traceBuffer_.span, ActivityType::XPU_SYNC, record_name);
auto& synchronization_activity = *(traceBuffer_.activities.back());

synchronization_activity.startTime = activity_record._start_timestamp;
synchronization_activity.endTime = activity_record._end_timestamp;
synchronization_activity.device = -1;
synchronization_activity.resource = activity_record._thread_id;
synchronization_activity.threadId = activity_record._thread_id;

synchronization_activity.id = activity->_correlation_id;
synchronization_activity.linked =
linkedActivity(activity->_correlation_id, cpuCorrelationMap_);
synchronization_activity.addMetadata(
"correlation", activity_record._correlation_id);

synchronization_activity.addMetadataQuoted(
"Type", getStringFromSynchronizationType(activity_record._synch_type));
synchronization_activity.addMetadataQuoted("Context_handle", handleToHexString(activity_record._context_handle));
synchronization_activity.addMetadataQuoted("Queue_handle", handleToHexString(activity_record._queue_handle));
synchronization_activity.addMetadataQuoted("Event_handle", handleToHexString(activity_record._event_handle));
synchronization_activity.addMetadata("Number_wait_events", activity_record._number_wait_events);
synchronization_activity.addMetadata("Return_code", activity_record._return_code);

if (outOfRange(&synchronization_activity)) {
traceBuffer_.span.opCount -= 1;
if (isGpuSync) {
traceBuffer_.gpuOpCount -= 1;
}
removeCorrelatedPtiActivities(&synchronization_activity);
traceBuffer_.activities.pop_back();
return;
}

synchronization_activity.log(logger);
}

#if PTI_VERSION_AT_LEAST(0, 17)
// Converts one PTI communication record into an
// ActivityType::COLLECTIVE_COMM trace activity named "xccl::<record name>",
// tags it with the communicator id and logs it.
//
// Records that outOfRange() rejects are rolled back (op counter restored,
// activity dropped) and never logged.
//
// @param activity PTI communication record to convert; must not be null.
// @param logger   Destination the resulting activity is logged to.
void XpuptiActivityProfilerSession::handleCommunicationActivity(
    const pti_view_record_comms* activity,
    ActivityLogger& logger) {
  const std::string prefixedName = "xccl::" + std::string{activity->_name};

  traceBuffer_.span.opCount += 1;
  traceBuffer_.emplace_activity(
      traceBuffer_.span, ActivityType::COLLECTIVE_COMM, prefixedName);
  auto* const entry = &*traceBuffer_.activities.back();

  // Timing and placement.
  entry->startTime = activity->_start_timestamp;
  entry->endTime = activity->_end_timestamp;
  entry->device = activity->_process_id;
  entry->resource = activity->_thread_id;
  entry->threadId = activity->_thread_id;

  entry->addMetadata("Communicator_id", activity->_communicator_id);

  if (!outOfRange(entry)) {
    entry->log(logger);
    return;
  }

  // Rejected by the range check: undo the bookkeeping done above.
  traceBuffer_.span.opCount -= 1;
  traceBuffer_.activities.pop_back();
}
#endif

void XpuptiActivityProfilerSession::handleOverheadActivity(
const pti_view_record_overhead* activity,
ActivityLogger& logger) {
Expand Down Expand Up @@ -367,6 +464,17 @@ void XpuptiActivityProfilerSession::handlePtiActivity(
handleOverheadActivity(
reinterpret_cast<const pti_view_record_overhead*>(record), logger);
break;
case PTI_VIEW_DEVICE_SYNCHRONIZATION:
handleSynchronizationActivity(
reinterpret_cast<const pti_view_record_synchronization*>(record),
logger);
break;
#if PTI_VERSION_AT_LEAST(0, 17)
case PTI_VIEW_COMMUNICATION:
handleCommunicationActivity(
reinterpret_cast<const pti_view_record_comms*>(record), logger);
break;
#endif
default:
errors_.push_back(
"Unexpected activity type: " + std::to_string(record->_view_kind));
Expand Down
16 changes: 15 additions & 1 deletion libkineto/src/plugin/xpupti/XpuptiActivityProfilerSession.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,27 @@ class XpuptiActivityProfilerSession : public libkineto::IActivityProfilerSession
using pti_view_record_api_t = pti_view_record_sycl_runtime;
#endif

std::string getApiName(const pti_view_record_api_t* activity);
// Resolves the human-readable API name for a PTI record.
//
// Works for any record type exposing _api_group and _api_id (PTI >= 0.11,
// resolved through ptiViewGetApiIdName()) or _name (older PTI versions).
//
// @param activity PTI record to name; must not be null.
// @return The API name, or an empty string if the PTI lookup left the name
//         pointer null (constructing std::string from a null pointer is
//         undefined behavior, so that case is handled explicitly).
template <typename PTI_VIEW>
std::string getApiName(const PTI_VIEW* activity) {
#if PTI_VERSION_AT_LEAST(0, 11)
  const char* api_name = nullptr;
  XPUPTI_CALL(ptiViewGetApiIdName(
      activity->_api_group, activity->_api_id, &api_name));
  // api_name starts out null; only build a string once PTI has filled it in.
  return api_name != nullptr ? std::string(api_name) : std::string();
#else
  return std::string(activity->_name);
#endif
}

template <class pti_view_memory_record_type>
void handleRuntimeKernelMemcpyMemsetActivities(ActivityType activityType,
const pti_view_memory_record_type* activity,
ActivityLogger& logger);

// Records a PTI synchronization record as an ActivityType::XPU_SYNC trace
// activity and logs it.
void handleSynchronizationActivity(const pti_view_record_synchronization* activity, ActivityLogger& logger);
#if PTI_VERSION_AT_LEAST(0, 17)
// Records a PTI communication record as an ActivityType::COLLECTIVE_COMM
// trace activity (name prefixed with "xccl::") and logs it.
// Only declared when building against PTI >= 0.17.
void handleCommunicationActivity(const pti_view_record_comms* activity, ActivityLogger& logger);
#endif
void handleOverheadActivity(const pti_view_record_overhead* activity, ActivityLogger& logger);
void handlePtiActivity(const pti_view_record_base* record, ActivityLogger& logger);

Expand Down
Loading