diff --git a/libkineto/README.md b/libkineto/README.md index fcbae1b25..d18f1c113 100644 --- a/libkineto/README.md +++ b/libkineto/README.md @@ -73,6 +73,8 @@ For more information on how to run on-demand profiling, please refer to the Dyno The default trace output is a JSON file that can be visualized in Chrome Trace Viewer or Perfetto. The trace output is generated by the `ChromeTraceLogger` instance. The `ChromeTraceLogger` writes to a JSON file using `std::ofstream` in `output_json.cpp` to maximize performance during export. This instance is created by the `ActivityProfilerController` and is stored in the `ActivityLoggerFactory` alongside its protocol. Using this schema, Kineto supports multiple trace output formats. +- Intel XCCL: to enable collection of oneCCL host events, the `INTEL_LIBITTNOTIFY64` environment variable has to be set to the path of `pti_view.so`. + ## Full documentation We strive to keep our source files readable. The best and up-to-date documentation for implementation specifics is available in the source files. diff --git a/libkineto/include/ActivityType.h b/libkineto/include/ActivityType.h index 3008425a3..bdd902f13 100644 --- a/libkineto/include/ActivityType.h +++ b/libkineto/include/ActivityType.h @@ -51,8 +51,9 @@ enum class ActivityType { PRIVATEUSE1_DRIVER = 25, // host side privateUse1 driver events XPU_SCOPE_PROFILER = 26, // XPUPTI Profiler scope for performance metrics + XPU_SYNC = 27, // XPU synchronization events - ENUM_COUNT = 27, // This is to add buffer and not used for any profiling logic. Add + ENUM_COUNT = 28, // This is to add buffer and not used for any profiling logic. Add // your new type before it. 
OPTIONAL_ACTIVITY_TYPE_START = GLOW_RUNTIME, }; @@ -97,6 +98,7 @@ inline constexpr std::array<_ActivityTypeName, activityTypeCount + 1> _activityT {"privateuse1_runtime", ActivityType::PRIVATEUSE1_RUNTIME}, {"privateuse1_driver", ActivityType::PRIVATEUSE1_DRIVER}, {"xpu_scope_profiler", ActivityType::XPU_SCOPE_PROFILER}, + {"xpu_sync", ActivityType::XPU_SYNC}, {"ENUM_COUNT", ActivityType::ENUM_COUNT}, }}; diff --git a/libkineto/src/plugin/xpupti/CMakeLists.txt b/libkineto/src/plugin/xpupti/CMakeLists.txt index c75748948..b0011e4ec 100644 --- a/libkineto/src/plugin/xpupti/CMakeLists.txt +++ b/libkineto/src/plugin/xpupti/CMakeLists.txt @@ -49,7 +49,8 @@ if(TARGET Pti::pti_view) list(APPEND XPUPTI_INCLUDE_DIR ${PTI_INCLUDE_DIR}) set(XPUPTI_INCLUDE_DIR ${XPUPTI_INCLUDE_DIR} PARENT_SCOPE) - set(XPUPTI_BUILD_FLAG "-DHAS_XPUPTI" PARENT_SCOPE) + set(XPUPTI_BUILD_FLAG "-DHAS_XPUPTI") + set(XPUPTI_BUILD_FLAG ${XPUPTI_BUILD_FLAG} PARENT_SCOPE) message(STATUS " XPU_xpupti_LIBRARY = ${XPU_xpupti_LIBRARY}") message(STATUS " XPUPTI_INCLUDE_DIR = ${XPUPTI_INCLUDE_DIR}") diff --git a/libkineto/src/plugin/xpupti/XpuptiActivityApi.cpp b/libkineto/src/plugin/xpupti/XpuptiActivityApi.cpp index c934645b5..8ab3ac069 100644 --- a/libkineto/src/plugin/xpupti/XpuptiActivityApi.cpp +++ b/libkineto/src/plugin/xpupti/XpuptiActivityApi.cpp @@ -7,6 +7,7 @@ */ #include "XpuptiActivityApi.h" +#include "Logger.h" #include #include @@ -260,6 +261,20 @@ void XpuptiActivityApi::enableXpuptiActivities( XPUPTI_CALL(ptiViewEnable(PTI_VIEW_COLLECTION_OVERHEAD)); break; +#if PTI_VERSION_AT_LEAST(0, 17) + case ActivityType::COLLECTIVE_COMM: { + auto rc = ptiViewEnable(PTI_VIEW_COMMUNICATION); + if (rc != PTI_SUCCESS) { + LOG(WARNING) << "Failed to enable PTI_VIEW_COMMUNICATION: " + << ptiResultTypeToString(rc); + } + break; + } +#endif + case ActivityType::XPU_SYNC: + XPUPTI_CALL(ptiViewEnable(PTI_VIEW_DEVICE_SYNCHRONIZATION)); + break; + default: break; } @@ -304,6 +319,20 @@ void 
XpuptiActivityApi::disablePtiActivities( XPUPTI_CALL(ptiViewDisable(PTI_VIEW_COLLECTION_OVERHEAD)); break; +#if PTI_VERSION_AT_LEAST(0, 17) + case ActivityType::COLLECTIVE_COMM: { + auto rc = ptiViewDisable(PTI_VIEW_COMMUNICATION); + if (rc != PTI_SUCCESS) { + LOG(WARNING) << "Failed to disable PTI_VIEW_COMMUNICATION: " + << ptiResultTypeToString(rc); + } + break; + } +#endif + case ActivityType::XPU_SYNC: + XPUPTI_CALL(ptiViewDisable(PTI_VIEW_DEVICE_SYNCHRONIZATION)); + break; + default: break; } diff --git a/libkineto/src/plugin/xpupti/XpuptiActivityHandlers.cpp b/libkineto/src/plugin/xpupti/XpuptiActivityHandlers.cpp index 8c2121f11..e6d3933c4 100644 --- a/libkineto/src/plugin/xpupti/XpuptiActivityHandlers.cpp +++ b/libkineto/src/plugin/xpupti/XpuptiActivityHandlers.cpp @@ -95,18 +95,6 @@ inline void XpuptiActivityProfilerSession::handleCorrelationActivity( } } -std::string XpuptiActivityProfilerSession::getApiName( - const pti_view_record_api_t* activity) { -#if PTI_VERSION_AT_LEAST(0, 11) - const char* api_name = nullptr; - XPUPTI_CALL( - ptiViewGetApiIdName(activity->_api_group, activity->_api_id, &api_name)); - return std::string(api_name); -#else - return std::string(activity->_name); -#endif -} - inline std::string memcpyName( pti_view_memcpy_type kind, pti_view_memory_type src, @@ -183,8 +171,6 @@ void XpuptiActivityProfilerSession::handleRuntimeKernelMemcpyMemsetActivities( traceBuffer_.span.opCount += 1; traceBuffer_.gpuOpCount += 1; - const ITraceActivity* linked = - linkedActivity(activity->_correlation_id, cpuCorrelationMap_); if constexpr (handleRuntimeActivities) { traceBuffer_.emplace_activity( @@ -209,11 +195,14 @@ void XpuptiActivityProfilerSession::handleRuntimeKernelMemcpyMemsetActivities( trace_activity->startTime = activity->_start_timestamp; trace_activity->endTime = activity->_end_timestamp; - trace_activity->id = activity->_correlation_id; trace_activity->threadId = activity->_thread_id; trace_activity->flow.id = 
activity->_correlation_id; trace_activity->flow.type = libkineto::kLinkAsyncCpuGpu; - trace_activity->linked = linked; + + trace_activity->id = activity->_correlation_id; + trace_activity->linked = + linkedActivity(activity->_correlation_id, cpuCorrelationMap_); + trace_activity->addMetadata("correlation", activity->_correlation_id); if constexpr (handleRuntimeActivities) { trace_activity->device = activity->_process_id; @@ -271,8 +260,6 @@ void XpuptiActivityProfilerSession::handleRuntimeKernelMemcpyMemsetActivities( "l0 queue", handleToHexString(activity->_queue_handle)); } - trace_activity->addMetadata("correlation", activity->_correlation_id); - if constexpr (handleKernelActivities) { if (activity->_source_file_name) { trace_activity->addMetadataQuoted( @@ -301,6 +288,116 @@ void XpuptiActivityProfilerSession::handleRuntimeKernelMemcpyMemsetActivities( trace_activity->log(logger); } +namespace { +std::string getStringFromSynchronizationType( + const pti_view_synchronization_type& synchronization_type) { + using pv_st = pti_view_synchronization_type; + static const std::unordered_map name_map{ + {pv_st::PTI_VIEW_SYNCHRONIZATION_TYPE_UNKNOWN, "UNKNOWN"}, + {pv_st::PTI_VIEW_SYNCHRONIZATION_TYPE_GPU_BARRIER_EXECUTION, + "GPU_BARRIER_EXECUTION"}, + {pv_st::PTI_VIEW_SYNCHRONIZATION_TYPE_GPU_BARRIER_MEMORY, + "GPU_BARRIER_MEMORY"}, + {pv_st::PTI_VIEW_SYNCHRONIZATION_TYPE_HOST_FENCE, "HOST_FENCE"}, + {pv_st::PTI_VIEW_SYNCHRONIZATION_TYPE_HOST_EVENT, "HOST_EVENT"}, + {pv_st::PTI_VIEW_SYNCHRONIZATION_TYPE_HOST_COMMAND_LIST, + "HOST_COMMAND_LIST"}, + {pv_st::PTI_VIEW_SYNCHRONIZATION_TYPE_HOST_COMMAND_QUEUE, + "HOST_COMMAND_QUEUE"}, + }; + + const auto& name_string = name_map.find(synchronization_type); + if (name_string == name_map.end()) { + const std::string error_message = + "404: Not found string literal for this synchronization type: " + + std::to_string(synchronization_type); + return error_message; + } + return name_string->second; +} +} // namespace + +void 
XpuptiActivityProfilerSession::handleSynchronizationActivity( + const pti_view_record_synchronization* activity, + ActivityLogger& logger) { + const auto& activity_record = *activity; + const auto record_name = getApiName(activity); + + const bool isGpuSync = + activity_record._synch_type == PTI_VIEW_SYNCHRONIZATION_TYPE_GPU_BARRIER_EXECUTION || + activity_record._synch_type == PTI_VIEW_SYNCHRONIZATION_TYPE_GPU_BARRIER_MEMORY; + + traceBuffer_.span.opCount += 1; + if (isGpuSync) { + traceBuffer_.gpuOpCount += 1; + } + traceBuffer_.emplace_activity(traceBuffer_.span, ActivityType::XPU_SYNC, record_name); + auto& synchronization_activity = *(traceBuffer_.activities.back()); + + synchronization_activity.startTime = activity_record._start_timestamp; + synchronization_activity.endTime = activity_record._end_timestamp; + synchronization_activity.device = -1; + synchronization_activity.resource = activity_record._thread_id; + synchronization_activity.threadId = activity_record._thread_id; + + synchronization_activity.id = activity->_correlation_id; + synchronization_activity.linked = + linkedActivity(activity->_correlation_id, cpuCorrelationMap_); + synchronization_activity.addMetadata( + "correlation", activity_record._correlation_id); + + synchronization_activity.addMetadataQuoted( + "Type", getStringFromSynchronizationType(activity_record._synch_type)); + synchronization_activity.addMetadataQuoted("Context_handle", handleToHexString(activity_record._context_handle)); + synchronization_activity.addMetadataQuoted("Queue_handle", handleToHexString(activity_record._queue_handle)); + synchronization_activity.addMetadataQuoted("Event_handle", handleToHexString(activity_record._event_handle)); + synchronization_activity.addMetadata("Number_wait_events", activity_record._number_wait_events); + synchronization_activity.addMetadata("Return_code", activity_record._return_code); + + if (outOfRange(&synchronization_activity)) { + traceBuffer_.span.opCount -= 1; + if (isGpuSync) { + 
traceBuffer_.gpuOpCount -= 1; + } + removeCorrelatedPtiActivities(&synchronization_activity); + traceBuffer_.activities.pop_back(); + return; + } + + synchronization_activity.log(logger); +} + +#if PTI_VERSION_AT_LEAST(0, 17) +void XpuptiActivityProfilerSession::handleCommunicationActivity( + const pti_view_record_comms* activity, + ActivityLogger& logger) { + const auto& activity_record = *activity; + const std::string activity_name{activity_record._name != nullptr ? activity_record._name : ""}; /* guard: constructing std::string from a null pointer is undefined behavior */ + const std::string xccl_prefix{"xccl::"}; + const auto record_name = xccl_prefix + activity_name; + + traceBuffer_.span.opCount += 1; + traceBuffer_.emplace_activity(traceBuffer_.span, ActivityType::COLLECTIVE_COMM, record_name); + auto& comms_activity = *(traceBuffer_.activities.back()); + + comms_activity.startTime = activity_record._start_timestamp; + comms_activity.endTime = activity_record._end_timestamp; + comms_activity.device = activity_record._process_id; + comms_activity.resource = activity_record._thread_id; + comms_activity.threadId = activity_record._thread_id; + + comms_activity.addMetadata("Communicator_id", activity_record._communicator_id); + + if (outOfRange(&comms_activity)) { + traceBuffer_.span.opCount -= 1; + traceBuffer_.activities.pop_back(); + return; + } + + comms_activity.log(logger); +} +#endif + void XpuptiActivityProfilerSession::handleOverheadActivity( const pti_view_record_overhead* activity, ActivityLogger& logger) { @@ -375,6 +472,17 @@ void XpuptiActivityProfilerSession::handlePtiActivity( handleOverheadActivity( reinterpret_cast(record), logger); break; + case PTI_VIEW_DEVICE_SYNCHRONIZATION: + handleSynchronizationActivity( + reinterpret_cast(record), + logger); + break; +#if PTI_VERSION_AT_LEAST(0, 17) + case PTI_VIEW_COMMUNICATION: + handleCommunicationActivity( + reinterpret_cast(record), logger); + break; +#endif default: errors_.push_back( "Unexpected activity type: " + std::to_string(record->_view_kind)); diff --git
a/libkineto/src/plugin/xpupti/XpuptiActivityProfilerSession.h b/libkineto/src/plugin/xpupti/XpuptiActivityProfilerSession.h index 0cca9df87..3d91d5923 100644 --- a/libkineto/src/plugin/xpupti/XpuptiActivityProfilerSession.h +++ b/libkineto/src/plugin/xpupti/XpuptiActivityProfilerSession.h @@ -76,13 +76,27 @@ class XpuptiActivityProfilerSession : public libkineto::IActivityProfilerSession using pti_view_record_api_t = pti_view_record_sycl_runtime; #endif - std::string getApiName(const pti_view_record_api_t* activity); + template <typename PTI_VIEW> + std::string getApiName(const PTI_VIEW* activity) { +#if PTI_VERSION_AT_LEAST(0, 11) + const char* api_name = nullptr; + XPUPTI_CALL(ptiViewGetApiIdName( + activity->_api_group, activity->_api_id, &api_name)); + return std::string(api_name != nullptr ? api_name : "unknown"); /* api_name stays null if ptiViewGetApiIdName does not populate it; std::string(nullptr) is undefined behavior */ +#else + return std::string(activity->_name); +#endif + } template void handleRuntimeKernelMemcpyMemsetActivities(ActivityType activityType, const pti_view_memory_record_type* activity, ActivityLogger& logger); + void handleSynchronizationActivity(const pti_view_record_synchronization* activity, ActivityLogger& logger); +#if PTI_VERSION_AT_LEAST(0, 17) + void handleCommunicationActivity(const pti_view_record_comms* activity, ActivityLogger& logger); +#endif void handleOverheadActivity(const pti_view_record_overhead* activity, ActivityLogger& logger); void handlePtiActivity(const pti_view_record_base* record, ActivityLogger& logger); diff --git a/libkineto/test/xpupti/CMakeLists.txt b/libkineto/test/xpupti/CMakeLists.txt index 1bc5595e7..320aa1ce0 100644 --- a/libkineto/test/xpupti/CMakeLists.txt +++ b/libkineto/test/xpupti/CMakeLists.txt @@ -6,8 +6,10 @@ set(CMAKE_CXX_STANDARD 17) -set(LINK_LIBRARIES - gtest_main +# Do not link gtest into the shared library to avoid duplicate gtest globals +# (gtest is statically linked into the executable; the shared library resolves +# gtest symbols from the executable at runtime via --export-dynamic). 
+set(COMMON_LINK_LIBRARIES kineto_base kineto_api $ @@ -15,31 +17,39 @@ set(LINK_LIBRARIES ${PTI_LIBRARY} ) +set(LINK_LIBRARIES gtest_main ${COMMON_LINK_LIBRARIES}) + add_executable(XpuptiScopeProfilerConfigTest XpuptiScopeProfilerConfigTest.cpp) target_link_libraries(XpuptiScopeProfilerConfigTest PRIVATE ${LINK_LIBRARIES}) -gtest_discover_tests(XpuptiScopeProfilerConfigTest) +gtest_add_tests(TARGET XpuptiScopeProfilerConfigTest) + +add_executable(XpuptiActivityHandlersTest XpuptiActivityHandlersTest.cpp) +target_link_libraries(XpuptiActivityHandlersTest PRIVATE ${LINK_LIBRARIES}) +gtest_add_tests(TARGET XpuptiActivityHandlersTest) include(ExternalProject) -function(make_test test_file) +function(make_sycl_test test_file) get_filename_component(test_name "${test_file}" NAME_WE) set(lib_name "${test_name}Lib") add_library(${lib_name} SHARED XpuptiTestUtilities.cpp ${test_file}) - # Do not link gtest into the shared library to avoid duplicate gtest globals - # (gtest is statically linked into the executable; the shared library resolves - # gtest symbols from the executable at runtime via --export-dynamic). - set(LIB_LINK_LIBRARIES - kineto_base - kineto_api - $ - ${SYCL_LIBRARY} - ${PTI_LIBRARY} - ) - target_link_libraries(${lib_name} PRIVATE ${LIB_LINK_LIBRARIES}) + + target_link_libraries(${lib_name} PRIVATE ${COMMON_LINK_LIBRARIES}) target_include_directories(${lib_name} PRIVATE $) - set_target_properties(${lib_name} PROPERTIES POSITION_INDEPENDENT_CODE ON) + set_target_properties(${lib_name} PROPERTIES + POSITION_INDEPENDENT_CODE ON + WINDOWS_EXPORT_ALL_SYMBOLS ON + ) + # On Windows, DLLs must resolve all symbols at link time (unlike Linux .so). + # ComputeOnXpu is defined in the compute executable (ExternalProject) which + # links back to this DLL. /FORCE:UNRESOLVED allows the DLL to build with + # this symbol unresolved. The linker will still report LNK2019 as an error + # but /FORCE overrides it (followed by LNK4088 warning) — this is expected. 
+ if(WIN32) + target_link_options(${lib_name} PRIVATE "LINKER:/FORCE:UNRESOLVED") + endif() set(CMAKE_INSTALL_PREFIX ${CMAKE_CURRENT_BINARY_DIR}) install(TARGETS ${lib_name} @@ -47,20 +57,63 @@ function(make_test test_file) RUNTIME DESTINATION bin ) - ExternalProject_Add(${test_name} - SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/compute - CMAKE_ARGS -DPROJECT_NAME=${test_name} - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${SYCL_COMPILER} - CMAKE_ARGS -DCMAKE_PARENT_BINARY_DIR=${CMAKE_BINARY_DIR} - CMAKE_ARGS -DLINK_LIBRARY=${CMAKE_CURRENT_BINARY_DIR}/lib${lib_name}.so - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_CURRENT_BINARY_DIR} - BUILD_ALWAYS TRUE + set(_link_lib_search_dir ${CMAKE_CURRENT_BINARY_DIR}) + + # Build the common cmake args for the inner ExternalProject (single-config Ninja). + set(_ep_cmake_args + -DPROJECT_NAME=${test_name} + -DCMAKE_CXX_COMPILER=${SYCL_COMPILER} + -DCMAKE_PARENT_BINARY_DIR=${CMAKE_BINARY_DIR} + -DLINK_LIBRARY_NAME=${lib_name} + -DLINK_LIBRARY_DIR=${_link_lib_search_dir} + -DCMAKE_INSTALL_PREFIX=${CMAKE_CURRENT_BINARY_DIR} ) + if(WIN32) + get_filename_component(_sycl_compiler_dir "${SYCL_COMPILER}" DIRECTORY) + get_filename_component(_sycl_root "${_sycl_compiler_dir}" DIRECTORY) + list(APPEND _ep_cmake_args + "-DCMAKE_EXE_LINKER_FLAGS=/Qoption,link,/LIBPATH:\"${_sycl_root}/lib\"" + ) + endif() + + # - Multi-config (Visual Studio, Ninja Multi-Config): uses CONFIGURE_COMMAND + # with $ since CMAKE_ARGS doesn't support generator expressions. + # Ninja is forced as a single-config inner generator. + # - Single-config on Windows: forces Ninja for SYCL/ICX compatibility. + # - Single-config on Linux: forwards the outer generator (Ninja, Makefiles, etc.) + # via CMAKE_GENERATOR. 
+ get_property(_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) + if(_is_multi_config) + ExternalProject_Add(${test_name} + SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/compute + CONFIGURE_COMMAND ${CMAKE_COMMAND} + -G Ninja + -S + -B + -DCMAKE_BUILD_TYPE=$ + ${_ep_cmake_args} + BUILD_ALWAYS TRUE + ) + else() + set(_selected_generator ${CMAKE_GENERATOR}) + if(WIN32) + set(_selected_generator Ninja) + endif() + ExternalProject_Add(${test_name} + SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/compute + CMAKE_GENERATOR ${_selected_generator} + CMAKE_ARGS + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + ${_ep_cmake_args} + BUILD_ALWAYS TRUE + ) + endif() add_dependencies(${test_name} ${lib_name}) add_dependencies(${test_name} gtest) add_dependencies(${test_name} gtest_main) endfunction() -make_test(XpuptiProfilerTest.cpp) -make_test(XpuptiScopeProfilerTest.cpp) +make_sycl_test(XpuptiProfilerTest.cpp) +make_sycl_test(XpuptiScopeProfilerTest.cpp) + diff --git a/libkineto/test/xpupti/XpuptiActivityHandlersTest.cpp b/libkineto/test/xpupti/XpuptiActivityHandlersTest.cpp new file mode 100644 index 000000000..7f9545524 --- /dev/null +++ b/libkineto/test/xpupti/XpuptiActivityHandlersTest.cpp @@ -0,0 +1,320 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "src/plugin/xpupti/XpuptiActivityApi.h" +#include "src/plugin/xpupti/XpuptiActivityProfilerSession.h" +#include "src/ActivityBuffers.h" +#include "include/output_base.h" + +#include "src/plugin/xpupti/XpuptiProfilerMacros.h" + +#include + +namespace KN = KINETO_NAMESPACE; +using namespace libkineto; + +// Mock XpuptiActivityApi that delivers hand-crafted PTI records +// through the virtual processActivities without needing PTI runtime. 
+class MockXpuptiActivityApi : public KN::XpuptiActivityApi { + public: + std::vector records; + + std::unique_ptr activityBuffers() override { + // Return a non-null map so processTrace enters the processing path. + return std::make_unique(); + } + + const std::pair processActivities( + KN::XpuptiActivityBufferMap&, + std::function handler) override { + for (auto* record : records) { + handler(record); + } + return {static_cast(records.size()), 0}; + } +}; + +// Minimal ActivityLogger that captures logged GenericTraceActivity objects. +class MockActivityLogger : public ActivityLogger { + public: + std::vector logged_activities; + + void handleDeviceInfo(const DeviceInfo&, uint64_t) override {} + void handleResourceInfo(const ResourceInfo&, int64_t) override {} + void handleOverheadInfo(const OverheadInfo&, int64_t) override {} + void handleTraceSpan(const TraceSpan&) override {} + + void handleActivity(const ITraceActivity&) override {} + + void handleGenericActivity(const GenericTraceActivity& activity) override { + logged_activities.push_back(&activity); + } + + void handleTraceStart( + const std::unordered_map&, + const std::string&) override {} + + void finalizeMemoryTrace(const std::string&, const Config&) override {} + + void finalizeTrace( + const Config&, + std::unique_ptr, + int64_t, + std::unordered_map>&) override {} +}; + +class XpuptiActivityHandlersTest : public ::testing::Test { + protected: + MockXpuptiActivityApi mockApi_; + MockActivityLogger logger_; + + // Processes all records in mockApi_ through the handler pipeline + // and returns the resulting trace buffer. 
+ std::unique_ptr processAndGetTrace( + int64_t windowStart = 0, + int64_t windowEnd = 1000) { + Config config; + std::set activity_types = {ActivityType::COLLECTIVE_COMM, ActivityType::XPU_SYNC}; + auto session = std::make_unique( + mockApi_, "__test_profiler__", config, activity_types); + session->processTrace( + logger_, + [](int64_t) -> const ITraceActivity* { return nullptr; }, + windowStart, + windowEnd); + return session->getTraceBuffer(); + } +}; + +// --- Communication Activity Tests --- + +#if PTI_VERSION_AT_LEAST(0, 17) +TEST_F(XpuptiActivityHandlersTest, CommunicationActivityHasXcclPrefix) { + pti_view_record_comms comms_record{}; + comms_record._view_kind._view_kind = PTI_VIEW_COMMUNICATION; + comms_record._name = "allreduce"; + comms_record._start_timestamp = 100; + comms_record._end_timestamp = 200; + comms_record._process_id = 1; + comms_record._thread_id = 42; + comms_record._communicator_id = 7; + + mockApi_.records.push_back( + reinterpret_cast(&comms_record)); + + auto traceBuffer = processAndGetTrace(); + ASSERT_EQ(traceBuffer->activities.size(), 1); + + auto& activity = *traceBuffer->activities[0]; + EXPECT_EQ(activity.name(), "xccl::allreduce"); + EXPECT_EQ(activity.type(), ActivityType::COLLECTIVE_COMM); +} + +TEST_F(XpuptiActivityHandlersTest, CommunicationActivityFields) { + pti_view_record_comms comms_record{}; + comms_record._view_kind._view_kind = PTI_VIEW_COMMUNICATION; + comms_record._name = "broadcast"; + comms_record._start_timestamp = 300; + comms_record._end_timestamp = 500; + comms_record._process_id = 10; + comms_record._thread_id = 77; + comms_record._communicator_id = 99; + + mockApi_.records.push_back( + reinterpret_cast(&comms_record)); + + auto traceBuffer = processAndGetTrace(); + ASSERT_EQ(traceBuffer->activities.size(), 1); + + auto& activity = *traceBuffer->activities[0]; + EXPECT_EQ(activity.timestamp(), 300); + EXPECT_EQ(activity.duration(), 200); + EXPECT_EQ(activity.deviceId(), 10); + 
EXPECT_EQ(activity.resourceId(), 77); + EXPECT_EQ(activity.getThreadId(), 77); + EXPECT_EQ(activity.getMetadataValue("Communicator_id"), "99"); +} + +TEST_F(XpuptiActivityHandlersTest, CommunicationActivityOutOfRange) { + pti_view_record_comms comms_record{}; + comms_record._view_kind._view_kind = PTI_VIEW_COMMUNICATION; + comms_record._name = "allgather"; + comms_record._start_timestamp = 2000; + comms_record._end_timestamp = 3000; + comms_record._process_id = 1; + comms_record._thread_id = 1; + comms_record._communicator_id = 1; + + mockApi_.records.push_back( + reinterpret_cast(&comms_record)); + + auto traceBuffer = processAndGetTrace(100, 500); + EXPECT_EQ(traceBuffer->activities.size(), 0); +} +#endif // PTI_VERSION_AT_LEAST(0, 17) + +// --- Synchronization Activity Tests --- + +TEST_F(XpuptiActivityHandlersTest, SynchronizationActivityDeviceIsNegativeOne) { + pti_view_record_synchronization sync_record{}; + sync_record._view_kind._view_kind = PTI_VIEW_DEVICE_SYNCHRONIZATION; + sync_record._synch_type = PTI_VIEW_SYNCHRONIZATION_TYPE_HOST_EVENT; + sync_record._start_timestamp = 100; + sync_record._end_timestamp = 200; + sync_record._thread_id = 55; + sync_record._correlation_id = 1; + sync_record._api_id = 84; // zeEventHostSynchronize_id + sync_record._api_group = static_cast(1); // PTI_API_GROUP_LEVELZERO + + mockApi_.records.push_back( + reinterpret_cast(&sync_record)); + + auto traceBuffer = processAndGetTrace(); + ASSERT_EQ(traceBuffer->activities.size(), 1); + + auto& activity = *traceBuffer->activities[0]; + EXPECT_EQ(activity.deviceId(), -1); + EXPECT_EQ(activity.type(), ActivityType::XPU_SYNC); +} + +TEST_F(XpuptiActivityHandlersTest, SynchronizationActivityMetadata) { + pti_view_record_synchronization sync_record{}; + sync_record._view_kind._view_kind = PTI_VIEW_DEVICE_SYNCHRONIZATION; + sync_record._synch_type = PTI_VIEW_SYNCHRONIZATION_TYPE_HOST_FENCE; + sync_record._context_handle = nullptr; + sync_record._queue_handle = nullptr; + 
sync_record._event_handle = nullptr; + sync_record._start_timestamp = 400; + sync_record._end_timestamp = 600; + sync_record._thread_id = 88; + sync_record._correlation_id = 5; + sync_record._number_wait_events = 3; + sync_record._return_code = 0; + sync_record._api_id = 84; // zeEventHostSynchronize_id + sync_record._api_group = static_cast(1); // PTI_API_GROUP_LEVELZERO + + mockApi_.records.push_back( + reinterpret_cast(&sync_record)); + + auto traceBuffer = processAndGetTrace(); + ASSERT_EQ(traceBuffer->activities.size(), 1); + + auto& activity = *traceBuffer->activities[0]; + EXPECT_EQ(activity.timestamp(), 400); + EXPECT_EQ(activity.duration(), 200); + EXPECT_EQ(activity.resourceId(), 88); + EXPECT_EQ(activity.getMetadataValue("Type"), "HOST_FENCE"); + EXPECT_EQ(activity.getMetadataValue("Number_wait_events"), "3"); + EXPECT_EQ(activity.getMetadataValue("Return_code"), "0"); + EXPECT_EQ(activity.getMetadataValue("correlation"), "5"); +} + +TEST_F(XpuptiActivityHandlersTest, SynchronizationAllTypes) { + struct SyncTypeTestCase { + pti_view_synchronization_type type; + std::string expected_name; + }; + std::vector cases = { + {PTI_VIEW_SYNCHRONIZATION_TYPE_UNKNOWN, "UNKNOWN"}, + {PTI_VIEW_SYNCHRONIZATION_TYPE_GPU_BARRIER_EXECUTION, + "GPU_BARRIER_EXECUTION"}, + {PTI_VIEW_SYNCHRONIZATION_TYPE_GPU_BARRIER_MEMORY, + "GPU_BARRIER_MEMORY"}, + {PTI_VIEW_SYNCHRONIZATION_TYPE_HOST_FENCE, "HOST_FENCE"}, + {PTI_VIEW_SYNCHRONIZATION_TYPE_HOST_EVENT, "HOST_EVENT"}, + {PTI_VIEW_SYNCHRONIZATION_TYPE_HOST_COMMAND_LIST, "HOST_COMMAND_LIST"}, + {PTI_VIEW_SYNCHRONIZATION_TYPE_HOST_COMMAND_QUEUE, + "HOST_COMMAND_QUEUE"}, + }; + + for (const auto& tc : cases) { + mockApi_.records.clear(); + + pti_view_record_synchronization sync_record{}; + sync_record._view_kind._view_kind = PTI_VIEW_DEVICE_SYNCHRONIZATION; + sync_record._synch_type = tc.type; + sync_record._start_timestamp = 100; + sync_record._end_timestamp = 200; + sync_record._thread_id = 1; + sync_record._correlation_id = 1; 
+ sync_record._api_id = 84; // zeEventHostSynchronize_id + sync_record._api_group = static_cast(1); // PTI_API_GROUP_LEVELZERO + + mockApi_.records.push_back( + reinterpret_cast(&sync_record)); + + auto traceBuffer = processAndGetTrace(); + ASSERT_EQ(traceBuffer->activities.size(), 1) + << "Failed for type: " << tc.expected_name; + + auto& activity = *traceBuffer->activities[0]; + EXPECT_EQ(activity.getMetadataValue("Type"), tc.expected_name) + << "Wrong string for synchronization type " << tc.type; + } +} + +TEST_F(XpuptiActivityHandlersTest, SynchronizationActivityOutOfRange) { + pti_view_record_synchronization sync_record{}; + sync_record._view_kind._view_kind = PTI_VIEW_DEVICE_SYNCHRONIZATION; + sync_record._synch_type = PTI_VIEW_SYNCHRONIZATION_TYPE_HOST_FENCE; + sync_record._start_timestamp = 50; + sync_record._end_timestamp = 80; + sync_record._thread_id = 1; + sync_record._correlation_id = 1; + sync_record._api_id = 84; // zeEventHostSynchronize_id + sync_record._api_group = static_cast(1); // PTI_API_GROUP_LEVELZERO + + mockApi_.records.push_back( + reinterpret_cast(&sync_record)); + + auto traceBuffer = processAndGetTrace(100, 500); + EXPECT_EQ(traceBuffer->activities.size(), 0); +} + +// --- Mixed dispatch test --- + +#if PTI_VERSION_AT_LEAST(0, 17) +TEST_F(XpuptiActivityHandlersTest, MixedCommunicationAndSynchronization) { + pti_view_record_comms comms_record{}; + comms_record._view_kind._view_kind = PTI_VIEW_COMMUNICATION; + comms_record._name = "reduce_scatter"; + comms_record._start_timestamp = 100; + comms_record._end_timestamp = 200; + comms_record._process_id = 1; + comms_record._thread_id = 10; + comms_record._communicator_id = 5; + + pti_view_record_synchronization sync_record{}; + sync_record._view_kind._view_kind = PTI_VIEW_DEVICE_SYNCHRONIZATION; + sync_record._synch_type = PTI_VIEW_SYNCHRONIZATION_TYPE_GPU_BARRIER_EXECUTION; + sync_record._start_timestamp = 300; + sync_record._end_timestamp = 400; + sync_record._thread_id = 20; + 
sync_record._correlation_id = 2; + sync_record._api_id = 84; // zeEventHostSynchronize_id + sync_record._api_group = static_cast(1); // PTI_API_GROUP_LEVELZERO + + mockApi_.records.push_back( + reinterpret_cast(&comms_record)); + mockApi_.records.push_back( + reinterpret_cast(&sync_record)); + + auto traceBuffer = processAndGetTrace(); + ASSERT_EQ(traceBuffer->activities.size(), 2); + + auto& comms_activity = *traceBuffer->activities[0]; + EXPECT_EQ(comms_activity.name(), "xccl::reduce_scatter"); + EXPECT_EQ(comms_activity.type(), ActivityType::COLLECTIVE_COMM); + + auto& sync_activity = *traceBuffer->activities[1]; + EXPECT_EQ(sync_activity.deviceId(), -1); + EXPECT_EQ(sync_activity.type(), ActivityType::XPU_SYNC); + EXPECT_EQ( + sync_activity.getMetadataValue("Type"), "GPU_BARRIER_EXECUTION"); +} +#endif // PTI_VERSION_AT_LEAST(0, 17) diff --git a/libkineto/test/xpupti/XpuptiTestUtilities.cpp b/libkineto/test/xpupti/XpuptiTestUtilities.cpp index a72d0a997..f490ce305 100644 --- a/libkineto/test/xpupti/XpuptiTestUtilities.cpp +++ b/libkineto/test/xpupti/XpuptiTestUtilities.cpp @@ -8,6 +8,7 @@ #include "XpuptiTestUtilities.h" +#include "src/ActivityBuffers.h" #include "src/plugin/xpupti/XpuptiActivityProfiler.h" #include @@ -88,9 +89,9 @@ void CheckCountsInMap( } EXPECT_EQ(countsMap.size(), expMap.size()); - - for (auto itCountsMap = countsMap.begin(), itExpArray = expMap.begin(); - (itCountsMap != countsMap.end()) && (itExpArray != expMap.end()); + auto itCountsMap = countsMap.begin(); + auto itExpArray = expMap.begin(); + for (; (itCountsMap != countsMap.end()) && (itExpArray != expMap.end()); ++itCountsMap, ++itExpArray) { EXPECT_EQ(itCountsMap->first, itExpArray->first * repeatCount); EXPECT_EQ(itCountsMap->second, itExpArray->second); diff --git a/libkineto/test/xpupti/compute/CMakeLists.txt b/libkineto/test/xpupti/compute/CMakeLists.txt index 889c94453..e963a8350 100644 --- a/libkineto/test/xpupti/compute/CMakeLists.txt +++ 
b/libkineto/test/xpupti/compute/CMakeLists.txt @@ -23,7 +23,7 @@ function(find_libraries) find_library(_FIND_LIB NAMES ${lib} PATHS ${CMAKE_PARENT_BINARY_DIR} - PATH_SUFFIXES lib + PATH_SUFFIXES lib/${CMAKE_BUILD_TYPE} lib NO_DEFAULT_PATH ) @@ -40,16 +40,28 @@ find_libraries(gtest gtest_main) find_package(Threads REQUIRED) +find_library(_LINK_LIB + NAMES ${LINK_LIBRARY_NAME} + PATHS ${LINK_LIBRARY_DIR} + PATH_SUFFIXES ${CMAKE_BUILD_TYPE} "" + NO_DEFAULT_PATH +) +if(NOT _LINK_LIB) + message(FATAL_ERROR "Library ${LINK_LIBRARY_NAME} not found in ${LINK_LIBRARY_DIR}") +endif() + add_executable(${PROJECT_NAME} XpuptiScopeProfilerCompute.cpp) target_compile_options(${PROJECT_NAME} PRIVATE -fsycl) target_link_options(${PROJECT_NAME} PRIVATE -fsycl) # Export gtest symbols so the shared library can resolve them at runtime -target_link_options(${PROJECT_NAME} PRIVATE -Wl,--export-dynamic) +if(NOT WIN32) + target_link_options(${PROJECT_NAME} PRIVATE -Wl,--export-dynamic) +endif() target_link_libraries(${PROJECT_NAME} PRIVATE ${gtest_main_FOR_XPU_PATH} ${gtest_FOR_XPU_PATH} - ${LINK_LIBRARY} + ${_LINK_LIB} Threads::Threads )