From e2028d8e35dd6e12561b7f603feafa8e3be24b4d Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 2 Apr 2026 00:59:22 +0800 Subject: [PATCH 01/11] Pass the member context. --- src/data/extmem_quantile_dmatrix.cc | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/data/extmem_quantile_dmatrix.cc b/src/data/extmem_quantile_dmatrix.cc index fad3240f5b65..d2e05da3ca0d 100644 --- a/src/data/extmem_quantile_dmatrix.cc +++ b/src/data/extmem_quantile_dmatrix.cc @@ -35,21 +35,23 @@ ExtMemQuantileDMatrix::ExtMemQuantileDMatrix(DataIterHandle iter_handle, DMatrix CHECK(valid) << "Qauntile DMatrix must have at least 1 batch."; auto pctx = MakeProxy(proxy)->Ctx(); - Context ctx; - ctx.Init(Args{{"nthread", std::to_string(config.n_threads)}, {"device", pctx->DeviceName()}}); + { + Context ctx; + ctx.Init(Args{{"nthread", std::to_string(config.n_threads)}, {"device", pctx->DeviceName()}}); + this->fmat_ctx_ = ctx; + } BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()}; - if (ctx.IsCPU()) { + if (fmat_ctx_.IsCPU()) { CHECK(detail::HostRatioIsAuto(config.cache_host_ratio)) << error::CacheHostRatioNotImpl(); - this->InitFromCPU(&ctx, iter, proxy, p, config.missing, ref); + this->InitFromCPU(&fmat_ctx_, iter, proxy, p, config.missing, ref); } else { p.n_prefetch_batches = ::xgboost::cuda_impl::DftPrefetchBatches(); - this->InitFromCUDA(&ctx, iter, proxy, p, ref, config); + this->InitFromCUDA(&fmat_ctx_, iter, proxy, p, ref, config); } this->batch_ = p; - this->fmat_ctx_ = ctx; - SyncCategories(&ctx, info_.Cats(), info_.num_row_ == 0); + SyncCategories(&fmat_ctx_, info_.Cats(), info_.num_row_ == 0); } ExtMemQuantileDMatrix::~ExtMemQuantileDMatrix() { From d9fbab9a3cb7680b600c4b7f0f23b3651a5bdc12 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 2 Apr 2026 02:04:19 +0800 Subject: [PATCH 02/11] Use context in `ExtMemQuantileDMatrix`. --- src/data/ellpack_page.cc | 2 +- src/data/ellpack_page.cu | 12 +++++-- src/data/ellpack_page.cuh | 5 +-- src/data/ellpack_page.h | 4 +-- src/data/ellpack_page_raw_format.cu | 31 +++++++++---------- src/data/ellpack_page_raw_format.h | 7 +++-- src/data/ellpack_page_source.cu | 30 +++++++++--------- src/data/ellpack_page_source.h | 16 ++++++---- src/data/gradient_index_page_source.h | 1 + src/data/iterative_dmatrix.cc | 2 +- src/data/iterative_dmatrix.cu | 16 +++++----- src/data/iterative_dmatrix.h | 3 +- src/data/sparse_page_source.h | 3 +- .../cpp/data/test_ellpack_page_raw_format.cu | 10 +++--- tests/cpp/data/test_iterative_dmatrix.cu | 2 +- 15 files changed, 79 insertions(+), 65 deletions(-) diff --git a/src/data/ellpack_page.cc b/src/data/ellpack_page.cc index 6b0a81c169ff..77c2798d76c4 100644 --- a/src/data/ellpack_page.cc +++ b/src/data/ellpack_page.cc @@ -20,7 +20,7 @@ class EllpackPageImpl { [[nodiscard]] std::shared_ptr CutsShared() const { return cuts_; } }; -EllpackPage::EllpackPage() = default; +EllpackPage::EllpackPage(Context const*) = default; EllpackPage::EllpackPage(Context const*, DMatrix*, const BatchParam&) { LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but " diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 60db1ab7b9e2..643db6ed6084 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -18,7 +18,7 @@ #include "../common/compressed_iterator.h" // for CompressedIterator #include "../common/cuda_context.cuh" // for CUDAContext #include "../common/cuda_rt_utils.h" // for SetDevice -#include "../common/cuda_stream.h" // for DefaultStream +#include "../common/cuda_stream.h" // for StreamRef #include "../common/hist_util.cuh" // for HistogramCuts #include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc #include "../common/transform_iterator.h" // for MakeIndexTransformIter @@ -30,7 +30,9 @@ #include "xgboost/data.h" // for DMatrix namespace xgboost { -EllpackPage::EllpackPage() : impl_{new EllpackPageImpl{}} {} +EllpackPage::EllpackPage(Context const* ctx) : impl_{new EllpackPageImpl{ctx}} {} + +EllpackPageImpl::EllpackPageImpl(Context const* ctx) : ctx_{ctx} {} EllpackPage::EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param) : impl_{new EllpackPageImpl{ctx, dmat, param}} {} @@ -187,6 +189,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, : is_dense{is_dense}, n_rows{n_rows}, cuts_{std::move(cuts)}, + ctx_{ctx}, info{CalcNumSymbols(ctx, row_stride, is_dense, this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -202,6 +205,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, : is_dense{is_dense}, n_rows{page.Size()}, cuts_{std::move(cuts)}, + ctx_{ctx}, info{CalcNumSymbols(ctx, row_stride, is_dense, this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -221,6 +225,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const Batc common::DeviceSketch(ctx, p_fmat, param.max_bin)) : std::make_shared( common::DeviceSketchWithHessian(ctx, p_fmat, param.max_bin, param.hess))}, + ctx_{ctx}, info{CalcNumSymbols(ctx, GetRowStride(p_fmat), p_fmat->IsDense(), this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -465,6 +470,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag cuts->SetDevice(ctx->Device()); return cuts; }()}, + ctx_{ctx}, info{CalcNumSymbols( ctx, [&] { @@ -502,7 +508,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag EllpackPageImpl::~EllpackPageImpl() noexcept(false) { // Sync the stream to make sure all running CUDA kernels finish before deallocation. - auto status = curt::DefaultStream().Sync(false); + auto status = ctx_->CUDACtx()->Stream().Sync(false); if (status != cudaSuccess) { auto str = cudaGetErrorString(status); // For external-memory, throwing here can trigger a series of calls to diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index 28dc410211ea..fabe2db74a24 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -1,5 +1,5 @@ /** - * Copyright 2019-2025, XGBoost Contributors + * Copyright 2019-2026, XGBoost Contributors */ #ifndef XGBOOST_DATA_ELLPACK_PAGE_CUH_ #define XGBOOST_DATA_ELLPACK_PAGE_CUH_ @@ -186,7 +186,7 @@ class EllpackPageImpl { * This is used in the external memory case. An empty ELLPACK page is constructed with its content * set later by the reader. */ - EllpackPageImpl() = default; + explicit EllpackPageImpl(Context const* ctx); /** * @brief Constructor from existing ellpack matrics. @@ -357,6 +357,7 @@ class EllpackPageImpl { void InitCompressedData(Context const* ctx); std::shared_ptr cuts_; + Context const* ctx_{nullptr}; public: bool is_dense{false}; diff --git a/src/data/ellpack_page.h b/src/data/ellpack_page.h index 8b0facd227d8..dcaf02fbb672 100644 --- a/src/data/ellpack_page.h +++ b/src/data/ellpack_page.h @@ -1,5 +1,5 @@ /** - * Copyright 2017-2023 by XGBoost Contributors + * Copyright 2017-2026, XGBoost Contributors */ #ifndef XGBOOST_DATA_ELLPACK_PAGE_H_ #define XGBOOST_DATA_ELLPACK_PAGE_H_ @@ -28,7 +28,7 @@ class EllpackPage { * This is used in the external memory case. An empty ELLPACK page is constructed with its content * set later by the reader. */ - EllpackPage(); + explicit EllpackPage(Context const* ctx); /** * @brief Constructor from an existing DMatrix. * diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu index a5a2b3748100..d4765743779c 100644 --- a/src/data/ellpack_page_raw_format.cu +++ b/src/data/ellpack_page_raw_format.cu @@ -1,12 +1,12 @@ /** - * Copyright 2019-2025, XGBoost contributors + * Copyright 2019-2026, XGBoost contributors */ #include #include // for size_t #include // for vector -#include "../common/cuda_rt_utils.h" +#include "../common/cuda_context.cuh" // for CUDAContext #include "../common/cuda_stream.h" // for Event #include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream #include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc @@ -21,7 +21,7 @@ DMLC_REGISTRY_FILE_TAG(ellpack_page_raw_format); namespace { // Function to support system without HMM or ATS template -[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi, +[[nodiscard]] bool ReadDeviceVec(Context const* ctx, common::AlignedResourceReadStream* fi, common::RefResourceView* vec) { xgboost_NVTX_FN_RANGE(); @@ -42,7 +42,7 @@ template *vec = common::MakeFixedVecWithCudaMalloc(n); dh::safe_cuda( - cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, curt::DefaultStream())); + cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, ctx->CUDACtx()->Stream())); return true; } } // namespace @@ -62,7 +62,7 @@ template RET_IF_NOT(fi->Read(&impl->info.row_stride)); if (this->param_.prefetch_copy || !has_hmm_ats_) { - RET_IF_NOT(ReadDeviceVec(fi, &impl->gidx_buffer)); + RET_IF_NOT(ReadDeviceVec(ctx_, fi, &impl->gidx_buffer)); } else { RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer)); } @@ -73,7 +73,7 @@ template impl->SetCuts(this->cuts_); - curt::DefaultStream().Sync(); + ctx_->CUDACtx()->Stream().Sync(); return true; } @@ -87,14 +87,13 @@ template bytes += fo->Write(impl->is_dense); bytes += fo->Write(impl->info.row_stride); std::vector h_gidx_buffer; - Context ctx = Context{}.MakeCUDA(curt::CurrentDevice()); // write data into the h_gidx_buffer - [[maybe_unused]] auto h_accessor = impl->GetHostEllpack(&ctx, &h_gidx_buffer); + [[maybe_unused]] auto h_accessor = impl->GetHostEllpack(ctx_, &h_gidx_buffer); bytes += common::WriteVec(fo, h_gidx_buffer); bytes += fo->Write(impl->base_rowid); bytes += fo->Write(impl->NumSymbols()); - curt::DefaultStream().Sync(); + ctx_->CUDACtx()->Stream().Sync(); return bytes; } @@ -104,21 +103,21 @@ template auto* impl = page->Impl(); CHECK(this->cuts_->cut_values_.DeviceCanRead()); - auto ctx = Context{}.MakeCUDA(curt::CurrentDevice()); + auto stream = ctx_->CUDACtx()->Stream(); auto dispatch = [&] { - fi->Read(&ctx, page, this->param_.prefetch_copy || !this->has_hmm_ats_); + fi->Read(ctx_, page, this->param_.prefetch_copy || !this->has_hmm_ats_); impl->SetCuts(this->cuts_); }; if (ConsoleLogger::GlobalVerbosity() == ConsoleLogger::LogVerbosity::kDebug) { curt::Event start{false}, stop{false}; float milliseconds = 0; - start.Record(ctx.CUDACtx()->Stream()); + start.Record(stream); dispatch(); - stop.Record(ctx.CUDACtx()->Stream()); + stop.Record(stream); stop.Sync(); dh::safe_cuda(cudaEventElapsedTime(&milliseconds, start, stop)); double n_bytes = page->Impl()->MemCostBytes(); @@ -128,7 +127,7 @@ template dispatch(); } - curt::DefaultStream().Sync(); + stream.Sync(); return true; } @@ -137,8 +136,8 @@ template EllpackHostCacheStream* fo) const { xgboost_NVTX_FN_RANGE_C(3, 252, 198); - bool new_page = fo->Write(page); - curt::DefaultStream().Sync(); + bool new_page = fo->Write(ctx_, page); + ctx_->CUDACtx()->Stream().Sync(); if (new_page) { auto cache = fo->Share(); diff --git a/src/data/ellpack_page_raw_format.h b/src/data/ellpack_page_raw_format.h index eda0e1d20978..74d44a8bd99d 100644 --- a/src/data/ellpack_page_raw_format.h +++ b/src/data/ellpack_page_raw_format.h @@ -29,14 +29,17 @@ class EllpackPageRawFormat : public SparsePageFormat { BatchParam param_; // Supports CUDA HMM or ATS bool has_hmm_ats_{false}; + Context const* ctx_; public: - explicit EllpackPageRawFormat(std::shared_ptr cuts, DeviceOrd device, + explicit EllpackPageRawFormat(Context const* ctx, + std::shared_ptr cuts, DeviceOrd device, BatchParam param, bool has_hmm_ats) : cuts_{std::move(cuts)}, device_{device}, param_{std::move(param)}, - has_hmm_ats_{has_hmm_ats} {} + has_hmm_ats_{has_hmm_ats}, + ctx_{ctx} {} [[nodiscard]] bool Read(EllpackPage* page, common::AlignedResourceReadStream* fi) override; [[nodiscard]] std::size_t Write(EllpackPage const& page, common::AlignedFileWriteStream* fo) override; diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 5cddd94996da..cc0731021f39 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -145,9 +145,8 @@ class EllpackHostCacheStreamImpl { ptr_ = k; } - [[nodiscard]] bool Write(EllpackPage const& page) { + [[nodiscard]] bool Write(Context const* ctx, EllpackPage const& page) { auto impl = page.Impl(); - auto ctx = Context{}.MakeCUDA(dh::CurrentDevice()); this->cache_->sizes_orig.push_back(page.Impl()->MemCostBytes()); auto orig_ptr = this->cache_->sizes_orig.size() - 1; @@ -184,7 +183,7 @@ class EllpackHostCacheStreamImpl { // Finish writing a (concatenated) cache page. auto commit_page = [&](EllpackPageImpl const* old_impl) { CHECK_EQ(old_impl->gidx_buffer.Resource()->Type(), common::ResourceHandler::kCudaMalloc); - auto new_impl = std::make_unique(); + auto new_impl = std::make_unique(ctx); new_impl->CopyInfo(old_impl); // Split the cache into host cache, compressed host cache, and the device cache. We @@ -219,10 +218,10 @@ class EllpackHostCacheStreamImpl { dc::CuMemParams c_out; std::size_t constexpr kChunkSize = 1ul << 21; auto params = dc::CompressSnappy( - &ctx, old_impl->gidx_buffer.ToSpan().subspan(n_h_bytes, n_comp_bytes), &tmp, kChunkSize); + ctx, old_impl->gidx_buffer.ToSpan().subspan(n_h_bytes, n_comp_bytes), &tmp, kChunkSize); common::RefResourceView c_buf = dc::CoalesceCompressedBuffersToHost( - ctx.CUDACtx()->Stream(), this->cache_->pool, params, tmp, &c_out); - auto c_page = dc::MakeSnappyDecomprMgr(ctx.CUDACtx()->Stream(), this->cache_->pool, + ctx->CUDACtx()->Stream(), this->cache_->pool, params, tmp, &c_out); + auto c_page = dc::MakeSnappyDecomprMgr(ctx->CUDACtx()->Stream(), this->cache_->pool, std::move(c_out), c_buf.ToSpan()); CHECK_EQ(c_page.DecompressedBytes() + new_impl->gidx_buffer.size_bytes(), n_bytes); @@ -264,13 +263,13 @@ class EllpackHostCacheStreamImpl { // Push a new page auto n_bytes = this->cache_->buffer_bytes.at(this->cache_->h_pages.size()); auto n_samples = this->cache_->buffer_rows.at(this->cache_->h_pages.size()); - auto new_impl = std::make_unique(&ctx, impl->CutsShared(), impl->IsDense(), + auto new_impl = std::make_unique(ctx, impl->CutsShared(), impl->IsDense(), impl->info.row_stride, n_samples); new_impl->SetBaseRowId(impl->base_rowid); new_impl->SetNumSymbols(impl->NumSymbols()); new_impl->gidx_buffer = - common::MakeFixedVecWithCudaMalloc(&ctx, n_bytes, 0); - auto offset = new_impl->Copy(&ctx, impl, 0); + common::MakeFixedVecWithCudaMalloc(ctx, n_bytes, 0); + auto offset = new_impl->Copy(ctx, impl, 0); this->cache_->offsets.push_back(offset); @@ -284,7 +283,7 @@ class EllpackHostCacheStreamImpl { CHECK(!this->cache_->h_pages.empty()); CHECK_EQ(cache_idx, this->cache_->h_pages.size() - 1); auto& new_impl = this->cache_->h_pages.back(); - auto offset = new_impl->Copy(&ctx, impl, this->cache_->offsets.back()); + auto offset = new_impl->Copy(ctx, impl, this->cache_->offsets.back()); this->cache_->offsets.back() += offset; } @@ -382,8 +381,8 @@ void EllpackHostCacheStream::Read(Context const* ctx, EllpackPage* page, bool pr this->p_impl_->Read(ctx, page, prefetch_copy); } -[[nodiscard]] bool EllpackHostCacheStream::Write(EllpackPage const& page) { - return this->p_impl_->Write(page); +[[nodiscard]] bool EllpackHostCacheStream::Write(Context const* ctx, EllpackPage const& page) { + return this->p_impl_->Write(ctx, page); } /** @@ -528,13 +527,12 @@ void EllpackPageSourceImpl::Fetch() { // This is not read from cache so we still need it to be synced with sparse page source. CHECK_EQ(this->Iter(), this->source_->Iter()); auto const& csr = this->source_->Page(); - this->page_.reset(new EllpackPage{}); + this->page_.reset(new EllpackPage{this->Ctx()}); auto* impl = this->page_->Impl(); - Context ctx = Context{}.MakeCUDA(this->Device().ordinal); if (this->GetCuts()->HasCategorical()) { CHECK(!this->feature_types_.empty()); } - *impl = EllpackPageImpl{&ctx, this->GetCuts(), *csr, is_dense_, row_stride_, feature_types_}; + *impl = EllpackPageImpl{this->Ctx(), this->GetCuts(), *csr, is_dense_, row_stride_, feature_types_}; this->page_->SetBaseRowId(csr->base_rowid); LOG(INFO) << "Generated an Ellpack page with size: " << common::HumanMemUnit(impl->MemCostBytes()) @@ -573,7 +571,7 @@ void ExtEllpackPageSourceImpl::Fetch() { bst_idx_t row_stride = GetRowCounts(this->ctx_, value, row_counts_span, dh::GetDevice(this->ctx_), this->missing_); CHECK_LE(row_stride, this->ext_info_.row_stride); - this->page_.reset(new EllpackPage{}); + this->page_.reset(new EllpackPage{this->ctx_}); *this->page_->Impl() = EllpackPageImpl{this->ctx_, value, this->missing_, diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index 9a657be4d0d6..9613532006ae 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -164,7 +164,7 @@ class EllpackHostCacheStream { * @return Whether a new cache page is create. False if the new page is appended to the * previous one. */ - [[nodiscard]] bool Write(EllpackPage const& page); + [[nodiscard]] bool Write(Context const* ctx, EllpackPage const& page); }; namespace detail { @@ -177,6 +177,7 @@ class EllpackFormatPolicy { std::shared_ptr cuts_{nullptr}; DeviceOrd device_; bool has_hmm_{curt::SupportsPageableMem()}; + Context const* ctx_{nullptr}; EllpackCacheInfo cache_info_; static_assert(std::is_same_v); @@ -214,11 +215,12 @@ class EllpackFormatPolicy { [[nodiscard]] auto CreatePageFormat(BatchParam const& param) const { CHECK_EQ(cuts_->cut_values_.Device(), device_); - std::unique_ptr fmt{new EllpackPageRawFormat{cuts_, device_, param, has_hmm_}}; + std::unique_ptr fmt{new EllpackPageRawFormat{ctx_, cuts_, device_, param, has_hmm_}}; return fmt; } - void SetCuts(std::shared_ptr cuts, DeviceOrd device, - EllpackCacheInfo cinfo) { + void SetCuts(Context const* ctx, std::shared_ptr cuts, + DeviceOrd device, EllpackCacheInfo cinfo) { + this->ctx_ = ctx; std::swap(this->cuts_, cuts); this->device_ = device; CHECK(this->device_.IsCUDA()); @@ -230,6 +232,8 @@ class EllpackFormatPolicy { } [[nodiscard]] auto Device() const { return this->device_; } [[nodiscard]] auto const& CacheInfo() { return this->cache_info_; } + [[nodiscard]] auto Ctx() const { return this->ctx_; } + [[nodiscard]] auto MakePage() const { return std::make_shared(ctx_); } }; template typename F> @@ -311,7 +315,7 @@ class EllpackPageSourceImpl : public PageSourceIncMixIn { feature_types_{feature_types} { this->source_ = source; cuts->SetDevice(ctx->Device()); - this->SetCuts(std::move(cuts), ctx->Device(), cinfo); + this->SetCuts(ctx, std::move(cuts), ctx->Device(), cinfo); this->Fetch(); } @@ -353,7 +357,7 @@ class ExtEllpackPageSourceImpl : public ExtQantileSourceMixinSetDevice(ctx->Device()); - this->SetCuts(std::move(cuts), ctx->Device(), cinfo); + this->SetCuts(ctx, std::move(cuts), ctx->Device(), cinfo); CHECK(!this->cache_info_->written); this->source_->Reset(); CHECK(this->source_->Next()); diff --git a/src/data/gradient_index_page_source.h b/src/data/gradient_index_page_source.h index e6c97540a1db..4b2342bbc7e9 100644 --- a/src/data/gradient_index_page_source.h +++ b/src/data/gradient_index_page_source.h @@ -37,6 +37,7 @@ class GHistIndexFormatPolicy { } void SetCuts(common::HistogramCuts cuts) { std::swap(cuts_, cuts); } + [[nodiscard]] auto MakePage() const { return std::make_shared(); } }; class GradientIndexPageSource diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index 47caf0e72664..e5e46d872709 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -212,7 +212,7 @@ void IterativeDMatrix::Save(common::AlignedFileWriteStream*) const { LOG(FATAL) << "Not implemented"; } -IterativeDMatrix* IterativeDMatrix::Load(common::AlignedResourceReadStream*) { +IterativeDMatrix* IterativeDMatrix::Load(Context const*, common::AlignedResourceReadStream*) { LOG(FATAL) << "Not implemented"; return nullptr; } diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index b8e5eaaba705..3e006e53c744 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -44,7 +44,7 @@ void IterativeDMatrix::InitFromCUDA( if (!ellpack_) { // Should be put inside the while loop to protect against empty batch. In // that case device id is invalid. - ellpack_.reset(new EllpackPage); + ellpack_.reset(new EllpackPage{&fmat_ctx_}); *(ellpack_->Impl()) = EllpackPageImpl(&fmat_ctx_, cuts, this->IsDense(), ext_info.row_stride, ext_info.accumulated_rows); } @@ -107,7 +107,7 @@ BatchSet IterativeDMatrix::GetEllpackBatches(Context const* ctx, } if (!ellpack_) { - ellpack_.reset(new EllpackPage()); + ellpack_.reset(new EllpackPage{&fmat_ctx_}); if (ctx->IsCUDA()) { this->Info().feature_types.SetDevice(ctx->Device()); *ellpack_->Impl() = @@ -137,20 +137,20 @@ void IterativeDMatrix::Save(common::AlignedFileWriteStream* fo) const { auto const& p_cuts = this->ellpack_->Impl()->CutsShared(); p_cuts->Save(fo); // Save ellpack - auto fmt = - std::make_unique(p_cuts, this->Ctx()->Device(), BatchParam{}, false); + auto fmt = std::make_unique(this->Ctx(), p_cuts, this->Ctx()->Device(), + BatchParam{}, false); auto n_bytes = fmt->Write(*this->ellpack_, fo); CHECK_GE(n_bytes, this->ellpack_->Impl()->MemCostBytes()); } -IterativeDMatrix* IterativeDMatrix::Load(common::AlignedResourceReadStream* fi) { +IterativeDMatrix* IterativeDMatrix::Load(Context const* ctx, + common::AlignedResourceReadStream* fi) { CHECK(fi); // Load cuts std::shared_ptr p_cuts{common::HistogramCuts::Load(fi)}; // Load ellpack - auto fmt = std::make_unique(p_cuts, DeviceOrd::CUDA(dh::CurrentDevice()), - BatchParam{}, false); - auto ellpack = std::make_shared(); + auto fmt = std::make_unique(ctx, p_cuts, ctx->Device(), BatchParam{}, false); + auto ellpack = std::make_shared(ctx); CHECK(fmt->Read(ellpack.get(), fi)); return new IterativeDMatrix{std::move(ellpack)}; } diff --git a/src/data/iterative_dmatrix.h b/src/data/iterative_dmatrix.h index ce4b5b49e725..c05ffc5a0c88 100644 --- a/src/data/iterative_dmatrix.h +++ b/src/data/iterative_dmatrix.h @@ -65,7 +65,8 @@ class IterativeDMatrix : public QuantileDMatrix { BatchSet GetExtBatches(Context const *ctx, BatchParam const ¶m) override; void Save(common::AlignedFileWriteStream *fo) const; - [[nodiscard]] static IterativeDMatrix *Load(common::AlignedResourceReadStream *fi); + [[nodiscard]] static IterativeDMatrix *Load(Context const *ctx, + common::AlignedResourceReadStream *fi); }; } // namespace data } // namespace xgboost diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index 14f65d1119eb..6e9b76b5c381 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -235,6 +235,7 @@ class DefaultFormatPolicy { std::unique_ptr fmt{::xgboost::data::CreatePageFormat("raw")}; return fmt; } + [[nodiscard]] auto MakePage() const { return std::make_shared(); } }; /** @@ -317,7 +318,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol } auto p = this->param_; ring_->at(fetch_it) = this->workers_.Submit([fetch_it, self, p, this] { - auto page = std::make_shared(); + auto page = self->MakePage(); this->exce_.Run([&] { std::unique_ptr fmt{self->CreatePageFormat(p)}; auto name = self->cache_info_->ShardName(); diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu index 9fa7ab83f966..30a18db523c4 100644 --- a/tests/cpp/data/test_ellpack_page_raw_format.cu +++ b/tests/cpp/data/test_ellpack_page_raw_format.cu @@ -56,7 +56,7 @@ class TestEllpackPageRawFormat : public ::testing::TestWithParam { auto row_stride = GetRowStride(m.get()); EllpackCacheInfo cinfo = CInfoForTest(&ctx, m.get(), row_stride, param, cuts); - policy.SetCuts(cuts, ctx.Device(), cinfo); + policy.SetCuts(&ctx, cuts, ctx.Device(), cinfo); std::unique_ptr format{policy.CreatePageFormat(param)}; @@ -68,7 +68,7 @@ class TestEllpackPageRawFormat : public ::testing::TestWithParam { } } - EllpackPage page; + EllpackPage page{&ctx}; auto fi = policy.CreateReader(StringView{path}, static_cast(0), n_bytes); ASSERT_TRUE(format->Read(&page, fi.get())); @@ -130,7 +130,7 @@ TEST_P(TestEllpackPageRawFormat, HostIO) { cinfo.buffer_bytes.push_back(page.Impl()->MemCostBytes()); cinfo.buffer_rows.push_back(page.Impl()->n_rows); } - policy.SetCuts(page.Impl()->CutsShared(), ctx.Device(), std::move(cinfo)); + policy.SetCuts(&ctx, page.Impl()->CutsShared(), ctx.Device(), std::move(cinfo)); format = policy.CreatePageFormat(param); } auto writer = policy.CreateWriter({}, i); @@ -143,7 +143,7 @@ TEST_P(TestEllpackPageRawFormat, HostIO) { for (std::size_t i = 0; i < 3; ++i) { auto reader = policy.CreateReader({}, cache.offset[i], cache.Bytes(i)); - EllpackPage page; + EllpackPage page{&ctx}; ASSERT_TRUE(format->Read(&page, reader.get())); ASSERT_EQ(page.Impl()->MemCostBytes(), cache.Bytes(i)); auto p_fmat = RandomDataGenerator{100, 14, 0.5}.Seed(i).GenerateDMatrix(); @@ -202,7 +202,7 @@ TEST(EllpackPageRawFormat, DevicePageConcat) { } else { EXPECT_EQ(cinfo.buffer_rows.size(), 4ul); } - policy.SetCuts(page.Impl()->CutsShared(), ctx.Device(), std::move(cinfo)); + policy.SetCuts(&ctx, page.Impl()->CutsShared(), ctx.Device(), std::move(cinfo)); } auto format = policy.CreatePageFormat(param); diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu index 362c789e25e5..a421ffecb5d0 100644 --- a/tests/cpp/data/test_iterative_dmatrix.cu +++ b/tests/cpp/data/test_iterative_dmatrix.cu @@ -204,7 +204,7 @@ TEST(IterativeDeviceDMatrix, IO) { } auto fsize = std::filesystem::file_size(path); auto fi = std::make_unique(path.string(), 0ul, fsize); - auto loaded = std::shared_ptr(IterativeDMatrix::Load(fi.get())); + auto loaded = std::shared_ptr(IterativeDMatrix::Load(&ctx, fi.get())); for (auto const& orig_page : qdm->GetBatches(&ctx, {})) { for (auto const& new_page : loaded->GetBatches(&ctx, {})) { std::vector h_orig, h_new; From 7e8a6ccebaa22f45745ae842b6b434c31ee2ce27 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 2 Apr 2026 02:19:07 +0800 Subject: [PATCH 03/11] lint. --- src/data/ellpack_page_source.cu | 3 ++- src/data/iterative_dmatrix.cu | 3 ++- src/data/iterative_dmatrix.h | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index cc0731021f39..82e1bed58d4d 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -532,7 +532,8 @@ void EllpackPageSourceImpl::Fetch() { if (this->GetCuts()->HasCategorical()) { CHECK(!this->feature_types_.empty()); } - *impl = EllpackPageImpl{this->Ctx(), this->GetCuts(), *csr, is_dense_, row_stride_, feature_types_}; + *impl = + EllpackPageImpl{this->Ctx(), this->GetCuts(), *csr, is_dense_, row_stride_, feature_types_}; this->page_->SetBaseRowId(csr->base_rowid); LOG(INFO) << "Generated an Ellpack page with size: " << common::HumanMemUnit(impl->MemCostBytes()) diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 3e006e53c744..f37d4412f7e1 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -149,7 +149,8 @@ IterativeDMatrix* IterativeDMatrix::Load(Context const* ctx, // Load cuts std::shared_ptr p_cuts{common::HistogramCuts::Load(fi)}; // Load ellpack - auto fmt = std::make_unique(ctx, p_cuts, ctx->Device(), BatchParam{}, false); + auto fmt = + std::make_unique(ctx, p_cuts, ctx->Device(), BatchParam{}, false); auto ellpack = std::make_shared(ctx); CHECK(fmt->Read(ellpack.get(), fi)); return new IterativeDMatrix{std::move(ellpack)}; diff --git a/src/data/iterative_dmatrix.h b/src/data/iterative_dmatrix.h index c05ffc5a0c88..97b087ffc114 100644 --- a/src/data/iterative_dmatrix.h +++ b/src/data/iterative_dmatrix.h @@ -66,7 +66,7 @@ class IterativeDMatrix : public QuantileDMatrix { void Save(common::AlignedFileWriteStream *fo) const; [[nodiscard]] static IterativeDMatrix *Load(Context const *ctx, - common::AlignedResourceReadStream *fi); + common::AlignedResourceReadStream *fi); }; } // namespace data } // namespace xgboost From 115dea777d4b416b77922ace506a4ca90c1f5e7e Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 2 Apr 2026 02:21:49 +0800 Subject: [PATCH 04/11] cpu build. --- src/data/ellpack_page.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data/ellpack_page.cc b/src/data/ellpack_page.cc index 77c2798d76c4..958ad70f1ac8 100644 --- a/src/data/ellpack_page.cc +++ b/src/data/ellpack_page.cc @@ -20,7 +20,7 @@ class EllpackPageImpl { [[nodiscard]] std::shared_ptr CutsShared() const { return cuts_; } }; -EllpackPage::EllpackPage(Context const*) = default; +EllpackPage::EllpackPage(Context const*) {} EllpackPage::EllpackPage(Context const*, DMatrix*, const BatchParam&) { LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but " From 5fb7aeb3ebe11a7c3af1cfef68624480ac1dbca9 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 2 Apr 2026 02:47:09 +0800 Subject: [PATCH 05/11] Fix. --- src/data/iterative_dmatrix.cu | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index f37d4412f7e1..0d02f5cb66b4 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -110,20 +110,15 @@ BatchSet IterativeDMatrix::GetEllpackBatches(Context const* ctx, ellpack_.reset(new EllpackPage{&fmat_ctx_}); if (ctx->IsCUDA()) { this->Info().feature_types.SetDevice(ctx->Device()); - *ellpack_->Impl() = - EllpackPageImpl(ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); } else if (fmat_ctx_.IsCUDA()) { this->Info().feature_types.SetDevice(fmat_ctx_.Device()); - *ellpack_->Impl() = - EllpackPageImpl(&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); } else { // Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM // for cut reference. - auto cuda_ctx = ctx->MakeCUDA(); - this->Info().feature_types.SetDevice(cuda_ctx.Device()); - *ellpack_->Impl() = - EllpackPageImpl(&cuda_ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); + this->Info().feature_types.SetDevice(ctx->MakeCUDA().Device()); } + *ellpack_->Impl() = + EllpackPageImpl{&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()}; } CHECK(ellpack_); auto begin_iter = BatchIterator(new SimpleBatchIteratorImpl(ellpack_)); From 54dd6a7861d045aac967da94b4918c07b04dd5ee Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 2 Apr 2026 04:01:06 +0800 Subject: [PATCH 06/11] Fix --- src/data/simple_dmatrix.cc | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index a249c99ac515..a18ec4a59ea6 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -156,15 +156,12 @@ BatchSet SimpleDMatrix::GetEllpackBatches(Context const* ctx, if (ctx->IsCUDA()) { // The context passed in is on GPU, we pick it first since we prioritize the context // in Booster. - ellpack_page_.reset(new EllpackPage(ctx, this, param)); - } else if (fmat_ctx_.IsCUDA()) { - // DMatrix was initialized on GPU, we use the context from initialization. - ellpack_page_.reset(new EllpackPage(&fmat_ctx_, this, param)); - } else { + fmat_ctx_ = *ctx; + } else if (!fmat_ctx_.IsCUDA()) { // Mismatched parameter, user set a new max_bin during training. - auto cuda_ctx = ctx->MakeCUDA(); - ellpack_page_.reset(new EllpackPage(&cuda_ctx, this, param)); + fmat_ctx_ = ctx->MakeCUDA(); } + ellpack_page_.reset(new EllpackPage(&fmat_ctx_, this, param)); batch_param_ = param.MakeCache(); } From cd03fbca8e768e1940bcfec3eee862b108dfbe28 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 2 Apr 2026 05:14:25 +0800 Subject: [PATCH 07/11] Fix. --- src/data/iterative_dmatrix.cu | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 0d02f5cb66b4..f2fce5c77d89 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -107,16 +107,13 @@ BatchSet IterativeDMatrix::GetEllpackBatches(Context const* ctx, } if (!ellpack_) { - ellpack_.reset(new EllpackPage{&fmat_ctx_}); if (ctx->IsCUDA()) { - this->Info().feature_types.SetDevice(ctx->Device()); - } else if (fmat_ctx_.IsCUDA()) { - this->Info().feature_types.SetDevice(fmat_ctx_.Device()); - } else { - // Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM - // for cut reference. - this->Info().feature_types.SetDevice(ctx->MakeCUDA().Device()); + fmat_ctx_ = *ctx; + } else if (!fmat_ctx_.IsCUDA()) { + fmat_ctx_ = ctx->MakeCUDA(); } + this->Info().feature_types.SetDevice(fmat_ctx_.Device()); + ellpack_.reset(new EllpackPage{&fmat_ctx_}); *ellpack_->Impl() = EllpackPageImpl{&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()}; } From 1242879226bfd1b5e5226a3614a50528f453a8c8 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 2 Apr 2026 17:28:00 +0800 Subject: [PATCH 08/11] Use explicit stream sync. --- src/data/ellpack_page.cu | 20 +++-------------- src/data/ellpack_page.cuh | 3 +-- src/data/ellpack_page.h | 2 +- src/data/ellpack_page_source.cu | 22 ++++++++++++++++--- src/data/ellpack_page_source.h | 7 +++++- src/data/gradient_index_page_source.h | 2 +- src/data/iterative_dmatrix.cu | 6 ++--- src/data/sparse_page_source.h | 10 ++++----- .../cpp/data/test_ellpack_page_raw_format.cu | 4 ++-- 9 files changed, 41 insertions(+), 35 deletions(-) diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 643db6ed6084..d3843e98de7f 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -30,9 +30,9 @@ #include "xgboost/data.h" // for DMatrix namespace xgboost { -EllpackPage::EllpackPage(Context const* ctx) : impl_{new EllpackPageImpl{ctx}} {} +EllpackPage::EllpackPage() : impl_{new EllpackPageImpl{}} {} -EllpackPageImpl::EllpackPageImpl(Context const* ctx) : ctx_{ctx} {} +EllpackPageImpl::EllpackPageImpl() = default; EllpackPage::EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param) : impl_{new EllpackPageImpl{ctx, dmat, param}} {} @@ -189,7 +189,6 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, : is_dense{is_dense}, n_rows{n_rows}, cuts_{std::move(cuts)}, - ctx_{ctx}, info{CalcNumSymbols(ctx, row_stride, is_dense, this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -205,7 +204,6 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, : is_dense{is_dense}, n_rows{page.Size()}, cuts_{std::move(cuts)}, - ctx_{ctx}, info{CalcNumSymbols(ctx, row_stride, is_dense, this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -225,7 +223,6 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const Batc common::DeviceSketch(ctx, p_fmat, param.max_bin)) : std::make_shared( common::DeviceSketchWithHessian(ctx, p_fmat, param.max_bin, param.hess))}, - ctx_{ctx}, info{CalcNumSymbols(ctx, GetRowStride(p_fmat), p_fmat->IsDense(), this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -470,7 +467,6 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag cuts->SetDevice(ctx->Device()); return cuts; }()}, - ctx_{ctx}, info{CalcNumSymbols( ctx, [&] { @@ -506,17 +502,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag this->monitor_.Stop("CopyGHistToEllpack"); } -EllpackPageImpl::~EllpackPageImpl() noexcept(false) { - // Sync the stream to make sure all running CUDA kernels finish before deallocation. - auto status = ctx_->CUDACtx()->Stream().Sync(false); - if (status != cudaSuccess) { - auto str = cudaGetErrorString(status); - // For external-memory, throwing here can trigger a series of calls to - // `std::terminate` by various destructors. For now, we just log the error. - LOG(WARNING) << "Ran into CUDA error:" << str << "\nXGBoost is likely to abort."; - } - dh::safe_cuda(status); -} +EllpackPageImpl::~EllpackPageImpl() noexcept(false) = default; // A functor that copies the data from one EllpackPage to another. template diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index fabe2db74a24..13ef7b877e56 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -186,7 +186,7 @@ class EllpackPageImpl { * This is used in the external memory case. An empty ELLPACK page is constructed with its content * set later by the reader. */ - explicit EllpackPageImpl(Context const* ctx); + EllpackPageImpl(); /** * @brief Constructor from existing ellpack matrics. @@ -357,7 +357,6 @@ class EllpackPageImpl { void InitCompressedData(Context const* ctx); std::shared_ptr cuts_; - Context const* ctx_{nullptr}; public: bool is_dense{false}; diff --git a/src/data/ellpack_page.h b/src/data/ellpack_page.h index dcaf02fbb672..474f6f955df1 100644 --- a/src/data/ellpack_page.h +++ b/src/data/ellpack_page.h @@ -28,7 +28,7 @@ class EllpackPage { * This is used in the external memory case. An empty ELLPACK page is constructed with its content * set later by the reader. */ - explicit EllpackPage(Context const* ctx); + EllpackPage(); /** * @brief Constructor from an existing DMatrix. * diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 82e1bed58d4d..16512d702b62 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -183,7 +183,7 @@ class EllpackHostCacheStreamImpl { // Finish writing a (concatenated) cache page. auto commit_page = [&](EllpackPageImpl const* old_impl) { CHECK_EQ(old_impl->gidx_buffer.Resource()->Type(), common::ResourceHandler::kCudaMalloc); - auto new_impl = std::make_unique(ctx); + auto new_impl = std::make_unique(); new_impl->CopyInfo(old_impl); // Split the cache into host cache, compressed host cache, and the device cache. We @@ -385,6 +385,20 @@ void EllpackHostCacheStream::Read(Context const* ctx, EllpackPage* page, bool pr return this->p_impl_->Write(ctx, page); } +/** + * EllpackFormatPolicy + */ +template +void EllpackFormatPolicy::DestroyPage(std::shared_ptr& page) const { + if (page && ctx_) { + ctx_->CUDACtx()->Stream().Sync(); + } + page.reset(); +} + +template void EllpackFormatPolicy::DestroyPage( + std::shared_ptr& page) const; + /** * EllpackCacheStreamPolicy */ @@ -527,7 +541,8 @@ void EllpackPageSourceImpl::Fetch() { // This is not read from cache so we still need it to be synced with sparse page source. CHECK_EQ(this->Iter(), this->source_->Iter()); auto const& csr = this->source_->Page(); - this->page_.reset(new EllpackPage{this->Ctx()}); + this->DestroyPage(this->page_); + this->page_.reset(new EllpackPage{}); auto* impl = this->page_->Impl(); if (this->GetCuts()->HasCategorical()) { CHECK(!this->feature_types_.empty()); @@ -572,7 +587,8 @@ void ExtEllpackPageSourceImpl::Fetch() { bst_idx_t row_stride = GetRowCounts(this->ctx_, value, row_counts_span, dh::GetDevice(this->ctx_), this->missing_); CHECK_LE(row_stride, this->ext_info_.row_stride); - this->page_.reset(new EllpackPage{this->ctx_}); + this->DestroyPage(this->page_); + this->page_.reset(new EllpackPage{}); *this->page_->Impl() = EllpackPageImpl{this->ctx_, value, this->missing_, diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index 9613532006ae..a5c7c7112179 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -233,7 +233,7 @@ class EllpackFormatPolicy { [[nodiscard]] auto Device() const { return this->device_; } [[nodiscard]] auto const& CacheInfo() { return this->cache_info_; } [[nodiscard]] auto Ctx() const { return this->ctx_; } - [[nodiscard]] auto MakePage() const { return std::make_shared(ctx_); } + void DestroyPage(std::shared_ptr& page) const; }; template typename F> @@ -387,6 +387,11 @@ using ExtEllpackPageSource = ExtEllpackPageSourceImpl>; #if !defined(XGBOOST_USE_CUDA) +template +inline void EllpackFormatPolicy::DestroyPage(std::shared_ptr& page) const { + page.reset(); +} + template inline void EllpackPageSourceImpl::Fetch() { // silent the warning about unused variables. diff --git a/src/data/gradient_index_page_source.h b/src/data/gradient_index_page_source.h index 4b2342bbc7e9..2909169f4f6a 100644 --- a/src/data/gradient_index_page_source.h +++ b/src/data/gradient_index_page_source.h @@ -37,7 +37,7 @@ class GHistIndexFormatPolicy { } void SetCuts(common::HistogramCuts cuts) { std::swap(cuts_, cuts); } - [[nodiscard]] auto MakePage() const { return std::make_shared(); } + static void DestroyPage(std::shared_ptr& page) { page.reset(); } }; class GradientIndexPageSource diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index f2fce5c77d89..37aefc12df68 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -44,7 +44,7 @@ void IterativeDMatrix::InitFromCUDA( if (!ellpack_) { // Should be put inside the while loop to protect against empty batch. In // that case device id is invalid. - ellpack_.reset(new EllpackPage{&fmat_ctx_}); + ellpack_.reset(new EllpackPage{}); *(ellpack_->Impl()) = EllpackPageImpl(&fmat_ctx_, cuts, this->IsDense(), ext_info.row_stride, ext_info.accumulated_rows); } @@ -113,7 +113,7 @@ BatchSet IterativeDMatrix::GetEllpackBatches(Context const* ctx, fmat_ctx_ = ctx->MakeCUDA(); } this->Info().feature_types.SetDevice(fmat_ctx_.Device()); - ellpack_.reset(new EllpackPage{&fmat_ctx_}); + ellpack_.reset(new EllpackPage{}); *ellpack_->Impl() = EllpackPageImpl{&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()}; } @@ -143,7 +143,7 @@ IterativeDMatrix* IterativeDMatrix::Load(Context const* ctx, // Load ellpack auto fmt = std::make_unique(ctx, p_cuts, ctx->Device(), BatchParam{}, false); - auto ellpack = std::make_shared(ctx); + auto ellpack = std::make_shared(); CHECK(fmt->Read(ellpack.get(), fi)); return new IterativeDMatrix{std::move(ellpack)}; } diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index 6e9b76b5c381..cf04f007d164 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -235,7 +235,7 @@ class DefaultFormatPolicy { std::unique_ptr fmt{::xgboost::data::CreatePageFormat("raw")}; return fmt; } - [[nodiscard]] auto MakePage() const { return std::make_shared(); } + static void DestroyPage(std::shared_ptr& page) { page.reset(); } }; /** @@ -300,9 +300,9 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol exce_.Rethrow(); // Clear out the existing page before loading new ones. This helps reduce memory usage - // when page is not loaded with mmap, in addition, it triggers necessary CUDA - // synchronizations by freeing memory. - page_.reset(); + // when page is not loaded with mmap. The destruction policy handles any necessary + // synchronizations (e.g., CUDA stream sync for Ellpack pages). + this->DestroyPage(page_); for (std::int32_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) { bool restart = fetch_it == n_batches; @@ -318,7 +318,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol } auto p = this->param_; ring_->at(fetch_it) = this->workers_.Submit([fetch_it, self, p, this] { - auto page = self->MakePage(); + auto page = std::make_shared(); this->exce_.Run([&] { std::unique_ptr fmt{self->CreatePageFormat(p)}; auto name = self->cache_info_->ShardName(); diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu index 30a18db523c4..616d8fc6cb44 100644 --- a/tests/cpp/data/test_ellpack_page_raw_format.cu +++ b/tests/cpp/data/test_ellpack_page_raw_format.cu @@ -68,7 +68,7 @@ class TestEllpackPageRawFormat : public ::testing::TestWithParam { } } - EllpackPage page{&ctx}; + EllpackPage page; auto fi = policy.CreateReader(StringView{path}, static_cast(0), n_bytes); ASSERT_TRUE(format->Read(&page, fi.get())); @@ -143,7 +143,7 @@ TEST_P(TestEllpackPageRawFormat, HostIO) { for (std::size_t i = 0; i < 3; ++i) { auto reader = policy.CreateReader({}, cache.offset[i], cache.Bytes(i)); - EllpackPage page{&ctx}; + EllpackPage page; ASSERT_TRUE(format->Read(&page, reader.get())); ASSERT_EQ(page.Impl()->MemCostBytes(), cache.Bytes(i)); auto p_fmat = RandomDataGenerator{100, 14, 0.5}.Seed(i).GenerateDMatrix(); From 65e74866a6357e14c9d399fd2c77ab93567ce4cc Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 3 Apr 2026 19:56:58 +0800 Subject: [PATCH 09/11] CPU build. --- src/data/ellpack_page.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data/ellpack_page.cc b/src/data/ellpack_page.cc index 958ad70f1ac8..69fd46b176d1 100644 --- a/src/data/ellpack_page.cc +++ b/src/data/ellpack_page.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019-2024, XGBoost contributors + * Copyright 2019-2026, XGBoost contributors */ #ifndef XGBOOST_USE_CUDA @@ -20,7 +20,7 @@ class EllpackPageImpl { [[nodiscard]] std::shared_ptr CutsShared() const { return cuts_; } }; -EllpackPage::EllpackPage(Context const*) {} +EllpackPage::EllpackPage() = default; EllpackPage::EllpackPage(Context const*, DMatrix*, const BatchParam&) { LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but " From 101add01100efbaca3559d094e31111dfb43703e Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 3 Apr 2026 20:02:57 +0800 Subject: [PATCH 10/11] lint. --- src/data/ellpack_page_source.cu | 12 ++++++------ src/data/ellpack_page_source.h | 8 ++++---- src/data/gradient_index_page_source.h | 2 +- src/data/sparse_page_source.h | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 16512d702b62..7f3dc5c8c9dc 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -1,5 +1,5 @@ /** - * Copyright 2019-2025, XGBoost contributors + * Copyright 2019-2026, XGBoost contributors */ #include // for max #include // for size_t @@ -389,15 +389,15 @@ void EllpackHostCacheStream::Read(Context const* ctx, EllpackPage* page, bool pr * EllpackFormatPolicy */ template -void EllpackFormatPolicy::DestroyPage(std::shared_ptr& page) const { +void EllpackFormatPolicy::DestroyPage(std::shared_ptr* page) const { if (page && ctx_) { ctx_->CUDACtx()->Stream().Sync(); } - page.reset(); + page->reset(); } template void EllpackFormatPolicy::DestroyPage( - std::shared_ptr& page) const; + std::shared_ptr* page) const; /** * EllpackCacheStreamPolicy @@ -541,7 +541,7 @@ void EllpackPageSourceImpl::Fetch() { // This is not read from cache so we still need it to be synced with sparse page source. CHECK_EQ(this->Iter(), this->source_->Iter()); auto const& csr = this->source_->Page(); - this->DestroyPage(this->page_); + this->DestroyPage(&this->page_); this->page_.reset(new EllpackPage{}); auto* impl = this->page_->Impl(); if (this->GetCuts()->HasCategorical()) { @@ -587,7 +587,7 @@ void ExtEllpackPageSourceImpl::Fetch() { bst_idx_t row_stride = GetRowCounts(this->ctx_, value, row_counts_span, dh::GetDevice(this->ctx_), this->missing_); CHECK_LE(row_stride, this->ext_info_.row_stride); - this->DestroyPage(this->page_); + this->DestroyPage(&this->page_); this->page_.reset(new EllpackPage{}); *this->page_->Impl() = EllpackPageImpl{this->ctx_, value, diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index a5c7c7112179..0676a41d1b77 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -1,5 +1,5 @@ /** - * Copyright 2019-2025, XGBoost Contributors + * Copyright 2019-2026, XGBoost Contributors */ #ifndef XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_ @@ -233,7 +233,7 @@ class EllpackFormatPolicy { [[nodiscard]] auto Device() const { return this->device_; } [[nodiscard]] auto const& CacheInfo() { return this->cache_info_; } [[nodiscard]] auto Ctx() const { return this->ctx_; } - void DestroyPage(std::shared_ptr& page) const; + void DestroyPage(std::shared_ptr* page) const; }; template typename F> @@ -388,8 +388,8 @@ using ExtEllpackPageSource = #if !defined(XGBOOST_USE_CUDA) template -inline void EllpackFormatPolicy::DestroyPage(std::shared_ptr& page) const { - page.reset(); +inline void EllpackFormatPolicy::DestroyPage(std::shared_ptr* page) const { + page->reset(); } template diff --git a/src/data/gradient_index_page_source.h b/src/data/gradient_index_page_source.h index 2909169f4f6a..19a5b19c19a4 100644 --- a/src/data/gradient_index_page_source.h +++ b/src/data/gradient_index_page_source.h @@ -37,7 +37,7 @@ class GHistIndexFormatPolicy { } void SetCuts(common::HistogramCuts cuts) { std::swap(cuts_, cuts); } - static void DestroyPage(std::shared_ptr& page) { page.reset(); } + static void DestroyPage(std::shared_ptr* page) { page->reset(); } }; class GradientIndexPageSource diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index cf04f007d164..7a5528df78df 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -235,7 +235,7 @@ class DefaultFormatPolicy { std::unique_ptr fmt{::xgboost::data::CreatePageFormat("raw")}; return fmt; } - static void DestroyPage(std::shared_ptr& page) { page.reset(); } + static void DestroyPage(std::shared_ptr* page) { page->reset(); } }; /** @@ -302,7 +302,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol // Clear out the existing page before loading new ones. This helps reduce memory usage // when page is not loaded with mmap. The destruction policy handles any necessary // synchronizations (e.g., CUDA stream sync for Ellpack pages). - this->DestroyPage(page_); + this->DestroyPage(&page_); for (std::int32_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) { bool restart = fetch_it == n_batches; From 63370db3bf03d71c2125859571b67b3346217ed0 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 3 Apr 2026 20:05:22 +0800 Subject: [PATCH 11/11] lint. --- src/data/gradient_index_page_source.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data/gradient_index_page_source.h b/src/data/gradient_index_page_source.h index 19a5b19c19a4..a223ef0828ce 100644 --- a/src/data/gradient_index_page_source.h +++ b/src/data/gradient_index_page_source.h @@ -1,5 +1,5 @@ /** - * Copyright 2021-2024, XGBoost Contributors + * Copyright 2021-2026, XGBoost Contributors */ #ifndef XGBOOST_DATA_GRADIENT_INDEX_PAGE_SOURCE_H_ #define XGBOOST_DATA_GRADIENT_INDEX_PAGE_SOURCE_H_