From e2028d8e35dd6e12561b7f603feafa8e3be24b4d Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 2 Apr 2026 00:59:22 +0800
Subject: [PATCH 01/11] Pass the member context.

---
 src/data/extmem_quantile_dmatrix.cc | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/data/extmem_quantile_dmatrix.cc b/src/data/extmem_quantile_dmatrix.cc
index fad3240f5b65..d2e05da3ca0d 100644
--- a/src/data/extmem_quantile_dmatrix.cc
+++ b/src/data/extmem_quantile_dmatrix.cc
@@ -35,21 +35,23 @@ ExtMemQuantileDMatrix::ExtMemQuantileDMatrix(DataIterHandle iter_handle, DMatrix
   CHECK(valid) << "Qauntile DMatrix must have at least 1 batch.";
 
   auto pctx = MakeProxy(proxy)->Ctx();
-  Context ctx;
-  ctx.Init(Args{{"nthread", std::to_string(config.n_threads)}, {"device", pctx->DeviceName()}});
+  {
+    Context ctx;
+    ctx.Init(Args{{"nthread", std::to_string(config.n_threads)}, {"device", pctx->DeviceName()}});
+    this->fmat_ctx_ = ctx;
+  }
 
   BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};
-  if (ctx.IsCPU()) {
+  if (fmat_ctx_.IsCPU()) {
     CHECK(detail::HostRatioIsAuto(config.cache_host_ratio)) << error::CacheHostRatioNotImpl();
-    this->InitFromCPU(&ctx, iter, proxy, p, config.missing, ref);
+    this->InitFromCPU(&fmat_ctx_, iter, proxy, p, config.missing, ref);
   } else {
     p.n_prefetch_batches = ::xgboost::cuda_impl::DftPrefetchBatches();
-    this->InitFromCUDA(&ctx, iter, proxy, p, ref, config);
+    this->InitFromCUDA(&fmat_ctx_, iter, proxy, p, ref, config);
   }
   this->batch_ = p;
-  this->fmat_ctx_ = ctx;
 
-  SyncCategories(&ctx, info_.Cats(), info_.num_row_ == 0);
+  SyncCategories(&fmat_ctx_, info_.Cats(), info_.num_row_ == 0);
 }
 
 ExtMemQuantileDMatrix::~ExtMemQuantileDMatrix() {

From d9fbab9a3cb7680b600c4b7f0f23b3651a5bdc12 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 2 Apr 2026 02:04:19 +0800
Subject: [PATCH 02/11] Use context in `ExtMemQuantileDMatrix`.

---
 src/data/ellpack_page.cc                      |  2 +-
 src/data/ellpack_page.cu                      | 12 +++++--
 src/data/ellpack_page.cuh                     |  5 +--
 src/data/ellpack_page.h                       |  4 +--
 src/data/ellpack_page_raw_format.cu           | 31 +++++++++----------
 src/data/ellpack_page_raw_format.h            |  7 +++--
 src/data/ellpack_page_source.cu               | 30 +++++++++---------
 src/data/ellpack_page_source.h                | 16 ++++++----
 src/data/gradient_index_page_source.h         |  1 +
 src/data/iterative_dmatrix.cc                 |  2 +-
 src/data/iterative_dmatrix.cu                 | 16 +++++-----
 src/data/iterative_dmatrix.h                  |  3 +-
 src/data/sparse_page_source.h                 |  3 +-
 .../cpp/data/test_ellpack_page_raw_format.cu  | 10 +++---
 tests/cpp/data/test_iterative_dmatrix.cu      |  2 +-
 15 files changed, 79 insertions(+), 65 deletions(-)

diff --git a/src/data/ellpack_page.cc b/src/data/ellpack_page.cc
index 6b0a81c169ff..77c2798d76c4 100644
--- a/src/data/ellpack_page.cc
+++ b/src/data/ellpack_page.cc
@@ -20,7 +20,7 @@ class EllpackPageImpl {
   [[nodiscard]] std::shared_ptr<common::HistogramCuts const> CutsShared() const { return cuts_; }
 };
 
-EllpackPage::EllpackPage() = default;
+EllpackPage::EllpackPage(Context const*) = default;
 
 EllpackPage::EllpackPage(Context const*, DMatrix*, const BatchParam&) {
   LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index 60db1ab7b9e2..643db6ed6084 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -18,7 +18,7 @@
 #include "../common/compressed_iterator.h"  // for CompressedIterator
 #include "../common/cuda_context.cuh"       // for CUDAContext
 #include "../common/cuda_rt_utils.h"        // for SetDevice
-#include "../common/cuda_stream.h"          // for DefaultStream
+#include "../common/cuda_stream.h"          // for StreamRef
 #include "../common/hist_util.cuh"          // for HistogramCuts
 #include "../common/ref_resource_view.cuh"  // for MakeFixedVecWithCudaMalloc
 #include "../common/transform_iterator.h"   // for MakeIndexTransformIter
@@ -30,7 +30,9 @@
 #include "xgboost/data.h"                   // for DMatrix
 
 namespace xgboost {
-EllpackPage::EllpackPage() : impl_{new EllpackPageImpl{}} {}
+EllpackPage::EllpackPage(Context const* ctx) : impl_{new EllpackPageImpl{ctx}} {}
+
+EllpackPageImpl::EllpackPageImpl(Context const* ctx) : ctx_{ctx} {}
 
 EllpackPage::EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param)
     : impl_{new EllpackPageImpl{ctx, dmat, param}} {}
@@ -187,6 +189,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx,
     : is_dense{is_dense},
       n_rows{n_rows},
       cuts_{std::move(cuts)},
+      ctx_{ctx},
       info{CalcNumSymbols(ctx, row_stride, is_dense, this->cuts_)} {
   monitor_.Init("ellpack_page");
   curt::SetDevice(ctx->Ordinal());
@@ -202,6 +205,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx,
     : is_dense{is_dense},
       n_rows{page.Size()},
       cuts_{std::move(cuts)},
+      ctx_{ctx},
       info{CalcNumSymbols(ctx, row_stride, is_dense, this->cuts_)} {
   monitor_.Init("ellpack_page");
   curt::SetDevice(ctx->Ordinal());
@@ -221,6 +225,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const Batc
                       common::DeviceSketch(ctx, p_fmat, param.max_bin))
                 : std::make_shared<common::HistogramCuts>(
                       common::DeviceSketchWithHessian(ctx, p_fmat, param.max_bin, param.hess))},
+      ctx_{ctx},
       info{CalcNumSymbols(ctx, GetRowStride(p_fmat), p_fmat->IsDense(), this->cuts_)} {
   monitor_.Init("ellpack_page");
   curt::SetDevice(ctx->Ordinal());
@@ -465,6 +470,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
         cuts->SetDevice(ctx->Device());
         return cuts;
       }()},
+      ctx_{ctx},
       info{CalcNumSymbols(
           ctx,
           [&] {
@@ -502,7 +508,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
 
 EllpackPageImpl::~EllpackPageImpl() noexcept(false) {
   // Sync the stream to make sure all running CUDA kernels finish before deallocation.
-  auto status = curt::DefaultStream().Sync(false);
+  auto status = ctx_->CUDACtx()->Stream().Sync(false);
   if (status != cudaSuccess) {
     auto str = cudaGetErrorString(status);
     // For external-memory, throwing here can trigger a series of calls to
diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh
index 28dc410211ea..fabe2db74a24 100644
--- a/src/data/ellpack_page.cuh
+++ b/src/data/ellpack_page.cuh
@@ -1,5 +1,5 @@
 /**
- * Copyright 2019-2025, XGBoost Contributors
+ * Copyright 2019-2026, XGBoost Contributors
  */
 #ifndef XGBOOST_DATA_ELLPACK_PAGE_CUH_
 #define XGBOOST_DATA_ELLPACK_PAGE_CUH_
@@ -186,7 +186,7 @@ class EllpackPageImpl {
    * This is used in the external memory case. An empty ELLPACK page is constructed with its content
    * set later by the reader.
    */
-  EllpackPageImpl() = default;
+  explicit EllpackPageImpl(Context const* ctx);
 
   /**
    * @brief Constructor from existing ellpack matrics.
@@ -357,6 +357,7 @@ class EllpackPageImpl {
   void InitCompressedData(Context const* ctx);
 
   std::shared_ptr<common::HistogramCuts const> cuts_;
+  Context const* ctx_{nullptr};
 
  public:
   bool is_dense{false};
diff --git a/src/data/ellpack_page.h b/src/data/ellpack_page.h
index 8b0facd227d8..dcaf02fbb672 100644
--- a/src/data/ellpack_page.h
+++ b/src/data/ellpack_page.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2017-2023 by XGBoost Contributors
+ * Copyright 2017-2026, XGBoost Contributors
  */
 #ifndef XGBOOST_DATA_ELLPACK_PAGE_H_
 #define XGBOOST_DATA_ELLPACK_PAGE_H_
@@ -28,7 +28,7 @@ class EllpackPage {
    * This is used in the external memory case. An empty ELLPACK page is constructed with its content
    * set later by the reader.
    */
-  EllpackPage();
+  explicit EllpackPage(Context const* ctx);
   /**
    * @brief Constructor from an existing DMatrix.
    *
diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu
index a5a2b3748100..d4765743779c 100644
--- a/src/data/ellpack_page_raw_format.cu
+++ b/src/data/ellpack_page_raw_format.cu
@@ -1,12 +1,12 @@
 /**
- * Copyright 2019-2025, XGBoost contributors
+ * Copyright 2019-2026, XGBoost contributors
  */
 #include <dmlc/registry.h>
 
 #include <cstddef>  // for size_t
 #include <vector>   // for vector
 
-#include "../common/cuda_rt_utils.h"
+#include "../common/cuda_context.cuh"       // for CUDAContext
 #include "../common/cuda_stream.h"          // for Event
 #include "../common/io.h"                   // for AlignedResourceReadStream, AlignedFileWriteStream
 #include "../common/ref_resource_view.cuh"  // for MakeFixedVecWithCudaMalloc
@@ -21,7 +21,7 @@ DMLC_REGISTRY_FILE_TAG(ellpack_page_raw_format);
 namespace {
 // Function to support system without HMM or ATS
 template <typename T>
-[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi,
+[[nodiscard]] bool ReadDeviceVec(Context const* ctx, common::AlignedResourceReadStream* fi,
                                  common::RefResourceView<T>* vec) {
   xgboost_NVTX_FN_RANGE();
 
@@ -42,7 +42,7 @@ template <typename T>
 
   *vec = common::MakeFixedVecWithCudaMalloc<T>(n);
   dh::safe_cuda(
-      cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, curt::DefaultStream()));
+      cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, ctx->CUDACtx()->Stream()));
   return true;
 }
 }  // namespace
@@ -62,7 +62,7 @@ template <typename T>
   RET_IF_NOT(fi->Read(&impl->info.row_stride));
 
   if (this->param_.prefetch_copy || !has_hmm_ats_) {
-    RET_IF_NOT(ReadDeviceVec(fi, &impl->gidx_buffer));
+    RET_IF_NOT(ReadDeviceVec(ctx_, fi, &impl->gidx_buffer));
   } else {
     RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer));
   }
@@ -73,7 +73,7 @@ template <typename T>
 
   impl->SetCuts(this->cuts_);
 
-  curt::DefaultStream().Sync();
+  ctx_->CUDACtx()->Stream().Sync();
   return true;
 }
 
@@ -87,14 +87,13 @@ template <typename T>
   bytes += fo->Write(impl->is_dense);
   bytes += fo->Write(impl->info.row_stride);
   std::vector<common::CompressedByteT> h_gidx_buffer;
-  Context ctx = Context{}.MakeCUDA(curt::CurrentDevice());
   // write data into the h_gidx_buffer
-  [[maybe_unused]] auto h_accessor = impl->GetHostEllpack(&ctx, &h_gidx_buffer);
+  [[maybe_unused]] auto h_accessor = impl->GetHostEllpack(ctx_, &h_gidx_buffer);
   bytes += common::WriteVec(fo, h_gidx_buffer);
   bytes += fo->Write(impl->base_rowid);
   bytes += fo->Write(impl->NumSymbols());
 
-  curt::DefaultStream().Sync();
+  ctx_->CUDACtx()->Stream().Sync();
   return bytes;
 }
 
@@ -104,21 +103,21 @@ template <typename T>
   auto* impl = page->Impl();
   CHECK(this->cuts_->cut_values_.DeviceCanRead());
 
-  auto ctx = Context{}.MakeCUDA(curt::CurrentDevice());
+  auto stream = ctx_->CUDACtx()->Stream();
 
   auto dispatch = [&] {
-    fi->Read(&ctx, page, this->param_.prefetch_copy || !this->has_hmm_ats_);
+    fi->Read(ctx_, page, this->param_.prefetch_copy || !this->has_hmm_ats_);
     impl->SetCuts(this->cuts_);
   };
 
   if (ConsoleLogger::GlobalVerbosity() == ConsoleLogger::LogVerbosity::kDebug) {
     curt::Event start{false}, stop{false};
     float milliseconds = 0;
-    start.Record(ctx.CUDACtx()->Stream());
+    start.Record(stream);
 
     dispatch();
 
-    stop.Record(ctx.CUDACtx()->Stream());
+    stop.Record(stream);
     stop.Sync();
     dh::safe_cuda(cudaEventElapsedTime(&milliseconds, start, stop));
     double n_bytes = page->Impl()->MemCostBytes();
@@ -128,7 +127,7 @@ template <typename T>
     dispatch();
   }
 
-  curt::DefaultStream().Sync();
+  stream.Sync();
 
   return true;
 }
@@ -137,8 +136,8 @@ template <typename T>
                                                       EllpackHostCacheStream* fo) const {
   xgboost_NVTX_FN_RANGE_C(3, 252, 198);
 
-  bool new_page = fo->Write(page);
-  curt::DefaultStream().Sync();
+  bool new_page = fo->Write(ctx_, page);
+  ctx_->CUDACtx()->Stream().Sync();
 
   if (new_page) {
     auto cache = fo->Share();
diff --git a/src/data/ellpack_page_raw_format.h b/src/data/ellpack_page_raw_format.h
index eda0e1d20978..74d44a8bd99d 100644
--- a/src/data/ellpack_page_raw_format.h
+++ b/src/data/ellpack_page_raw_format.h
@@ -29,14 +29,17 @@ class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
   BatchParam param_;
   // Supports CUDA HMM or ATS
   bool has_hmm_ats_{false};
+  Context const* ctx_;
 
  public:
-  explicit EllpackPageRawFormat(std::shared_ptr<common::HistogramCuts const> cuts, DeviceOrd device,
+  explicit EllpackPageRawFormat(Context const* ctx,
+                                std::shared_ptr<common::HistogramCuts const> cuts, DeviceOrd device,
                                 BatchParam param, bool has_hmm_ats)
       : cuts_{std::move(cuts)},
         device_{device},
         param_{std::move(param)},
-        has_hmm_ats_{has_hmm_ats} {}
+        has_hmm_ats_{has_hmm_ats},
+        ctx_{ctx} {}
   [[nodiscard]] bool Read(EllpackPage* page, common::AlignedResourceReadStream* fi) override;
   [[nodiscard]] std::size_t Write(EllpackPage const& page,
                                   common::AlignedFileWriteStream* fo) override;
diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu
index 5cddd94996da..cc0731021f39 100644
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -145,9 +145,8 @@ class EllpackHostCacheStreamImpl {
     ptr_ = k;
   }
 
-  [[nodiscard]] bool Write(EllpackPage const& page) {
+  [[nodiscard]] bool Write(Context const* ctx, EllpackPage const& page) {
     auto impl = page.Impl();
-    auto ctx = Context{}.MakeCUDA(dh::CurrentDevice());
 
     this->cache_->sizes_orig.push_back(page.Impl()->MemCostBytes());
     auto orig_ptr = this->cache_->sizes_orig.size() - 1;
@@ -184,7 +183,7 @@ class EllpackHostCacheStreamImpl {
     // Finish writing a (concatenated) cache page.
     auto commit_page = [&](EllpackPageImpl const* old_impl) {
       CHECK_EQ(old_impl->gidx_buffer.Resource()->Type(), common::ResourceHandler::kCudaMalloc);
-      auto new_impl = std::make_unique<EllpackPageImpl>();
+      auto new_impl = std::make_unique<EllpackPageImpl>(ctx);
       new_impl->CopyInfo(old_impl);
 
       // Split the cache into host cache, compressed host cache, and the device cache. We
@@ -219,10 +218,10 @@ class EllpackHostCacheStreamImpl {
       dc::CuMemParams c_out;
       std::size_t constexpr kChunkSize = 1ul << 21;
       auto params = dc::CompressSnappy(
-          &ctx, old_impl->gidx_buffer.ToSpan().subspan(n_h_bytes, n_comp_bytes), &tmp, kChunkSize);
+          ctx, old_impl->gidx_buffer.ToSpan().subspan(n_h_bytes, n_comp_bytes), &tmp, kChunkSize);
       common::RefResourceView<std::uint8_t> c_buf = dc::CoalesceCompressedBuffersToHost(
-          ctx.CUDACtx()->Stream(), this->cache_->pool, params, tmp, &c_out);
-      auto c_page = dc::MakeSnappyDecomprMgr(ctx.CUDACtx()->Stream(), this->cache_->pool,
+          ctx->CUDACtx()->Stream(), this->cache_->pool, params, tmp, &c_out);
+      auto c_page = dc::MakeSnappyDecomprMgr(ctx->CUDACtx()->Stream(), this->cache_->pool,
                                              std::move(c_out), c_buf.ToSpan());
       CHECK_EQ(c_page.DecompressedBytes() + new_impl->gidx_buffer.size_bytes(), n_bytes);
 
@@ -264,13 +263,13 @@ class EllpackHostCacheStreamImpl {
       // Push a new page
       auto n_bytes = this->cache_->buffer_bytes.at(this->cache_->h_pages.size());
       auto n_samples = this->cache_->buffer_rows.at(this->cache_->h_pages.size());
-      auto new_impl = std::make_unique<EllpackPageImpl>(&ctx, impl->CutsShared(), impl->IsDense(),
+      auto new_impl = std::make_unique<EllpackPageImpl>(ctx, impl->CutsShared(), impl->IsDense(),
                                                         impl->info.row_stride, n_samples);
       new_impl->SetBaseRowId(impl->base_rowid);
       new_impl->SetNumSymbols(impl->NumSymbols());
       new_impl->gidx_buffer =
-          common::MakeFixedVecWithCudaMalloc<common::CompressedByteT>(&ctx, n_bytes, 0);
-      auto offset = new_impl->Copy(&ctx, impl, 0);
+          common::MakeFixedVecWithCudaMalloc<common::CompressedByteT>(ctx, n_bytes, 0);
+      auto offset = new_impl->Copy(ctx, impl, 0);
 
       this->cache_->offsets.push_back(offset);
 
@@ -284,7 +283,7 @@ class EllpackHostCacheStreamImpl {
       CHECK(!this->cache_->h_pages.empty());
       CHECK_EQ(cache_idx, this->cache_->h_pages.size() - 1);
       auto& new_impl = this->cache_->h_pages.back();
-      auto offset = new_impl->Copy(&ctx, impl, this->cache_->offsets.back());
+      auto offset = new_impl->Copy(ctx, impl, this->cache_->offsets.back());
       this->cache_->offsets.back() += offset;
     }
 
@@ -382,8 +381,8 @@ void EllpackHostCacheStream::Read(Context const* ctx, EllpackPage* page, bool pr
   this->p_impl_->Read(ctx, page, prefetch_copy);
 }
 
-[[nodiscard]] bool EllpackHostCacheStream::Write(EllpackPage const& page) {
-  return this->p_impl_->Write(page);
+[[nodiscard]] bool EllpackHostCacheStream::Write(Context const* ctx, EllpackPage const& page) {
+  return this->p_impl_->Write(ctx, page);
 }
 
 /**
@@ -528,13 +527,12 @@ void EllpackPageSourceImpl<F>::Fetch() {
     // This is not read from cache so we still need it to be synced with sparse page source.
     CHECK_EQ(this->Iter(), this->source_->Iter());
     auto const& csr = this->source_->Page();
-    this->page_.reset(new EllpackPage{});
+    this->page_.reset(new EllpackPage{this->Ctx()});
     auto* impl = this->page_->Impl();
-    Context ctx = Context{}.MakeCUDA(this->Device().ordinal);
     if (this->GetCuts()->HasCategorical()) {
       CHECK(!this->feature_types_.empty());
     }
-    *impl = EllpackPageImpl{&ctx, this->GetCuts(), *csr, is_dense_, row_stride_, feature_types_};
+    *impl = EllpackPageImpl{this->Ctx(), this->GetCuts(), *csr, is_dense_, row_stride_, feature_types_};
     this->page_->SetBaseRowId(csr->base_rowid);
     LOG(INFO) << "Generated an Ellpack page with size: "
               << common::HumanMemUnit(impl->MemCostBytes())
@@ -573,7 +571,7 @@ void ExtEllpackPageSourceImpl<F>::Fetch() {
       bst_idx_t row_stride = GetRowCounts(this->ctx_, value, row_counts_span,
                                           dh::GetDevice(this->ctx_), this->missing_);
       CHECK_LE(row_stride, this->ext_info_.row_stride);
-      this->page_.reset(new EllpackPage{});
+      this->page_.reset(new EllpackPage{this->ctx_});
       *this->page_->Impl() = EllpackPageImpl{this->ctx_,
                                              value,
                                              this->missing_,
diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h
index 9a657be4d0d6..9613532006ae 100644
--- a/src/data/ellpack_page_source.h
+++ b/src/data/ellpack_page_source.h
@@ -164,7 +164,7 @@ class EllpackHostCacheStream {
    * @return Whether a new cache page is create. False if the new page is appended to the
    * previous one.
    */
-  [[nodiscard]] bool Write(EllpackPage const& page);
+  [[nodiscard]] bool Write(Context const* ctx, EllpackPage const& page);
 };
 
 namespace detail {
@@ -177,6 +177,7 @@ class EllpackFormatPolicy {
   std::shared_ptr<common::HistogramCuts const> cuts_{nullptr};
   DeviceOrd device_;
   bool has_hmm_{curt::SupportsPageableMem()};
+  Context const* ctx_{nullptr};
 
   EllpackCacheInfo cache_info_;
   static_assert(std::is_same_v<S, EllpackPage>);
@@ -214,11 +215,12 @@ class EllpackFormatPolicy {
 
   [[nodiscard]] auto CreatePageFormat(BatchParam const& param) const {
     CHECK_EQ(cuts_->cut_values_.Device(), device_);
-    std::unique_ptr<FormatT> fmt{new EllpackPageRawFormat{cuts_, device_, param, has_hmm_}};
+    std::unique_ptr<FormatT> fmt{new EllpackPageRawFormat{ctx_, cuts_, device_, param, has_hmm_}};
     return fmt;
   }
-  void SetCuts(std::shared_ptr<common::HistogramCuts const> cuts, DeviceOrd device,
-               EllpackCacheInfo cinfo) {
+  void SetCuts(Context const* ctx, std::shared_ptr<common::HistogramCuts const> cuts,
+               DeviceOrd device, EllpackCacheInfo cinfo) {
+    this->ctx_ = ctx;
     std::swap(this->cuts_, cuts);
     this->device_ = device;
     CHECK(this->device_.IsCUDA());
@@ -230,6 +232,8 @@ class EllpackFormatPolicy {
   }
   [[nodiscard]] auto Device() const { return this->device_; }
   [[nodiscard]] auto const& CacheInfo() { return this->cache_info_; }
+  [[nodiscard]] auto Ctx() const { return this->ctx_; }
+  [[nodiscard]] auto MakePage() const { return std::make_shared<S>(ctx_); }
 };
 
 template <typename S, template <typename> typename F>
@@ -311,7 +315,7 @@ class EllpackPageSourceImpl : public PageSourceIncMixIn<EllpackPage, F> {
         feature_types_{feature_types} {
     this->source_ = source;
     cuts->SetDevice(ctx->Device());
-    this->SetCuts(std::move(cuts), ctx->Device(), cinfo);
+    this->SetCuts(ctx, std::move(cuts), ctx->Device(), cinfo);
     this->Fetch();
   }
 
@@ -353,7 +357,7 @@ class ExtEllpackPageSourceImpl : public ExtQantileSourceMixin<EllpackPage, Forma
         info_{info},
         ext_info_{std::move(ext_info)} {
     cuts->SetDevice(ctx->Device());
-    this->SetCuts(std::move(cuts), ctx->Device(), cinfo);
+    this->SetCuts(ctx, std::move(cuts), ctx->Device(), cinfo);
     CHECK(!this->cache_info_->written);
     this->source_->Reset();
     CHECK(this->source_->Next());
diff --git a/src/data/gradient_index_page_source.h b/src/data/gradient_index_page_source.h
index e6c97540a1db..4b2342bbc7e9 100644
--- a/src/data/gradient_index_page_source.h
+++ b/src/data/gradient_index_page_source.h
@@ -37,6 +37,7 @@ class GHistIndexFormatPolicy {
   }
 
   void SetCuts(common::HistogramCuts cuts) { std::swap(cuts_, cuts); }
+  [[nodiscard]] auto MakePage() const { return std::make_shared<GHistIndexMatrix>(); }
 };
 
 class GradientIndexPageSource
diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc
index 47caf0e72664..e5e46d872709 100644
--- a/src/data/iterative_dmatrix.cc
+++ b/src/data/iterative_dmatrix.cc
@@ -212,7 +212,7 @@ void IterativeDMatrix::Save(common::AlignedFileWriteStream*) const {
   LOG(FATAL) << "Not implemented";
 }
 
-IterativeDMatrix* IterativeDMatrix::Load(common::AlignedResourceReadStream*) {
+IterativeDMatrix* IterativeDMatrix::Load(Context const*, common::AlignedResourceReadStream*) {
   LOG(FATAL) << "Not implemented";
   return nullptr;
 }
diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu
index b8e5eaaba705..3e006e53c744 100644
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -44,7 +44,7 @@ void IterativeDMatrix::InitFromCUDA(
     if (!ellpack_) {
       // Should be put inside the while loop to protect against empty batch.  In
       // that case device id is invalid.
-      ellpack_.reset(new EllpackPage);
+      ellpack_.reset(new EllpackPage{&fmat_ctx_});
       *(ellpack_->Impl()) = EllpackPageImpl(&fmat_ctx_, cuts, this->IsDense(), ext_info.row_stride,
                                             ext_info.accumulated_rows);
     }
@@ -107,7 +107,7 @@ BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx,
   }
 
   if (!ellpack_) {
-    ellpack_.reset(new EllpackPage());
+    ellpack_.reset(new EllpackPage{&fmat_ctx_});
     if (ctx->IsCUDA()) {
       this->Info().feature_types.SetDevice(ctx->Device());
       *ellpack_->Impl() =
@@ -137,20 +137,20 @@ void IterativeDMatrix::Save(common::AlignedFileWriteStream* fo) const {
   auto const& p_cuts = this->ellpack_->Impl()->CutsShared();
   p_cuts->Save(fo);
   // Save ellpack
-  auto fmt =
-      std::make_unique<EllpackPageRawFormat>(p_cuts, this->Ctx()->Device(), BatchParam{}, false);
+  auto fmt = std::make_unique<EllpackPageRawFormat>(this->Ctx(), p_cuts, this->Ctx()->Device(),
+                                                    BatchParam{}, false);
   auto n_bytes = fmt->Write(*this->ellpack_, fo);
   CHECK_GE(n_bytes, this->ellpack_->Impl()->MemCostBytes());
 }
 
-IterativeDMatrix* IterativeDMatrix::Load(common::AlignedResourceReadStream* fi) {
+IterativeDMatrix* IterativeDMatrix::Load(Context const* ctx,
+                                         common::AlignedResourceReadStream* fi) {
   CHECK(fi);
   // Load cuts
   std::shared_ptr<common::HistogramCuts> p_cuts{common::HistogramCuts::Load(fi)};
   // Load ellpack
-  auto fmt = std::make_unique<EllpackPageRawFormat>(p_cuts, DeviceOrd::CUDA(dh::CurrentDevice()),
-                                                    BatchParam{}, false);
-  auto ellpack = std::make_shared<EllpackPage>();
+  auto fmt = std::make_unique<EllpackPageRawFormat>(ctx, p_cuts, ctx->Device(), BatchParam{}, false);
+  auto ellpack = std::make_shared<EllpackPage>(ctx);
   CHECK(fmt->Read(ellpack.get(), fi));
   return new IterativeDMatrix{std::move(ellpack)};
 }
diff --git a/src/data/iterative_dmatrix.h b/src/data/iterative_dmatrix.h
index ce4b5b49e725..c05ffc5a0c88 100644
--- a/src/data/iterative_dmatrix.h
+++ b/src/data/iterative_dmatrix.h
@@ -65,7 +65,8 @@ class IterativeDMatrix : public QuantileDMatrix {
   BatchSet<ExtSparsePage> GetExtBatches(Context const *ctx, BatchParam const &param) override;
 
   void Save(common::AlignedFileWriteStream *fo) const;
-  [[nodiscard]] static IterativeDMatrix *Load(common::AlignedResourceReadStream *fi);
+  [[nodiscard]] static IterativeDMatrix *Load(Context const *ctx,
+                                               common::AlignedResourceReadStream *fi);
 };
 }  // namespace data
 }  // namespace xgboost
diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h
index 14f65d1119eb..6e9b76b5c381 100644
--- a/src/data/sparse_page_source.h
+++ b/src/data/sparse_page_source.h
@@ -235,6 +235,7 @@ class DefaultFormatPolicy {
     std::unique_ptr<FormatT> fmt{::xgboost::data::CreatePageFormat<S>("raw")};
     return fmt;
   }
+  [[nodiscard]] auto MakePage() const { return std::make_shared<S>(); }
 };
 
 /**
@@ -317,7 +318,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S>, public FormatStreamPol
       }
       auto p = this->param_;
       ring_->at(fetch_it) = this->workers_.Submit([fetch_it, self, p, this] {
-        auto page = std::make_shared<S>();
+        auto page = self->MakePage();
         this->exce_.Run([&] {
           std::unique_ptr<typename FormatStreamPolicy::FormatT> fmt{self->CreatePageFormat(p)};
           auto name = self->cache_info_->ShardName();
diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu
index 9fa7ab83f966..30a18db523c4 100644
--- a/tests/cpp/data/test_ellpack_page_raw_format.cu
+++ b/tests/cpp/data/test_ellpack_page_raw_format.cu
@@ -56,7 +56,7 @@ class TestEllpackPageRawFormat : public ::testing::TestWithParam<bool> {
 
     auto row_stride = GetRowStride(m.get());
     EllpackCacheInfo cinfo = CInfoForTest(&ctx, m.get(), row_stride, param, cuts);
-    policy.SetCuts(cuts, ctx.Device(), cinfo);
+    policy.SetCuts(&ctx, cuts, ctx.Device(), cinfo);
 
     std::unique_ptr<EllpackPageRawFormat> format{policy.CreatePageFormat(param)};
 
@@ -68,7 +68,7 @@ class TestEllpackPageRawFormat : public ::testing::TestWithParam<bool> {
       }
     }
 
-    EllpackPage page;
+    EllpackPage page{&ctx};
     auto fi = policy.CreateReader(StringView{path}, static_cast<bst_idx_t>(0), n_bytes);
     ASSERT_TRUE(format->Read(&page, fi.get()));
 
@@ -130,7 +130,7 @@ TEST_P(TestEllpackPageRawFormat, HostIO) {
             cinfo.buffer_bytes.push_back(page.Impl()->MemCostBytes());
             cinfo.buffer_rows.push_back(page.Impl()->n_rows);
           }
-          policy.SetCuts(page.Impl()->CutsShared(), ctx.Device(), std::move(cinfo));
+          policy.SetCuts(&ctx, page.Impl()->CutsShared(), ctx.Device(), std::move(cinfo));
           format = policy.CreatePageFormat(param);
         }
         auto writer = policy.CreateWriter({}, i);
@@ -143,7 +143,7 @@ TEST_P(TestEllpackPageRawFormat, HostIO) {
 
     for (std::size_t i = 0; i < 3; ++i) {
       auto reader = policy.CreateReader({}, cache.offset[i], cache.Bytes(i));
-      EllpackPage page;
+      EllpackPage page{&ctx};
       ASSERT_TRUE(format->Read(&page, reader.get()));
       ASSERT_EQ(page.Impl()->MemCostBytes(), cache.Bytes(i));
       auto p_fmat = RandomDataGenerator{100, 14, 0.5}.Seed(i).GenerateDMatrix();
@@ -202,7 +202,7 @@ TEST(EllpackPageRawFormat, DevicePageConcat) {
       } else {
         EXPECT_EQ(cinfo.buffer_rows.size(), 4ul);
       }
-      policy.SetCuts(page.Impl()->CutsShared(), ctx.Device(), std::move(cinfo));
+      policy.SetCuts(&ctx, page.Impl()->CutsShared(), ctx.Device(), std::move(cinfo));
     }
 
     auto format = policy.CreatePageFormat(param);
diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu
index 362c789e25e5..a421ffecb5d0 100644
--- a/tests/cpp/data/test_iterative_dmatrix.cu
+++ b/tests/cpp/data/test_iterative_dmatrix.cu
@@ -204,7 +204,7 @@ TEST(IterativeDeviceDMatrix, IO) {
   }
   auto fsize = std::filesystem::file_size(path);
   auto fi = std::make_unique<common::MemBufFileReadStream>(path.string(), 0ul, fsize);
-  auto loaded = std::shared_ptr<IterativeDMatrix>(IterativeDMatrix::Load(fi.get()));
+  auto loaded = std::shared_ptr<IterativeDMatrix>(IterativeDMatrix::Load(&ctx, fi.get()));
   for (auto const& orig_page : qdm->GetBatches<EllpackPage>(&ctx, {})) {
     for (auto const& new_page : loaded->GetBatches<EllpackPage>(&ctx, {})) {
       std::vector<common::CompressedByteT> h_orig, h_new;

From 7e8a6ccebaa22f45745ae842b6b434c31ee2ce27 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 2 Apr 2026 02:19:07 +0800
Subject: [PATCH 03/11] lint.

---
 src/data/ellpack_page_source.cu | 3 ++-
 src/data/iterative_dmatrix.cu   | 3 ++-
 src/data/iterative_dmatrix.h    | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu
index cc0731021f39..82e1bed58d4d 100644
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -532,7 +532,8 @@ void EllpackPageSourceImpl<F>::Fetch() {
     if (this->GetCuts()->HasCategorical()) {
       CHECK(!this->feature_types_.empty());
     }
-    *impl = EllpackPageImpl{this->Ctx(), this->GetCuts(), *csr, is_dense_, row_stride_, feature_types_};
+    *impl =
+        EllpackPageImpl{this->Ctx(), this->GetCuts(), *csr, is_dense_, row_stride_, feature_types_};
     this->page_->SetBaseRowId(csr->base_rowid);
     LOG(INFO) << "Generated an Ellpack page with size: "
               << common::HumanMemUnit(impl->MemCostBytes())
diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu
index 3e006e53c744..f37d4412f7e1 100644
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -149,7 +149,8 @@ IterativeDMatrix* IterativeDMatrix::Load(Context const* ctx,
   // Load cuts
   std::shared_ptr<common::HistogramCuts> p_cuts{common::HistogramCuts::Load(fi)};
   // Load ellpack
-  auto fmt = std::make_unique<EllpackPageRawFormat>(ctx, p_cuts, ctx->Device(), BatchParam{}, false);
+  auto fmt =
+      std::make_unique<EllpackPageRawFormat>(ctx, p_cuts, ctx->Device(), BatchParam{}, false);
   auto ellpack = std::make_shared<EllpackPage>(ctx);
   CHECK(fmt->Read(ellpack.get(), fi));
   return new IterativeDMatrix{std::move(ellpack)};
diff --git a/src/data/iterative_dmatrix.h b/src/data/iterative_dmatrix.h
index c05ffc5a0c88..97b087ffc114 100644
--- a/src/data/iterative_dmatrix.h
+++ b/src/data/iterative_dmatrix.h
@@ -66,7 +66,7 @@ class IterativeDMatrix : public QuantileDMatrix {
 
   void Save(common::AlignedFileWriteStream *fo) const;
   [[nodiscard]] static IterativeDMatrix *Load(Context const *ctx,
-                                               common::AlignedResourceReadStream *fi);
+                                              common::AlignedResourceReadStream *fi);
 };
 }  // namespace data
 }  // namespace xgboost

From 115dea777d4b416b77922ace506a4ca90c1f5e7e Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 2 Apr 2026 02:21:49 +0800
Subject: [PATCH 04/11] cpu build.

---
 src/data/ellpack_page.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/data/ellpack_page.cc b/src/data/ellpack_page.cc
index 77c2798d76c4..958ad70f1ac8 100644
--- a/src/data/ellpack_page.cc
+++ b/src/data/ellpack_page.cc
@@ -20,7 +20,7 @@ class EllpackPageImpl {
   [[nodiscard]] std::shared_ptr<common::HistogramCuts const> CutsShared() const { return cuts_; }
 };
 
-EllpackPage::EllpackPage(Context const*) = default;
+EllpackPage::EllpackPage(Context const*) {}
 
 EllpackPage::EllpackPage(Context const*, DMatrix*, const BatchParam&) {
   LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "

From 5fb7aeb3ebe11a7c3af1cfef68624480ac1dbca9 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 2 Apr 2026 02:47:09 +0800
Subject: [PATCH 05/11] Fix.

---
 src/data/iterative_dmatrix.cu | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu
index f37d4412f7e1..0d02f5cb66b4 100644
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -110,20 +110,15 @@ BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx,
     ellpack_.reset(new EllpackPage{&fmat_ctx_});
     if (ctx->IsCUDA()) {
       this->Info().feature_types.SetDevice(ctx->Device());
-      *ellpack_->Impl() =
-          EllpackPageImpl(ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
     } else if (fmat_ctx_.IsCUDA()) {
       this->Info().feature_types.SetDevice(fmat_ctx_.Device());
-      *ellpack_->Impl() =
-          EllpackPageImpl(&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
     } else {
       // Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM
       // for cut reference.
-      auto cuda_ctx = ctx->MakeCUDA();
-      this->Info().feature_types.SetDevice(cuda_ctx.Device());
-      *ellpack_->Impl() =
-          EllpackPageImpl(&cuda_ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
+      this->Info().feature_types.SetDevice(ctx->MakeCUDA().Device());
     }
+    *ellpack_->Impl() =
+        EllpackPageImpl{&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()};
   }
   CHECK(ellpack_);
   auto begin_iter = BatchIterator<EllpackPage>(new SimpleBatchIteratorImpl<EllpackPage>(ellpack_));

From 54dd6a7861d045aac967da94b4918c07b04dd5ee Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 2 Apr 2026 04:01:06 +0800
Subject: [PATCH 06/11] Fix

---
 src/data/simple_dmatrix.cc | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc
index a249c99ac515..a18ec4a59ea6 100644
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -156,15 +156,12 @@ BatchSet<EllpackPage> SimpleDMatrix::GetEllpackBatches(Context const* ctx,
     if (ctx->IsCUDA()) {
       // The context passed in is on GPU, we pick it first since we prioritize the context
       // in Booster.
-      ellpack_page_.reset(new EllpackPage(ctx, this, param));
-    } else if (fmat_ctx_.IsCUDA()) {
-      // DMatrix was initialized on GPU, we use the context from initialization.
-      ellpack_page_.reset(new EllpackPage(&fmat_ctx_, this, param));
-    } else {
+      fmat_ctx_ = *ctx;
+    } else if (!fmat_ctx_.IsCUDA()) {
       // Mismatched parameter, user set a new max_bin during training.
-      auto cuda_ctx = ctx->MakeCUDA();
-      ellpack_page_.reset(new EllpackPage(&cuda_ctx, this, param));
+      fmat_ctx_ = ctx->MakeCUDA();
     }
+    ellpack_page_.reset(new EllpackPage(&fmat_ctx_, this, param));
 
     batch_param_ = param.MakeCache();
   }

From cd03fbca8e768e1940bcfec3eee862b108dfbe28 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 2 Apr 2026 05:14:25 +0800
Subject: [PATCH 07/11] Fix.

---
 src/data/iterative_dmatrix.cu | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu
index 0d02f5cb66b4..f2fce5c77d89 100644
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -107,16 +107,13 @@ BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx,
   }
 
   if (!ellpack_) {
-    ellpack_.reset(new EllpackPage{&fmat_ctx_});
     if (ctx->IsCUDA()) {
-      this->Info().feature_types.SetDevice(ctx->Device());
-    } else if (fmat_ctx_.IsCUDA()) {
-      this->Info().feature_types.SetDevice(fmat_ctx_.Device());
-    } else {
-      // Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM
-      // for cut reference.
-      this->Info().feature_types.SetDevice(ctx->MakeCUDA().Device());
+      fmat_ctx_ = *ctx;
+    } else if (!fmat_ctx_.IsCUDA()) {
+      fmat_ctx_ = ctx->MakeCUDA();
     }
+    this->Info().feature_types.SetDevice(fmat_ctx_.Device());
+    ellpack_.reset(new EllpackPage{&fmat_ctx_});
     *ellpack_->Impl() =
         EllpackPageImpl{&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()};
   }

From 1242879226bfd1b5e5226a3614a50528f453a8c8 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 2 Apr 2026 17:28:00 +0800
Subject: [PATCH 08/11] Use explicit stream sync.

---
 src/data/ellpack_page.cu                      | 20 +++--------------
 src/data/ellpack_page.cuh                     |  3 +--
 src/data/ellpack_page.h                       |  2 +-
 src/data/ellpack_page_source.cu               | 22 ++++++++++++++++---
 src/data/ellpack_page_source.h                |  7 +++++-
 src/data/gradient_index_page_source.h         |  2 +-
 src/data/iterative_dmatrix.cu                 |  6 ++---
 src/data/sparse_page_source.h                 | 10 ++++-----
 .../cpp/data/test_ellpack_page_raw_format.cu  |  4 ++--
 9 files changed, 41 insertions(+), 35 deletions(-)

diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index 643db6ed6084..d3843e98de7f 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -30,9 +30,9 @@
 #include "xgboost/data.h"                   // for DMatrix
 
 namespace xgboost {
-EllpackPage::EllpackPage(Context const* ctx) : impl_{new EllpackPageImpl{ctx}} {}
+EllpackPage::EllpackPage() : impl_{new EllpackPageImpl{}} {}
 
-EllpackPageImpl::EllpackPageImpl(Context const* ctx) : ctx_{ctx} {}
+EllpackPageImpl::EllpackPageImpl() = default;
 
 EllpackPage::EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param)
     : impl_{new EllpackPageImpl{ctx, dmat, param}} {}
@@ -189,7 +189,6 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx,
     : is_dense{is_dense},
       n_rows{n_rows},
       cuts_{std::move(cuts)},
-      ctx_{ctx},
       info{CalcNumSymbols(ctx, row_stride, is_dense, this->cuts_)} {
   monitor_.Init("ellpack_page");
   curt::SetDevice(ctx->Ordinal());
@@ -205,7 +204,6 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx,
     : is_dense{is_dense},
       n_rows{page.Size()},
       cuts_{std::move(cuts)},
-      ctx_{ctx},
       info{CalcNumSymbols(ctx, row_stride, is_dense, this->cuts_)} {
   monitor_.Init("ellpack_page");
   curt::SetDevice(ctx->Ordinal());
@@ -225,7 +223,6 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const Batc
                       common::DeviceSketch(ctx, p_fmat, param.max_bin))
                 : std::make_shared<common::HistogramCuts>(
                       common::DeviceSketchWithHessian(ctx, p_fmat, param.max_bin, param.hess))},
-      ctx_{ctx},
       info{CalcNumSymbols(ctx, GetRowStride(p_fmat), p_fmat->IsDense(), this->cuts_)} {
   monitor_.Init("ellpack_page");
   curt::SetDevice(ctx->Ordinal());
@@ -470,7 +467,6 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
         cuts->SetDevice(ctx->Device());
         return cuts;
       }()},
-      ctx_{ctx},
       info{CalcNumSymbols(
           ctx,
           [&] {
@@ -506,17 +502,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
   this->monitor_.Stop("CopyGHistToEllpack");
 }
 
-EllpackPageImpl::~EllpackPageImpl() noexcept(false) {
-  // Sync the stream to make sure all running CUDA kernels finish before deallocation.
-  auto status = ctx_->CUDACtx()->Stream().Sync(false);
-  if (status != cudaSuccess) {
-    auto str = cudaGetErrorString(status);
-    // For external-memory, throwing here can trigger a series of calls to
-    // `std::terminate` by various destructors. For now, we just log the error.
-    LOG(WARNING) << "Ran into CUDA error:" << str << "\nXGBoost is likely to abort.";
-  }
-  dh::safe_cuda(status);
-}
+EllpackPageImpl::~EllpackPageImpl() noexcept(false) = default;
 
 // A functor that copies the data from one EllpackPage to another.
 template <typename IterT>
diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh
index fabe2db74a24..13ef7b877e56 100644
--- a/src/data/ellpack_page.cuh
+++ b/src/data/ellpack_page.cuh
@@ -186,7 +186,7 @@ class EllpackPageImpl {
    * This is used in the external memory case. An empty ELLPACK page is constructed with its content
    * set later by the reader.
    */
-  explicit EllpackPageImpl(Context const* ctx);
+  EllpackPageImpl();
 
   /**
    * @brief Constructor from existing ellpack matrics.
@@ -357,7 +357,6 @@ class EllpackPageImpl {
   void InitCompressedData(Context const* ctx);
 
   std::shared_ptr<common::HistogramCuts const> cuts_;
-  Context const* ctx_{nullptr};
 
  public:
   bool is_dense{false};
diff --git a/src/data/ellpack_page.h b/src/data/ellpack_page.h
index dcaf02fbb672..474f6f955df1 100644
--- a/src/data/ellpack_page.h
+++ b/src/data/ellpack_page.h
@@ -28,7 +28,7 @@ class EllpackPage {
    * This is used in the external memory case. An empty ELLPACK page is constructed with its content
    * set later by the reader.
    */
-  explicit EllpackPage(Context const* ctx);
+  EllpackPage();
   /**
    * @brief Constructor from an existing DMatrix.
    *
diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu
index 82e1bed58d4d..16512d702b62 100644
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -183,7 +183,7 @@ class EllpackHostCacheStreamImpl {
     // Finish writing a (concatenated) cache page.
     auto commit_page = [&](EllpackPageImpl const* old_impl) {
       CHECK_EQ(old_impl->gidx_buffer.Resource()->Type(), common::ResourceHandler::kCudaMalloc);
-      auto new_impl = std::make_unique<EllpackPageImpl>(ctx);
+      auto new_impl = std::make_unique<EllpackPageImpl>();
       new_impl->CopyInfo(old_impl);
 
       // Split the cache into host cache, compressed host cache, and the device cache. We
@@ -385,6 +385,20 @@ void EllpackHostCacheStream::Read(Context const* ctx, EllpackPage* page, bool pr
   return this->p_impl_->Write(ctx, page);
 }
 
+/**
+ * EllpackFormatPolicy
+ */
+template <typename S>
+void EllpackFormatPolicy<S>::DestroyPage(std::shared_ptr<S>& page) const {
+  if (page && ctx_) {
+    ctx_->CUDACtx()->Stream().Sync();
+  }
+  page.reset();
+}
+
+template void EllpackFormatPolicy<EllpackPage>::DestroyPage(
+    std::shared_ptr<EllpackPage>& page) const;
+
 /**
  * EllpackCacheStreamPolicy
  */
@@ -527,7 +541,8 @@ void EllpackPageSourceImpl<F>::Fetch() {
     // This is not read from cache so we still need it to be synced with sparse page source.
     CHECK_EQ(this->Iter(), this->source_->Iter());
     auto const& csr = this->source_->Page();
-    this->page_.reset(new EllpackPage{this->Ctx()});
+    this->DestroyPage(this->page_);
+    this->page_.reset(new EllpackPage{});
     auto* impl = this->page_->Impl();
     if (this->GetCuts()->HasCategorical()) {
       CHECK(!this->feature_types_.empty());
@@ -572,7 +587,8 @@ void ExtEllpackPageSourceImpl<F>::Fetch() {
       bst_idx_t row_stride = GetRowCounts(this->ctx_, value, row_counts_span,
                                           dh::GetDevice(this->ctx_), this->missing_);
       CHECK_LE(row_stride, this->ext_info_.row_stride);
-      this->page_.reset(new EllpackPage{this->ctx_});
+      this->DestroyPage(this->page_);
+      this->page_.reset(new EllpackPage{});
       *this->page_->Impl() = EllpackPageImpl{this->ctx_,
                                              value,
                                              this->missing_,
diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h
index 9613532006ae..a5c7c7112179 100644
--- a/src/data/ellpack_page_source.h
+++ b/src/data/ellpack_page_source.h
@@ -233,7 +233,7 @@ class EllpackFormatPolicy {
   [[nodiscard]] auto Device() const { return this->device_; }
   [[nodiscard]] auto const& CacheInfo() { return this->cache_info_; }
   [[nodiscard]] auto Ctx() const { return this->ctx_; }
-  [[nodiscard]] auto MakePage() const { return std::make_shared<S>(ctx_); }
+  void DestroyPage(std::shared_ptr<S>& page) const;
 };
 
 template <typename S, template <typename> typename F>
@@ -387,6 +387,11 @@ using ExtEllpackPageSource =
     ExtEllpackPageSourceImpl<EllpackMmapStreamPolicy<EllpackPage, EllpackFormatPolicy>>;
 
 #if !defined(XGBOOST_USE_CUDA)
+template <typename S>
+inline void EllpackFormatPolicy<S>::DestroyPage(std::shared_ptr<S>& page) const {
+  page.reset();
+}
+
 template <typename F>
 inline void EllpackPageSourceImpl<F>::Fetch() {
   // silent the warning about unused variables.
diff --git a/src/data/gradient_index_page_source.h b/src/data/gradient_index_page_source.h
index 4b2342bbc7e9..2909169f4f6a 100644
--- a/src/data/gradient_index_page_source.h
+++ b/src/data/gradient_index_page_source.h
@@ -37,7 +37,7 @@ class GHistIndexFormatPolicy {
   }
 
   void SetCuts(common::HistogramCuts cuts) { std::swap(cuts_, cuts); }
-  [[nodiscard]] auto MakePage() const { return std::make_shared<GHistIndexMatrix>(); }
+  static void DestroyPage(std::shared_ptr<S>& page) { page.reset(); }
 };
 
 class GradientIndexPageSource
diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu
index f2fce5c77d89..37aefc12df68 100644
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -44,7 +44,7 @@ void IterativeDMatrix::InitFromCUDA(
     if (!ellpack_) {
       // Should be put inside the while loop to protect against empty batch.  In
       // that case device id is invalid.
-      ellpack_.reset(new EllpackPage{&fmat_ctx_});
+      ellpack_.reset(new EllpackPage{});
       *(ellpack_->Impl()) = EllpackPageImpl(&fmat_ctx_, cuts, this->IsDense(), ext_info.row_stride,
                                             ext_info.accumulated_rows);
     }
@@ -113,7 +113,7 @@ BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx,
       fmat_ctx_ = ctx->MakeCUDA();
     }
     this->Info().feature_types.SetDevice(fmat_ctx_.Device());
-    ellpack_.reset(new EllpackPage{&fmat_ctx_});
+    ellpack_.reset(new EllpackPage{});
     *ellpack_->Impl() =
         EllpackPageImpl{&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()};
   }
@@ -143,7 +143,7 @@ IterativeDMatrix* IterativeDMatrix::Load(Context const* ctx,
   // Load ellpack
   auto fmt =
       std::make_unique<EllpackPageRawFormat>(ctx, p_cuts, ctx->Device(), BatchParam{}, false);
-  auto ellpack = std::make_shared<EllpackPage>(ctx);
+  auto ellpack = std::make_shared<EllpackPage>();
   CHECK(fmt->Read(ellpack.get(), fi));
   return new IterativeDMatrix{std::move(ellpack)};
 }
diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h
index 6e9b76b5c381..cf04f007d164 100644
--- a/src/data/sparse_page_source.h
+++ b/src/data/sparse_page_source.h
@@ -235,7 +235,7 @@ class DefaultFormatPolicy {
     std::unique_ptr<FormatT> fmt{::xgboost::data::CreatePageFormat<S>("raw")};
     return fmt;
   }
-  [[nodiscard]] auto MakePage() const { return std::make_shared<S>(); }
+  static void DestroyPage(std::shared_ptr<S>& page) { page.reset(); }
 };
 
 /**
@@ -300,9 +300,9 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S>, public FormatStreamPol
 
     exce_.Rethrow();
     // Clear out the existing page before loading new ones. This helps reduce memory usage
-    // when page is not loaded with mmap, in addition, it triggers necessary CUDA
-    // synchronizations by freeing memory.
-    page_.reset();
+    // when page is not loaded with mmap. The destruction policy handles any necessary
+    // synchronizations (e.g., CUDA stream sync for Ellpack pages).
+    this->DestroyPage(page_);
 
     for (std::int32_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
       bool restart = fetch_it == n_batches;
@@ -318,7 +318,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S>, public FormatStreamPol
       }
       auto p = this->param_;
       ring_->at(fetch_it) = this->workers_.Submit([fetch_it, self, p, this] {
-        auto page = self->MakePage();
+        auto page = std::make_shared<S>();
         this->exce_.Run([&] {
           std::unique_ptr<typename FormatStreamPolicy::FormatT> fmt{self->CreatePageFormat(p)};
           auto name = self->cache_info_->ShardName();
diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu
index 30a18db523c4..616d8fc6cb44 100644
--- a/tests/cpp/data/test_ellpack_page_raw_format.cu
+++ b/tests/cpp/data/test_ellpack_page_raw_format.cu
@@ -68,7 +68,7 @@ class TestEllpackPageRawFormat : public ::testing::TestWithParam<bool> {
       }
     }
 
-    EllpackPage page{&ctx};
+    EllpackPage page;
     auto fi = policy.CreateReader(StringView{path}, static_cast<bst_idx_t>(0), n_bytes);
     ASSERT_TRUE(format->Read(&page, fi.get()));
 
@@ -143,7 +143,7 @@ TEST_P(TestEllpackPageRawFormat, HostIO) {
 
     for (std::size_t i = 0; i < 3; ++i) {
       auto reader = policy.CreateReader({}, cache.offset[i], cache.Bytes(i));
-      EllpackPage page{&ctx};
+      EllpackPage page;
       ASSERT_TRUE(format->Read(&page, reader.get()));
       ASSERT_EQ(page.Impl()->MemCostBytes(), cache.Bytes(i));
       auto p_fmat = RandomDataGenerator{100, 14, 0.5}.Seed(i).GenerateDMatrix();

From 65e74866a6357e14c9d399fd2c77ab93567ce4cc Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Fri, 3 Apr 2026 19:56:58 +0800
Subject: [PATCH 09/11] CPU build.

---
 src/data/ellpack_page.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/data/ellpack_page.cc b/src/data/ellpack_page.cc
index 958ad70f1ac8..69fd46b176d1 100644
--- a/src/data/ellpack_page.cc
+++ b/src/data/ellpack_page.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2019-2024, XGBoost contributors
+ * Copyright 2019-2026, XGBoost contributors
  */
 #ifndef XGBOOST_USE_CUDA
 
@@ -20,7 +20,7 @@ class EllpackPageImpl {
   [[nodiscard]] std::shared_ptr<common::HistogramCuts const> CutsShared() const { return cuts_; }
 };
 
-EllpackPage::EllpackPage(Context const*) {}
+EllpackPage::EllpackPage() = default;
 
 EllpackPage::EllpackPage(Context const*, DMatrix*, const BatchParam&) {
   LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "

From 101add01100efbaca3559d094e31111dfb43703e Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Fri, 3 Apr 2026 20:02:57 +0800
Subject: [PATCH 10/11] lint.

---
 src/data/ellpack_page_source.cu       | 12 ++++++------
 src/data/ellpack_page_source.h        |  8 ++++----
 src/data/gradient_index_page_source.h |  2 +-
 src/data/sparse_page_source.h         |  4 ++--
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu
index 16512d702b62..7f3dc5c8c9dc 100644
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2019-2025, XGBoost contributors
+ * Copyright 2019-2026, XGBoost contributors
  */
 #include <algorithm>  // for max
 #include <cstddef>    // for size_t
@@ -389,15 +389,15 @@ void EllpackHostCacheStream::Read(Context const* ctx, EllpackPage* page, bool pr
  * EllpackFormatPolicy
  */
 template <typename S>
-void EllpackFormatPolicy<S>::DestroyPage(std::shared_ptr<S>& page) const {
+void EllpackFormatPolicy<S>::DestroyPage(std::shared_ptr<S>* page) const {
   if (page && ctx_) {
     ctx_->CUDACtx()->Stream().Sync();
   }
-  page.reset();
+  page->reset();
 }
 
 template void EllpackFormatPolicy<EllpackPage>::DestroyPage(
-    std::shared_ptr<EllpackPage>& page) const;
+    std::shared_ptr<EllpackPage>* page) const;
 
 /**
  * EllpackCacheStreamPolicy
@@ -541,7 +541,7 @@ void EllpackPageSourceImpl<F>::Fetch() {
     // This is not read from cache so we still need it to be synced with sparse page source.
     CHECK_EQ(this->Iter(), this->source_->Iter());
     auto const& csr = this->source_->Page();
-    this->DestroyPage(this->page_);
+    this->DestroyPage(&this->page_);
     this->page_.reset(new EllpackPage{});
     auto* impl = this->page_->Impl();
     if (this->GetCuts()->HasCategorical()) {
@@ -587,7 +587,7 @@ void ExtEllpackPageSourceImpl<F>::Fetch() {
       bst_idx_t row_stride = GetRowCounts(this->ctx_, value, row_counts_span,
                                           dh::GetDevice(this->ctx_), this->missing_);
       CHECK_LE(row_stride, this->ext_info_.row_stride);
-      this->DestroyPage(this->page_);
+      this->DestroyPage(&this->page_);
       this->page_.reset(new EllpackPage{});
       *this->page_->Impl() = EllpackPageImpl{this->ctx_,
                                              value,
diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h
index a5c7c7112179..0676a41d1b77 100644
--- a/src/data/ellpack_page_source.h
+++ b/src/data/ellpack_page_source.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2019-2025, XGBoost Contributors
+ * Copyright 2019-2026, XGBoost Contributors
  */
 
 #ifndef XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_
@@ -233,7 +233,7 @@ class EllpackFormatPolicy {
   [[nodiscard]] auto Device() const { return this->device_; }
   [[nodiscard]] auto const& CacheInfo() { return this->cache_info_; }
   [[nodiscard]] auto Ctx() const { return this->ctx_; }
-  void DestroyPage(std::shared_ptr<S>& page) const;
+  void DestroyPage(std::shared_ptr<S>* page) const;
 };
 
 template <typename S, template <typename> typename F>
@@ -388,8 +388,8 @@ using ExtEllpackPageSource =
 
 #if !defined(XGBOOST_USE_CUDA)
 template <typename S>
-inline void EllpackFormatPolicy<S>::DestroyPage(std::shared_ptr<S>& page) const {
-  page.reset();
+inline void EllpackFormatPolicy<S>::DestroyPage(std::shared_ptr<S>* page) const {
+  page->reset();
 }
 
 template <typename F>
diff --git a/src/data/gradient_index_page_source.h b/src/data/gradient_index_page_source.h
index 2909169f4f6a..19a5b19c19a4 100644
--- a/src/data/gradient_index_page_source.h
+++ b/src/data/gradient_index_page_source.h
@@ -37,7 +37,7 @@ class GHistIndexFormatPolicy {
   }
 
   void SetCuts(common::HistogramCuts cuts) { std::swap(cuts_, cuts); }
-  static void DestroyPage(std::shared_ptr<S>& page) { page.reset(); }
+  static void DestroyPage(std::shared_ptr<S>* page) { page->reset(); }
 };
 
 class GradientIndexPageSource
diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h
index cf04f007d164..7a5528df78df 100644
--- a/src/data/sparse_page_source.h
+++ b/src/data/sparse_page_source.h
@@ -235,7 +235,7 @@ class DefaultFormatPolicy {
     std::unique_ptr<FormatT> fmt{::xgboost::data::CreatePageFormat<S>("raw")};
     return fmt;
   }
-  static void DestroyPage(std::shared_ptr<S>& page) { page.reset(); }
+  static void DestroyPage(std::shared_ptr<S>* page) { page->reset(); }
 };
 
 /**
@@ -302,7 +302,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S>, public FormatStreamPol
     // Clear out the existing page before loading new ones. This helps reduce memory usage
     // when page is not loaded with mmap. The destruction policy handles any necessary
     // synchronizations (e.g., CUDA stream sync for Ellpack pages).
-    this->DestroyPage(page_);
+    this->DestroyPage(&page_);
 
     for (std::int32_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
       bool restart = fetch_it == n_batches;

From 63370db3bf03d71c2125859571b67b3346217ed0 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Fri, 3 Apr 2026 20:05:22 +0800
Subject: [PATCH 11/11] lint.

---
 src/data/gradient_index_page_source.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/data/gradient_index_page_source.h b/src/data/gradient_index_page_source.h
index 19a5b19c19a4..a223ef0828ce 100644
--- a/src/data/gradient_index_page_source.h
+++ b/src/data/gradient_index_page_source.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2021-2024, XGBoost Contributors
+ * Copyright 2021-2026, XGBoost Contributors
  */
 #ifndef XGBOOST_DATA_GRADIENT_INDEX_PAGE_SOURCE_H_
 #define XGBOOST_DATA_GRADIENT_INDEX_PAGE_SOURCE_H_