Merged
152 changes: 0 additions & 152 deletions HeterogeneousCore/CUDACore/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -302,85 +302,6 @@ void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetu
[Complete example](../CUDATest/plugins/TestCUDAProducerGPUEW.cc)


### Producer with CUDA input and output, and internal chain of CPU and GPU tasks (with ExternalWork)

```cpp
class ProducerInputOutputCUDA: public edm::stream::EDProducer<ExternalWork> {
public:
...
void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override;
...
private:
void addMoreWork(cms::cuda::ScopedContextTask& ctx);

...
ProducerInputGPUAlgo gpuAlgo_;
cms::cuda::ContextState ctxState_;
edm::EDGetTokenT<cms::cuda::Product<InputData>> inputToken_;
edm::EDPutTokenT<cms::cuda::Product<OutputData>> outputToken_;
};
...
void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
cms::cuda::Product<InputData> const& inputDataWrapped = iEvent.get(inputToken_);

// Set the current device to the same that was used to produce
// InputData, and also use the same CUDA stream
cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_};

// Grab the real input data. Checks that the input data is on the
// current device. If the input data was produced in a different CUDA
// stream than the cms::cuda::ScopedContextAcquire holds, create an inter-stream
// synchronization point with CUDA event and cudaStreamWaitEvent()
auto const& inputData = ctx.get(inputDataWrapped);

// Queues asynchronous data transfers and kernels to the CUDA stream
// returned by cms::cuda::ScopedContextAcquire::stream()
gpuAlgo_.makeAsync(inputData, ctx.stream());

// Push a functor on top of "a stack of tasks" to be run as the next
// task after the work queued above has finished, but before produce().
// In this case ctx is a context constructed by the calling TBB task,
// and therefore the current device and CUDA stream have already been
// set up. The ctx internally holds the WaitingTaskWithArenaHolder for
// the next task.

ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
addMoreWork(ctx);
});

// Destructor of ctx queues a callback to the CUDA stream notifying
// waitingTaskHolder when the queued asynchronous work has finished,
// and saves the device and CUDA stream to ctxState_
}

// Called after the asynchronous work queued in acquire() has finished
void ProducerInputOutputCUDA::addMoreWork(cms::cuda::ScopedContextTask& ctx) {
// Current device and CUDA stream have already been set

// Queues more asynchronous data transfer and kernels to the CUDA
// stream returned by cms::cuda::ScopedContextTask::stream()
gpuAlgo_.makeMoreAsync(ctx.stream());

// Destructor of ctx queues a callback to the CUDA stream notifying
// waitingTaskHolder when the queued asynchronous work has finished
}

// Called after the asynchronous work queued in addMoreWork() has finished
void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) {
// Sets the current device again, and uses the CUDA stream created in acquire()
cms::cuda::ScopedContextProduce ctx{ctxState_};

// Now getResult() returns data in GPU memory that is passed to the
// constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the
// OutputData to cms::cuda::Product<OutputData>. cms::cuda::Product<T> stores also
// the current device and the CUDA stream since those will be needed
// in the consumer side.
ctx.emplace(iEvent, outputToken_, gpuAlgo_.getResult());
}
```

[Complete example](../CUDATest/plugins/TestCUDAProducerGPUEWTask.cc)


### Producer with CUDA input and output (without ExternalWork)

If the producer does not need to transfer anything back to CPU (like
@@ -791,79 +712,6 @@ The `cms::cuda::ScopedContextAcquire` saves its state to the `ctxState_` in
the destructor, and `cms::cuda::ScopedContextProduce` then restores the
context.

#### Module-internal chain of CPU and GPU tasks

Technically, `ExternalWork` works as follows: the framework calls
`acquire()` with an `edm::WaitingTaskWithArenaHolder` that holds an
`edm::WaitingTask` (which inherits from `tbb::task`) for calling
`produce()`, with `std::shared_ptr`-like semantics: the task is spawned
when the reference count reaches `0`. It is also possible to create a
longer chain of such tasks, alternating between CPU and GPU work. This
mechanism can also be used to re-run (part of) the GPU work.

The "next tasks" to run are essentially structured as a stack, such
that
- `cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextTask::pushNextTask()`
  pushes a new functor on top of the stack
- Completion of both the asynchronous work and the queueing function
  pops the top task off the stack and enqueues it (so that TBB
  eventually runs the task)
  * Technically, the task becomes eligible to run when all copies of
    the `edm::WaitingTaskWithArenaHolder` of `acquire()` (or the
    "previous" function) have either been destructed or had their
    `doneWaiting()` called
  * The code calling `acquire()` or the functor holds one copy of the
    `edm::WaitingTaskWithArenaHolder`, so it is guaranteed that the
    next function will not run before the earlier one has finished


Below is an example of how to push a functor on top of the stack of
tasks to run next (following the example of the previous section):
```cpp
void FooProducerCUDA::acquire(...) {
...
ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
...
});
...
}
```

In this case the `ctx` argument to the function is a
`cms::cuda::ScopedContextTask` object constructed by the TBB task calling the
user-given function. It follows that the current device and CUDA
stream have already been set up. `pushNextTask()` can be called
many times; each invocation pushes a new task
on top of the stack (i.e. to the front of the chain). It follows that in
```cpp
void FooProducerCUDA::acquire(...) {
...
ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
... // function 1
});
ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
... // function 2
});
ctx.pushNextTask([this](cms::cuda::ScopedContextTask ctx) {
... // function 3
});
...
}
```
the functions will be run in the order 3, 2, 1.

**Note** that the `CUDAService` (like any other service) is **not**
available in these intermediate tasks. In the near future, memory
allocations etc. will be made possible by moving them out of the
`CUDAService`.

The `cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextTask` also have a
more generic member function, `replaceWaitingTaskHolder()`, that can
be used to replace the currently-held
`edm::WaitingTaskWithArenaHolder` (which will get notified by the
callback function) with anything. In this case the caller is
responsible for creating the task(s) and setting up their chain.


#### Transferring GPU data to CPU

1 change: 0 additions & 1 deletion HeterogeneousCore/CUDACore/interface/ContextState.h
@@ -25,7 +25,6 @@ namespace cms {
private:
friend class ScopedContextAcquire;
friend class ScopedContextProduce;
friend class ScopedContextTask;

void set(int device, SharedStreamPtr stream) {
throwIfStream();
43 changes: 0 additions & 43 deletions HeterogeneousCore/CUDACore/interface/ScopedContext.h
@@ -183,36 +183,6 @@ namespace cms {
SharedEventPtr event_ = getEventCache().get();
};

/**
* The aim of this class is to do the necessary per-task "initialization" for tasks created in an ExternalWork acquire():
* - setting the current device
* - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
* and enforce that those get done in a proper way in RAII fashion.
*/
class ScopedContextTask : public impl::ScopedContextBase {
public:
/// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
explicit ScopedContextTask(ContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
: ScopedContextBase(state->device(), state->streamPtr()), // don't move, state is re-used afterwards
holderHelper_{std::move(waitingTaskHolder)},
contextState_{state} {}

~ScopedContextTask();

template <typename F>
void pushNextTask(F&& f) {
holderHelper_.pushNextTask(std::forward<F>(f), contextState_);
}

void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
}

private:
impl::ScopedContextHolderHelper holderHelper_;
ContextState const* contextState_;
};

/**
* The aim of this class is to do necessary per-event "initialization" in analyze()
* - setting the current device
@@ -224,19 +194,6 @@
/// Constructor to (possibly) re-use a CUDA stream
explicit ScopedContextAnalyze(const ProductBase& data) : ScopedContextGetterBase(data) {}
};

namespace impl {
template <typename F>
void ScopedContextHolderHelper::pushNextTask(F&& f, ContextState const* state) {
auto group = waitingTaskHolder_.group();
replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{
*group,
edm::make_waiting_task_with_holder(std::move(waitingTaskHolder_),
[state, func = std::forward<F>(f)](edm::WaitingTaskWithArenaHolder h) {
func(ScopedContextTask{state, std::move(h)});
})});
}
} // namespace impl
} // namespace cuda
} // namespace cms

4 changes: 0 additions & 4 deletions HeterogeneousCore/CUDACore/src/ScopedContext.cc
@@ -89,8 +89,4 @@ namespace cms::cuda {
// elsewhere as well.
cudaEventRecord(event_.get(), stream());
}

////////////////////

ScopedContextTask::~ScopedContextTask() { holderHelper_.enqueueCallback(device(), stream()); }
} // namespace cms::cuda
136 changes: 0 additions & 136 deletions HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEWTask.cc

This file was deleted.

4 changes: 0 additions & 4 deletions HeterogeneousCore/CUDATest/python/prod6CPU_cfi.py

This file was deleted.

4 changes: 0 additions & 4 deletions HeterogeneousCore/CUDATest/python/prod6CUDA_cfi.py

This file was deleted.

4 changes: 0 additions & 4 deletions HeterogeneousCore/CUDATest/python/prod6FromCUDA_cfi.py

This file was deleted.
