From ebd237ad2e2e46a07539d6cd53b610ff111c40c1 Mon Sep 17 00:00:00 2001 From: Jukka Jokiniva Date: Tue, 28 Apr 2026 15:14:53 +0300 Subject: [PATCH 1/2] schedm: merge new placement actions into running plan instead of discarding Previously, if a placement plan was already APPLYING when a new scheduling cycle produced results, the new plan was silently discarded. This caused VMs scheduled in subsequent cycles to wait until the running plan fully completed before being deployed. This change implements plan merging for placement plans (cid == -1): - When a new plan arrives and one is already APPLYING, merge_actions() is called instead of returning early - merge_actions() first prunes terminal actions (DONE/ERROR/TIMEOUT) from the running plan to prevent unbounded growth and unblock check_completed() - New actions for VMs not already in the plan are appended with IDs starting above the current maximum to avoid colliding with in-flight APPLYING action IDs stored in VM history records - execute_plans() is called immediately after merging so newly appended actions are dispatched without waiting for the next timer tick DRS cluster optimization plans (cid >= 0) retain the existing replace behaviour as concurrent optimizer runs would produce conflicting results. Also removes two stale TODO comments in SchedulerManager.cc that referred to this missing guard. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Jukka Jokiniva --- include/Plan.h | 14 +++++++ src/schedm/Plan.cc | 67 ++++++++++++++++++++++++++++++++++ src/schedm/PlanManager.cc | 15 +++++--- src/schedm/SchedulerManager.cc | 4 +- 4 files changed, 92 insertions(+), 8 deletions(-) diff --git a/include/Plan.h b/include/Plan.h index 49d0ee8030..0d85a7f937 100644 --- a/include/Plan.h +++ b/include/Plan.h @@ -43,6 +43,11 @@ class PlanAction return _id; } + void id(int i) + { + _id = i; + } + PlanState state() const { return _state; @@ -200,6 +205,15 @@ class Plan : public ObjectSQL, public ObjectXML */ void count_actions(int &cluster_actions, std::map& host_actions); + /** + * Merge new actions from an XML plan into the current plan, skipping VMs + * that already have a READY or APPLYING action. Used to append newly + * scheduled VMs to a plan that is already being applied. + * + * @param xml XML string of the incoming plan + */ + void merge_actions(const std::string& xml); + private: friend class PlanPool; diff --git a/src/schedm/Plan.cc b/src/schedm/Plan.cc index e9b5306b10..1094c969d5 100644 --- a/src/schedm/Plan.cc +++ b/src/schedm/Plan.cc @@ -220,6 +220,73 @@ void Plan::timeout_actions(int timeout) /* -------------------------------------------------------------------------- */ /* -------------------------------------------------------------------------- */ +void Plan::merge_actions(const std::string& xml) +{ + Plan incoming; + + if (incoming.from_xml(xml) != 0) + { + NebulaLog::error("PLM", "merge_actions: failed to parse incoming plan XML"); + return; + } + + // Remove terminal actions to keep the plan lean. This prevents unbounded + // growth when many scheduling cycles merge into a long-running plan and + // ensures check_completed() is not blocked by already-finished actions. + // NOTE: do NOT touch APPLYING actions — their IDs are stored in VM history + // records and must remain stable so action_finished() callbacks match. + std::vector active_actions; + + for (const auto& a : _actions) + { + if (a.state() != PlanState::DONE && + a.state() != PlanState::ERROR && + a.state() != PlanState::TIMEOUT) + { + active_actions.push_back(a); + } + } + + _actions = std::move(active_actions); + + // Build set of VM IDs that already have an active (READY or APPLYING) action + std::set active_vms; + + // Find the highest ID in use so new actions don't collide with any + // in-flight APPLYING actions whose IDs are stored in VM history records. + int next_id = 0; + + for (const auto& a : _actions) + { + active_vms.insert(a.vm_id()); + next_id = std::max(next_id, a.id() + 1); + } + + // Append only new VMs not already present in the plan + int appended = 0; + + for (const auto& a : incoming.actions()) + { + if (active_vms.count(a.vm_id()) != 0) + { + continue; + } + + _actions.push_back(a); + _actions.back().id(next_id++); + appended++; + } + + if (appended > 0) + { + NebulaLog::info("PLM", "merge_actions: appended " + std::to_string(appended) + + " new actions to the running placement plan"); + } +} + +/* -------------------------------------------------------------------------- */ +/* -------------------------------------------------------------------------- */ + void Plan::count_actions(int &cluster_actions, std::map& host_actions) { for (const auto& a : _actions) diff --git a/src/schedm/PlanManager.cc b/src/schedm/PlanManager.cc index 5c3f656315..a406682685 100644 --- a/src/schedm/PlanManager.cc +++ b/src/schedm/PlanManager.cc @@ -104,17 +104,20 @@ void PlanManager::add_plan(const string& xml) { if (plan.cid() == -1) { - NebulaLog::info("PLM", "Adding new placement plan"); - if (cplan->state() == PlanState::APPLYING) { - NebulaLog::info("PLM", "Cannot add plan. A placement plan is already in progress."); - return; + NebulaLog::info("PLM", "Merging new actions into running placement plan."); + + cplan->merge_actions(xml); } + else + { + NebulaLog::info("PLM", "Adding new placement plan"); - cplan->from_xml(xml); + cplan->from_xml(xml); - cplan->state(PlanState::APPLYING); + cplan->state(PlanState::APPLYING); + } } else if (auto cluster = cluster_pool->get_ro(plan.cid())) { diff --git a/src/schedm/SchedulerManager.cc b/src/schedm/SchedulerManager.cc index ea1a14c5d1..874028f2fd 100644 --- a/src/schedm/SchedulerManager.cc +++ b/src/schedm/SchedulerManager.cc @@ -161,7 +161,7 @@ void SchedulerManager::trigger_place() return; } - //TODO Check a PLACE planning is not being applied + // New actions will be merged into the running plan if one exists //send place request to driver, reset window last_place = the_time; wnd_start = 0; @@ -288,7 +288,7 @@ void SchedulerManager::timer_action() NebulaLog::ddebug("SCMT", oss.str()); - //TODO Check there is no placement plan active + // New actions will be merged into the running plan if one exists if (!expired && !pending) { From 97fdc5929430f51a2ed6f2a3a4e35f503737677b Mon Sep 17 00:00:00 2001 From: Jukka Jokiniva Date: Thu, 7 May 2026 09:07:36 +0300 Subject: [PATCH 2/2] schedm: dispatch to all eligible hosts in a single plan cycle Previously, execute_plan() stopped dispatching as soon as the first READY action hit either the per-host or per-cluster action limit. This meant VMs targeting uncongested hosts were skipped until the next cycle. Replace get_next_action() with get_ready_actions() which collects all READY action pointers upfront. The dispatch loop now breaks only on the cluster cap (hard ceiling) and continues past saturated hosts, allowing VMs assigned to other hosts to be started in the same cycle. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Jukka Jokiniva --- include/Plan.h | 2 +- src/schedm/Plan.cc | 6 ++---- src/schedm/PlanManager.cc | 12 +++++++++--- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/include/Plan.h b/include/Plan.h index 0d85a7f937..623d14d486 100644 --- a/include/Plan.h +++ b/include/Plan.h @@ -183,7 +183,7 @@ class Plan : public ObjectSQL, public ObjectXML return _actions; } - PlanAction* get_next_action(); + void get_ready_actions(std::vector& ready_actions); /** * Mark action as finished, return false if the action is not in the plan diff --git a/src/schedm/Plan.cc b/src/schedm/Plan.cc index 1094c969d5..489e878c9b 100644 --- a/src/schedm/Plan.cc +++ b/src/schedm/Plan.cc @@ -100,17 +100,15 @@ std::string PlanAction::to_xml() const /* -------------------------------------------------------------------------- */ /* -------------------------------------------------------------------------- */ -PlanAction* Plan::get_next_action() +void Plan::get_ready_actions(std::vector& ready_actions) { for (auto& action : _actions) { if (action.state() == PlanState::READY) { - return &action; + ready_actions.push_back(&action); } } - - return nullptr; } /* -------------------------------------------------------------------------- */ diff --git a/src/schedm/PlanManager.cc b/src/schedm/PlanManager.cc index a406682685..42487bdeb3 100644 --- a/src/schedm/PlanManager.cc +++ b/src/schedm/PlanManager.cc @@ -407,18 +407,24 @@ void PlanManager::execute_plan(Plan& plan) // Update counter, num of running actions per host, per cluster map host_actions; int cluster_actions = 0; + std::vector ready_actions; plan.count_actions(cluster_actions, host_actions); + plan.get_ready_actions(ready_actions); // Execute plan actions - while (auto action = plan.get_next_action()) + for (auto& action : ready_actions) { - if (host_actions[action->host_id()] >= max_actions_per_host - || cluster_actions >= max_actions_per_cluster) + if (cluster_actions >= max_actions_per_cluster) { break; } + if (host_actions[action->host_id()] >= max_actions_per_host) + { + continue; + } + if (start_action(plan.cid(), *action)) { host_actions[action->host_id()]++;