stackhpc · priteau · May 7, 2026 · Jul 23, 2025 · Mar 29, 2026 · Apr 2, 2026
@@ -458,7 +458,7 @@ management, therefore if it fails to start, these other components
 will not be running correctly either.
 Check that etcd is running on the master nodes by::
 
-    sudo service etcd status -l
+    sudo systemctl status etcd
 
 If it is running correctly, you should see that the service is
 successfully deployed::
@@ -484,7 +484,7 @@ something like::
 
 In this case, try restarting etcd by::
 
-    sudo service etcd start
+    sudo systemctl start etcd
 
 If etcd continues to fail, check the following:
 
@@ -534,7 +534,7 @@ confirmed by running *ping* or *curl* from one container to another.
 The Flannel daemon is run as a systemd service on each node of the cluster.
 To check Flannel, run on each node::
 
-    sudo service flanneld status
+    sudo systemctl status flanneld
 
 If the daemon is running, you should see that the service is successfully
 deployed::
@@ -562,7 +562,7 @@ Check the following:
   If the etcd service failed, once it has been restored successfully, the
   Flannel service can be restarted by::
 
-    sudo service flanneld restart
+    sudo systemctl restart flanneld
 
 - Magnum writes the configuration for Flannel in a local file on each master
   node.  Check for this file on the master nodes by::

@@ -292,6 +292,24 @@ def __init__(self, **kwargs):
     def convert_with_links(rpc_clusters, limit, url=None, expand=False,
                            **kwargs):
         collection = ClusterCollection()
+        # Pre-fetch each distinct ClusterTemplate referenced on this page
+        # once, instead of issuing one ClusterTemplate.get_by_uuid() RPC call
+        # per cluster.  Clusters in a project commonly share the same
+        # template, so this usually collapses to a single RPC regardless of
+        # page size.
+        template_cache = {}
+        for rpc_cluster in rpc_clusters:
+            tid = rpc_cluster.cluster_template_id
+            if tid and tid not in template_cache:
+                template_cache[tid] = objects.ClusterTemplate.get_by_uuid(
+                    pecan.request.context, tid)
+        # Inject the pre-fetched template so obj_load_attr is never triggered
+        # during Cluster.convert_with_links below.
+        for rpc_cluster in rpc_clusters:
+            tid = rpc_cluster.cluster_template_id
+            if tid and tid in template_cache:
+                rpc_cluster.cluster_template = template_cache[tid]
+
         collection.clusters = [Cluster.convert_with_links(p, expand)
                                for p in rpc_clusters]
         collection.next = collection.get_next(limit, url=url, **kwargs)
@@ -443,7 +461,9 @@ def get_one(self, cluster_ident):
             context.all_tenants = True
 
         cluster = api_utils.get_resource('Cluster', cluster_ident)
-        policy.enforce(context, 'cluster:get', cluster.as_dict(),
+        # Evaluate as_dict() once up front; it is the policy target below.
+        cluster_dict = cluster.as_dict()
+        policy.enforce(context, 'cluster:get', cluster_dict,
                        action='cluster:get')
 
         api_cluster = Cluster.convert_with_links(cluster)

@@ -17,6 +17,7 @@
 
 import pecan
 
+from glanceclient import exc as glance_exception
 from keystoneauth1 import exceptions as ka_exception
 
 from magnum.api import utils as api_utils
@@ -78,6 +79,10 @@ def wrapper(func, *args, **kwargs):
                                                          'images')
                 cluster_distro = image.get('os_distro')
                 driver_name = image.get('magnum_driver')
+            except (glance_exception.NotFound, exception.ResourceNotFound):
+                raise exception.ImageNotFound(image_id=image_id)
+            except glance_exception.HTTPForbidden:
+                raise exception.ImageNotAuthorized(image_id=image_id)
             except Exception:
                 pass
         cluster_type = (cluster_template.server_type,

@@ -27,6 +27,7 @@
 from magnum.common import profiler
 from magnum.common import service
 import magnum.conf
+from magnum.drivers.common import driver as driver_module
 from magnum.i18n import _
 from magnum.objects import base
 from magnum import version
@@ -78,6 +79,9 @@ def main():
     LOG.debug("Configuration:")
     CONF.log_opt_values(LOG, logging.DEBUG)
 
+    drivers = [ep.name for ep, _ in driver_module.Driver.load_entry_points()]
+    LOG.debug('Loaded drivers: %s', drivers)
+
     LOG.info('Serving on %(proto)s://%(host)s:%(port)s',
              dict(proto="https" if use_ssl else "http", host=host, port=port))
 

@@ -34,6 +34,7 @@
 from magnum.conductor.handlers import indirection_api
 from magnum.conductor.handlers import nodegroup_conductor
 import magnum.conf
+from magnum.drivers.common import driver as driver_module
 from magnum import version
 
 CONF = magnum.conf.CONF
@@ -50,6 +51,9 @@ def main():
     LOG.debug("Configuration:")
     CONF.log_opt_values(LOG, logging.DEBUG)
 
+    drivers = [ep.name for ep, _ in driver_module.Driver.load_entry_points()]
+    LOG.debug('Loaded drivers: %s', drivers)
+
     conductor_id = short_id.generate_id()
     endpoints = [
         indirection_api.Handler(),

@@ -30,6 +30,18 @@
 LOG = logging.getLogger(__name__)
 _ENFORCER = None
 CONF = cfg.CONF
+_TRUSTEE_DOMAIN_ID_CACHE = None
+
+
+def _reset_trustee_domain_id_cache():
+    """Reset the trustee_domain_id Keystone lookup cache.
+
+    Intended for use in test teardown only.  In production the value is
+    either read from CONF (free) or fetched once and cached for the process
+    lifetime.
+    """
+    global _TRUSTEE_DOMAIN_ID_CACHE
+    _TRUSTEE_DOMAIN_ID_CACHE = None
 
 
 # we can get a policy enforcer by this init.
@@ -112,10 +124,21 @@ def enforce(context, rule=None, target=None,
 
 def add_policy_attributes(target):
     """Adds extra information for policy enforcement to raw target object"""
-    context = importutils.import_module('magnum.common.context')
-    admin_context = context.make_admin_context()
-    admin_osc = clients.OpenStackClients(admin_context)
-    trustee_domain_id = admin_osc.keystone().trustee_domain_id
+    global _TRUSTEE_DOMAIN_ID_CACHE
+
+    # A configured trustee_domain_id is used as-is; no Keystone call needed.
+    trustee_domain_id = CONF.trust.trustee_domain_id
+    if not trustee_domain_id:
+        # Fallback for deployments that rely on auto-discovery via Keystone.
+        # Cache the result for the process lifetime so the call happens
+        # at most once.
+        if _TRUSTEE_DOMAIN_ID_CACHE is None:
+            ctx = importutils.import_module('magnum.common.context')
+            admin_context = ctx.make_admin_context()
+            admin_osc = clients.OpenStackClients(admin_context)
+            _TRUSTEE_DOMAIN_ID_CACHE = admin_osc.keystone().trustee_domain_id
+        trustee_domain_id = _TRUSTEE_DOMAIN_ID_CACHE
+
     target['trustee_domain_id'] = trustee_domain_id
     return target
 

@@ -62,18 +62,46 @@ def create(cls, topic, server, handlers, binary):
         return service_obj
 
 
+# Share a single RPCClient per unique (topic, server, timeout) tuple
+# for the lifetime of the worker process.  The request context is not stored
+# on the client: API._call() passes it to RPCClient.call() on every
+# invocation, so one client can be shared across requests.
+_RPC_CLIENT_CACHE = {}
+
+
+def _get_cached_client(topic, server, timeout):
+    """Return a process-level cached RPCClient for the given target parameters.
+
+    The client is created once per (topic, server, timeout) combination and
+    reused across all requests.  This keeps the RabbitMQ connection pool warm
+    and avoids per-request TCP connect/disconnect cycles.
+    """
+
+    key = (topic, server, timeout)
+    client = _RPC_CLIENT_CACHE.get(key)
+    if client is None:
+        target = messaging.Target(topic=topic, server=server)
+        client = rpc.get_client(
+            target,
+            serializer=objects_base.MagnumObjectSerializer(),
+            timeout=timeout,
+        )
+        _RPC_CLIENT_CACHE[key] = client
+    return client
+
+
 class API(object):
     def __init__(self, context=None, topic=None, server=None,
                  timeout=None):
         self._context = context
         if topic is None:
             topic = ''
-        target = messaging.Target(topic=topic, server=server)
-        self._client = rpc.get_client(
-            target,
-            serializer=objects_base.MagnumObjectSerializer(),
-            timeout=timeout
-        )
+        # Fetch (or create) the shared RPCClient from the process-level cache.
+        # The client is still exposed as self._client, exactly as when each
+        # API instance built its own, so subclasses (conductor_api.API) that
+        # access self._client directly for OVO indirection calls continue to
+        # work without any changes.
+        self._client = _get_cached_client(topic, server, timeout)
 
     def _call(self, method, *args, **kwargs):
         return self._client.call(self._context, method, *args, **kwargs)

@@ -113,21 +113,36 @@ def _add_tenant_filters(self, context, query):
         if context.is_admin and context.all_tenants:
             return query
 
-        admin_context = request_context.make_admin_context(all_tenants=True)
-        osc = clients.OpenStackClients(admin_context)
-        kst = osc.keystone()
+        # Read the trustee domain ID directly from configuration rather than
+        # authenticating to Keystone on every DB query.  The value is
+        # operator-configured (CONF.trust.trustee_domain_id) and is stable for
+        # the lifetime of the service.
+        trustee_domain_id = CONF.trust.trustee_domain_id
+
+        # Fall back to a live Keystone lookup when trustee_domain_id is not
+        # set in configuration so that deployments which rely on auto-discovery
+        # continue to work correctly.
+        if not trustee_domain_id:
+            admin_context = request_context.make_admin_context(
+                                all_tenants=True)
+            osc = clients.OpenStackClients(admin_context)
+            trustee_domain_id = osc.keystone().trustee_domain_id
 
         # User in a regular project (not in the trustee domain)
         if (
             context.project_id
-            and context.user_domain_id != kst.trustee_domain_id
+            and context.user_domain_id != trustee_domain_id
         ):
             query = query.filter_by(project_id=context.project_id)
         # Match project ID component in trustee user's user name against
         # cluster's project_id to associate per-cluster trustee users who have
         # no project information with the project their clusters/cluster models
         # reside in. This is equivalent to the project filtering above.
-        elif context.user_domain_id == kst.trustee_domain_id:
+        elif context.user_domain_id == trustee_domain_id:
+            admin_context = request_context.make_admin_context(
+                                all_tenants=True)
+            osc = clients.OpenStackClients(admin_context)
+            kst = osc.keystone()
             user_name = kst.client.users.get(context.user_id).name
             user_project = user_name.split('_', 2)[1]
             query = query.filter_by(project_id=user_project)

@@ -111,10 +111,40 @@ def _from_db_object(cluster, db_cluster):
         cluster.obj_reset_changes()
         return cluster
 
+    # Call-scoped nodegroups cache: None means inactive (no as_dict() in
+    # progress).  Set by as_dict() to the result of NodeGroup.list() before
+    # computing derived fields and reset to None afterwards.  Outside of
+    # as_dict(), _get_nodegroups() always calls NodeGroup.list() afresh, so
+    # nodegroup create/delete in conductor code and tests stays visible.
+    _nodegroups_cache = None
+
+    def _get_nodegroups(self):
+        """Fetch nodegroups, using a call-scoped cache when active.
+
+        as_dict() accesses four derived properties (node_count, master_count,
+        node_addresses, master_addresses) that each call self.nodegroups.
+        Without caching, a single as_dict() triggers four identical
+        NodeGroup.list() RPC calls.
+
+        Rather than caching for the object lifetime (which would return stale
+        data if nodegroups are created/deleted after first access, as happens
+        in conductor and test code), the cache is scoped to a single
+        as_dict() call: as_dict() populates _nodegroups_cache before
+        computing derived fields and clears it on exit.  At all other times
+        _nodegroups_cache is None and this method calls NodeGroup.list().
+        """
+        if self._nodegroups_cache is not None:
+            return self._nodegroups_cache
+        return NodeGroup.list(self._context, self.uuid)
+
+    def _invalidate_nodegroups_cache(self):
+        """Clear the call-scoped nodegroups cache."""
+        self._nodegroups_cache = None
+
     @property
     def nodegroups(self):
         # Returns all nodegroups that belong to the cluster.
-        return NodeGroup.list(self._context, self.uuid)
+        return self._get_nodegroups()
 
     @property
     def default_ng_worker(self):
@@ -342,12 +372,20 @@ def obj_load_attr(self, attrname):
 
     def as_dict(self):
         dict_ = super(Cluster, self).as_dict()
-        # Update the dict with the attributes coming form
-        # the cluster's nodegroups.
-        dict_.update({
-            'node_count': self.node_count,
-            'master_count': self.master_count,
-            'node_addresses': self.node_addresses,
-            'master_addresses': self.master_addresses
-        })
+        # Populate the call-scoped nodegroups cache so that the four derived
+        # properties below (node_count, master_count, node_addresses,
+        # master_addresses) all share a single NodeGroup.list() fetch instead
+        # of each issuing their own RPC call.  The cache is cleared on exit
+        # so that any subsequent access (e.g. from conductor code that has
+        # mutated nodegroups) gets a fresh NodeGroup.list() result.
+        self._nodegroups_cache = NodeGroup.list(self._context, self.uuid)
+        try:
+            dict_.update({
+                'node_count': self.node_count,
+                'master_count': self.master_count,
+                'node_addresses': self.node_addresses,
+                'master_addresses': self.master_addresses
+            })
+        finally:
+            self._nodegroups_cache = None
         return dict_
@@ -27,6 +27,7 @@
 
 from magnum.common import context as magnum_context
 from magnum.common import keystone as magnum_keystone
+from magnum.common import policy as magnum_policy
 from magnum.objects import base as objects_base
 from magnum.tests import conf_fixture
 from magnum.tests import fake_notifier
@@ -119,6 +120,12 @@ def make_context(*args, **kwargs):
         self.mock_make_trustee_domain_id = q.start()
         self.addCleanup(q.stop)
 
+        # Reset the module-level trustee_domain_id cache in policy.py so that
+        # it is re-resolved on the first policy.enforce() call of the next
+        # test.  Without this, a test that exercises the Keystone-discovery
+        # path could leave a stale value that bypasses mocks in later tests.
+        self.addCleanup(magnum_policy._reset_trustee_domain_id_cache)
+
         self.useFixture(conf_fixture.ConfFixture())
         self.useFixture(fixtures.NestedTempfile())
 

@@ -29,5 +29,13 @@ def _setUp(self):
         CONF.set_default('host', 'fake-mini')
         CONF.set_default('connection', "sqlite://", group='database')
         CONF.set_default('sqlite_synchronous', False, group='database')
+        # Set a fixed trustee_domain_id so that policy.add_policy_attributes()
+        # can read it directly from config without making any Keystone call.
+        # This matches the value used by the global trustee_domain_id mock in
+        # tests/base.py and avoids 'An auth plugin is required' errors in
+        # no-auth test configurations.
+        CONF.set_default('trustee_domain_id',
+                         '12345678-9012-3456-7890-123456789abc',
+                         group='trust')
         config.parse_args([], default_config_files=[])
         self.addCleanup(CONF.reset)