diff --git a/platform/maintenance/monitoring/aggregating-metrics.mdx b/platform/maintenance/monitoring/aggregating-metrics.mdx
deleted file mode 100644
index 4436e2aa2..000000000
--- a/platform/maintenance/monitoring/aggregating-metrics.mdx
+++ /dev/null
@@ -1,302 +0,0 @@
----
-title: "Aggregating Metrics"
-sidebar_label: "Aggregating Metrics"
-sidebar_position: 4
----
-
-import Flow, { Step } from '@site/src/components/Flow'
-import NavStep from '@site/src/components/NavStep'
-import Button from '@site/src/components/Button'
-import GrafanaExample from '@site/static/media/screenshots/grafana-example.png'
-
-This guide explains how to configure OpenTelemetry with Prometheus to
-collect workload metrics from across multiple virtual clusters and aggregate by
-Project, vCluster, and Space. This approach uses shared OpenTelemetry DaemonSets on the host cluster without requiring individual collector installations per virtual cluster.
-
-## Prerequisites
-
-Before you begin, decide where to deploy Prometheus and Grafana. This guide uses the Platform's connected `local-cluster` with both services in an `observability` namespace. You can adapt this to external clusters or different namespaces based on your requirements.
-
-:::warning Don't use kube-prometheus-stack
-Do not use the `kube-prometheus-stack` chart from the Platform Apps page for this setup. Instead, use the standalone `prometheus` and `grafana` charts as described below, which are optimized to work with the OpenTelemetry Collector.
-:::
-
-
-## Deploy Prometheus
-
-
-Deploy Prometheus with OTLP receiver support using the Platform Apps UI.
-
-
-
-Go to the Infra section using the menu on the left, and select the Clusters view.
-
-
- Click on the cluster where you want to deploy Prometheus (for example, local-cluster).
-
-
- Navigate to the Apps tab.
-
-
- Click and configure a Helm chart with the following settings.
-
-
-
-
-| Setting | Value |
-|---------|-------|
-| Chart Repository URL | `https://prometheus-community.github.io/helm-charts` |
-| Chart Name | `prometheus` |
-| Namespace | `observability` |
-| Release Name | `prometheus` |
-
-
-Use the following chart values to enable the OTLP receiver and required features:
-
-```yaml title="Prometheus Helm values"
-# Enable OTLP receiver for OpenTelemetry metrics ingestion
-# Enable delta-to-cumulative conversion for OTLP metrics
-# Enable lifecycle API for configuration reloads
-server:
- extraFlags:
- - web.enable-otlp-receiver
- - web.enable-lifecycle
- extraArgs:
- enable-feature: otlp-deltatocumulative
-
-# Skip RBAC creation if prometheus-server ClusterRole already exists
-rbac:
- create: false
-
-# Disable components not needed for this setup
-prometheus-node-exporter:
- enabled: false
-alertmanager:
- enabled: false
-kube-state-metrics:
- enabled: false
-prometheus-pushgateway:
- enabled: false
-```
-
-:::info Existing Prometheus installations
-The `rbac.create: false` setting skips ClusterRole creation, which prevents conflicts if a `prometheus-server` ClusterRole already exists in your cluster from vCluster Platform or other components. If you don't have an existing ClusterRole, either remove this setting or use `server.clusterRoleNameOverride: "otel-prometheus-server"` to create one with a unique name.
-:::
-
-Click to deploy Prometheus.
-
-
-## Deploy OpenTelemetry Collector
-
-
-Deploy the built-in OpenTelemetry App on each connected host cluster.
-This OpenTelemetry App accepts the Prometheus connection information and deploys [opentelemetry-collector](https://opentelemetry.io/docs/collector/)
-as a DaemonSet via Helm.
-
-The OpenTelemetry Collector Agent on each node pushes metrics about the workloads running on that node to the Prometheus instance. The metrics include vCluster, vCluster Platform, and Kubernetes metadata as labels.
-
-
-
- Go to the Clusters dropdown using the menu on the left, and select the Clusters view.
-
-
- Click on the cluster where you are installing the OpenTelemetry Collector App.
-
-
- Navigate to the Apps tab.
-
-
- Click on the OpenTelemetry Collector App.
-
-
- Enter the Prometheus connection endpoint: http://prometheus-server.observability.svc.cluster.local:80
-
-
- Click on the button to finish.
-
-
-
-:::info Prometheus endpoint
-If you deployed Prometheus in a different namespace or cluster, adjust the endpoint URL accordingly. The format is `http://-server..svc.cluster.local:80`.
-:::
-
-
-## Deploy Grafana
-
-
-Deploy Grafana with a pre-configured Prometheus datasource and dashboard using the Platform Apps UI.
-
-
-
- Go to the Clusters dropdown using the menu on the left, and select the Clusters view.
-
-
- Click on the cluster where you deployed Prometheus.
-
-
- Navigate to the Apps tab.
-
-
- Click and configure a Helm chart with the following settings.
-
-
-
-
-| Setting | Value |
-|---------|-------|
-| Chart Repository URL | `https://grafana.github.io/helm-charts` |
-| Chart Name | `grafana` |
-| Namespace | `observability` |
-| Release Name | `grafana` |
-
-
-Use the following chart values to configure the Prometheus datasource and include a pre-built dashboard:
-
-```yaml title="Grafana Helm values"
-# Configure Prometheus as the default datasource
-datasources:
- datasources.yaml:
- apiVersion: 1
- datasources:
- - name: Prometheus
- type: prometheus
- url: http://prometheus-server
- access: proxy
- isDefault: true
-
-# Configure dashboard provisioning
-dashboardProviders:
- dashboardproviders.yaml:
- apiVersion: 1
- providers:
- - name: 'default'
- orgId: 1
- folder: ''
- type: file
- disableDeletion: false
- editable: true
- options:
- path: /var/lib/grafana/dashboards/default
-
-# Include a pre-built dashboard for vCluster Platform metrics
-dashboards:
- default:
- vcluster-platform-metrics:
- json: |
- {"title":"CPU and Memory usage by Project, Space, Virtual Cluster","panels":[{"gridPos":{"h":8,"w":12,"x":0,"y":0},"id":3,"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true}},"targets":[{"disableTextWrap":false,"editorMode":"code","expr":"sum by(loft_virtualcluster_name) (k8s_pod_cpu_time_seconds_total{loft_virtualcluster_name=~\".+\"})","fullMetaSearch":false,"includeNullMetadata":true,"instant":false,"legendFormat":"__auto","range":true,"refId":"A","useBackend":false}],"title":"CPU Usage by Virtual Cluster","type":"timeseries"},{"gridPos":{"h":8,"w":12,"x":12,"y":0},"id":4,"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true}},"targets":[{"disableTextWrap":false,"editorMode":"code","expr":"sum by (loft_virtualcluster_name) (k8s_pod_memory_usage_bytes{loft_virtualcluster_name=~\".+\"})\n/1024/1024","fullMetaSearch":false,"includeNullMetadata":true,"instant":false,"legendFormat":"__auto","range":true,"refId":"A","useBackend":false}],"title":"Memory Usage (MiB) by Virtual Cluster","type":"timeseries"},{"gridPos":{"h":8,"w":12,"x":0,"y":8},"id":2,"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true}},"targets":[{"disableTextWrap":false,"editorMode":"builder","expr":"sum by(loft_project_name) (k8s_pod_cpu_time_seconds_total{loft_project_name=~\".+\"})","fullMetaSearch":false,"includeNullMetadata":true,"instant":false,"interval":"","legendFormat":"__auto","range":true,"refId":"A","useBackend":false}],"title":"CPU Usage by Project","type":"timeseries"},{"gridPos":{"h":8,"w":12,"x":12,"y":8},"id":1,"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true}},"targets":[{"disableTextWrap":false,"editorMode":"code","expr":"sum by (loft_project_name) 
(k8s_pod_memory_usage_bytes{loft_project_name=~\".+\"})\n/1024/1024","fullMetaSearch":false,"includeNullMetadata":true,"instant":false,"legendFormat":"__auto","range":true,"refId":"A","useBackend":false}],"title":"Memory Usage (MiB) by Project","type":"timeseries"},{"gridPos":{"h":8,"w":12,"x":0,"y":16},"id":6,"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true}},"targets":[{"disableTextWrap":false,"editorMode":"code","expr":"sum by(loft_space_name) (k8s_pod_cpu_time_seconds_total{loft_space_name=~\".+\"})","fullMetaSearch":false,"includeNullMetadata":true,"instant":false,"legendFormat":"__auto","range":true,"refId":"A","useBackend":false}],"title":"CPU Usage by Space","type":"timeseries"},{"gridPos":{"h":8,"w":12,"x":12,"y":16},"id":5,"options":{"legend":{"calcs":[],"displayMode":"list","placement":"bottom","showLegend":true}},"targets":[{"disableTextWrap":false,"editorMode":"code","expr":"sum by (loft_space_name) (k8s_pod_memory_usage_bytes{loft_space_name=~\".+\"})\n/1024/1024","fullMetaSearch":false,"includeNullMetadata":true,"instant":false,"legendFormat":"__auto","range":true,"refId":"A","useBackend":false}],"title":"Memory Usage (MiB) by Space","type":"timeseries"}],"schemaVersion":38}
-```
-
-Click to deploy Grafana.
-
-
-## Access Grafana
-
-
-After deploying Grafana, retrieve the admin password and access the dashboard.
-
-
-### Get the Grafana password
-
-
-Run the following command to retrieve the Grafana admin password:
-
-```bash
-kubectl get secret --namespace observability grafana \
- -o jsonpath="{.data.admin-password}" | base64 --decode; echo
-```
-
-### Option 1: Port forward
-
-Use port forwarding to access Grafana locally:
-
-```bash
-kubectl port-forward -n observability service/grafana 8080:80
-```
-
-Then open your browser and navigate to `http://localhost:8080`. Log in with username `admin` and the password from the previous step.
-
-### Option 2: Ingress
-
-Create an Ingress resource to expose Grafana externally. The following example uses the nginx ingress controller (deprecated):
-
-```yaml title="grafana-ingress.yaml"
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
- annotations:
- nginx.ingress.kubernetes.io/rewrite-target: /
- # Uncomment the following for TLS with cert-manager:
- # cert-manager.io/cluster-issuer: letsencrypt-prod
- # nginx.ingress.kubernetes.io/ssl-redirect: "true"
- name: grafana
- namespace: observability
-spec:
- ingressClassName: nginx
- rules:
- - host: grafana.example.com # Replace with your hostname
- http:
- paths:
- - backend:
- service:
- name: grafana
- port:
- number: 80
- path: /
- pathType: Prefix
- # Uncomment the following for TLS:
- # tls:
- # - hosts:
- # - grafana.example.com
- # secretName: grafana-ingress-tls
-```
-
-Apply the Ingress:
-
-```bash
-kubectl apply -f grafana-ingress.yaml
-```
-
-Navigate to your configured hostname and log in with the admin credentials.
-
-
-## Create Prometheus queries
-
-
-After collecting data with OpenTelemetry, you can aggregate the metrics using the associated labels like the vCluster name.
-
-Here's an example Prometheus query showing the CPU usage aggregated by the vCluster name:
-
-```promql
-sum by(loft_virtualcluster_name) (k8s_pod_cpu_time_seconds_total{loft_virtualcluster_name=~".+"})
-```
-
-### Available labels
-
-The OpenTelemetry Collector adds the following vCluster Platform labels to metrics:
-
-| Label | Description |
-|-------|-------------|
-| `loft_project_name` | The vCluster Platform project name |
-| `loft_virtualcluster_name` | The virtual cluster name |
-| `loft_space_name` | The space name |
-| `loft_cluster_name` | The connected cluster name |
-
-### Example queries
-
-**Memory usage by project:**
-```promql
-sum by (loft_project_name) (k8s_pod_memory_usage_bytes{loft_project_name=~".+"}) / 1024 / 1024
-```
-
-**CPU usage by space:**
-```promql
-sum by(loft_space_name) (k8s_pod_cpu_time_seconds_total{loft_space_name=~".+"})
-```
-
-## Example dashboard
-
-The Grafana deployment includes a pre-built dashboard that visualizes CPU and Memory usage aggregated by vCluster, Project, and Space.
-
-
-
-You can also [import](https://grafana.com/docs/grafana/latest/dashboards/build-dashboards/import-dashboards/) additional dashboards or create custom visualizations using the available labels.
diff --git a/platform/maintenance/monitoring/fleet-monitoring.mdx b/platform/maintenance/monitoring/fleet-monitoring.mdx
new file mode 100644
index 000000000..70c44697d
--- /dev/null
+++ b/platform/maintenance/monitoring/fleet-monitoring.mdx
@@ -0,0 +1,1399 @@
+---
+title: "Fleet Monitoring"
+sidebar_label: "Fleet Monitoring"
+sidebar_position: 4
+---
+
+import Flow, { Step } from '@site/src/components/Flow'
+import NavStep from '@site/src/components/NavStep'
+import Button from '@site/src/components/Button'
+import PageVariables from '@site/src/components/PageVariables';
+import InterpolatedCodeBlock from "@site/src/components/InterpolatedCodeBlock";
+
+This guide explains how to configure Prometheus to collect workload metrics
+from across multiple virtual clusters. Metrics can be aggregated by cluster,
+project, and virtual cluster. Because the architecture uses Prometheus
+`remote_write`, it supports both the Shared Nodes and Private Nodes tenancy
+models. Without `remote_write`, private nodes could not be scraped directly
+using Prometheus' regular "pull-based" metrics ingestion model.
+
+:::warning
+This guide is not intended as a ready-made day-2 monitoring solution that can
+simply be copied into your existing infrastructure. Observability in particular
+is highly dependent on the underlying architecture. The guide's goal is to lay
+out general capabilities and show what is possible using a stripped-down example
+architecture, which you can adapt with modifications to your actual use cases.
+:::
+
+## Architecture
+
+
+```mermaid
+%%{init: {'flowchart': {'curve': 'stepAfter'}}}%%
+flowchart TB
+ GlobalProm["Central Prometheus"]
+
+ GlobalProm ~~~ local
+ GlobalProm ~~~ connected
+
+ subgraph local["Local Cluster"]
+ subgraph localPrivVC["Virtual Cluster (Private)"]
+ LocalPromAgent["Prometheus agent"]
+ end
+
+ LocalKPS["Kube-Prom-Stack"]
+
+ subgraph localSharedVC["Virtual Cluster (Shared)"]
+ LocalSharedWorkload["Workload"]
+ end
+ end
+
+ subgraph connected["Connected Cluster"]
+ subgraph connPrivVC["Virtual Cluster (Private)"]
+ ConnPromAgent["Prometheus agent"]
+ end
+
+ ConnKPS["Kube-Prom-Stack"]
+
+ subgraph connSharedVC["Virtual Cluster (Shared)"]
+ ConnSharedWorkload["Workload"]
+ end
+ end
+
+ subgraph connNode["Private Node"]
+ ConnWorkload["Workload"]
+ end
+
+ subgraph localNode["Private Node"]
+ LocalWorkload["Workload"]
+ end
+
+ ConnKPS ---|remote_write| GlobalProm
+ ConnPromAgent ---|remote_write| GlobalProm
+ ConnKPS -->|scrape| connPrivVC
+ ConnKPS -->|scrape| connSharedVC
+ ConnPromAgent -->|scrape| ConnWorkload
+
+ LocalKPS -->|scrape| localPrivVC
+ LocalKPS -->|scrape| localSharedVC
+ LocalPromAgent -->|scrape| LocalWorkload
+ LocalPromAgent ---|remote_write| GlobalProm
+ LocalKPS ---|remote_write| GlobalProm
+
+ connected ~~~ connNode
+ local ~~~ localNode
+
+ classDef magenta fill:#fef,stroke:#d63384
+ classDef dark fill:#f5f5f5,stroke:#333
+ classDef green fill:#cfd,stroke:#8b9
+ classDef blue fill:#cdf,stroke:#89b
+ classDef orange fill:#fed,stroke:#db8
+ classDef purple fill:#dcf,stroke:#a8d
+
+ class local magenta
+ class connected dark
+ class localPrivVC,localSharedVC,connPrivVC,connSharedVC green
+ class localHelmKPS,connHelmKPS blue
+ class localNode,connNode orange
+ class GlobalProm purple
+```
+
+
+The architecture comprises the following:
+
+- Cluster Architecture:
+ - A local cluster that hosts vCluster Platform.
+ - Two virtual clusters running on the local cluster:
+ - One virtual cluster sharing the nodes of the local cluster (Shared Nodes Tenancy Model).
+ - One virtual cluster with two private nodes and node-to-node VPN (Private Nodes Tenancy Model).
+ - An external cluster that is connected to vCluster Platform (hosting vCluster Platform agent).
+ - Two virtual clusters running on the connected cluster:
+ - One virtual cluster sharing the nodes of the connected cluster (Shared Nodes Tenancy Model).
+ - One virtual cluster with two private nodes and node-to-node VPN (Private Nodes Tenancy Model).
+
+- Prometheus Architecture
+ - A central Prometheus (remote_write receiver)
+  - A Prometheus Operator (to scrape each virtual cluster's own metrics via `ServiceMonitors`) and a Prometheus Agent (remote_writer) per Cluster
+ - A Prometheus Agent (remote_writer) per virtual cluster with private nodes (Private Nodes Tenancy Model).
+
+
+## Deploy Prometheus Agent and Prometheus Operator on each Cluster
+
+
+:::info Prerequisites
+The central Prometheus must be configured as a remote write receiver. The following Helm values enable this:
+
+```yaml
+server:
+ extraFlags:
+ - web.enable-remote-write-receiver
+```
+
+Virtual clusters with shared nodes must be deployed with a ServiceMonitor. This
+allows scraping their API server and controller metrics from the Prometheus
+agent running on the host cluster. Enable this in your `vcluster.yaml`:
+
+```yaml
+controlPlane:
+ serviceMonitor:
+ enabled: true
+```
+:::
+
+Deploy Prometheus Agent and Prometheus Operator using the Platform Apps UI.
+
+
+
+Go to the Infra section using the menu on the left, and select the Clusters view.
+
+
+ Click on the Cluster to deploy Prometheus.
+
+
+ Navigate to the Apps tab.
+
+
+ Click and configure a Helm chart with the following settings.
+
+
+
+
+| Setting | Value |
+|---------|-------|
+| Chart Repository URL | `https://prometheus-community.github.io/helm-charts` |
+| Chart Name | `kube-prometheus-stack` |
+| Namespace | `monitoring` |
+| Release Name | `prometheus-agent` |
+
+
+Use the following chart values by specifying the URL of the central Prometheus,
+the namespace of platform/agent, and the cluster name.
+
+:::info
+The steps below must be repeated for each cluster.
+:::
+
+
+
+
+
+
+
+Click to deploy Prometheus.
+
+
+## Deploy Prometheus Agent on each Virtual Cluster with Private Nodes
+
+
+Each virtual cluster with private nodes needs its own Prometheus instance to
+scrape kubelet metrics from its dedicated nodes and forward them to the central
+Prometheus via remote write.
+
+:::info Prerequisites
+Virtual clusters with private nodes must be deployed with node-to-node vCluster VPN enabled. Add the following to your `vcluster.yaml`:
+
+```yaml
+privateNodes:
+ enabled: true
+ vpn:
+ enabled: true
+ nodeToNode:
+ enabled: true
+```
+
+The steps below must be repeated for each virtual cluster.
+:::
+
+
+
+**1. Connect to the virtual cluster:**
+
+
+
+**2. Configure Helm values:**
+
+Save the following as `prometheus-virtualcluster-values.yaml` and set the name of the virtual
+cluster. This is necessary so that any workload running on the private nodes
+can be attributed to its corresponding virtual cluster during aggregation.
+
+
+
+**3. Install Prometheus inside the vCluster:**
+
+
+
+## Golden Signals Queries
+
+With Prometheus deployed and forwarding metrics to the central receiver, you can
+query the aggregated data. This section provides PromQL queries organized around
+the Golden Signals framework.
+
+The [Four Golden Signals](https://sre.google/sre-book/monitoring-distributed-systems/) of monitoring are:
+- Latency
+- Traffic
+- Errors
+- Saturation
+
+The queries below serve two purposes:
+1. To provide a set of labels that makes it possible to aggregate by cluster, project, and virtual cluster.
+2. To adhere to the Golden Signals mentioned above.
+
+### Custom Agent Metrics
+
+The vCluster Platform agent emits a set of custom metrics carrying information
+about virtual clusters as labels. These metrics always return `1` and can
+therefore be joined via PromQL in order to make those labels available for
+aggregation later.
+
+#### `instance_info`
+
+The following labels are attached:
+
+- `kind`: `VirtualClusterInstance`
+- `name`: the name of the instance.
+- `namespace`: the namespace of the instance. This is usually the project namespace.
+- `project`: the Project that the instance belongs to.
+
+#### `virtualcluster_info`
+
+The following labels are attached:
+- `kind`: Kind of the virtual cluster (StatefulSet or Deployment)
+- `name`: Name of the virtual cluster
+- `namespace`: Namespace of the virtual cluster
+- `instance_name`: Name of the accompanying VirtualClusterInstance (only present if the virtual cluster is registered with the platform)
+- `instance_namespace`: Namespace of the accompanying VirtualClusterInstance (only present if the virtual cluster is registered with the platform)
+- `creation_timestamp`: Creation timestamp of the virtual cluster
+
+### Latency
+
+Each query below uses an `instance_info` join to enrich metrics with cluster and
+project labels. The pattern
+`* on (vcluster_name) group_left (...) label_replace(instance_info, ...)`
+maps the virtual cluster name to its corresponding `instance_info` entry,
+copying labels like `cluster` and `project` into the result. Subsequent queries
+use the same technique.
+
+#### kube-apiserver request latency (p99, by verb) (virtual cluster only)
+
+```promql
+histogram_quantile(0.99,
+ sum by (le, verb, cluster, project, vcluster_name) (
+ (
+ rate(apiserver_request_duration_seconds_bucket{vcluster_name!=""}[5m])
+ * on (vcluster_name) group_left (cluster, project)
+ label_replace(instance_info, "vcluster_name", "$1", "name", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ rate(apiserver_request_duration_seconds_bucket{job!~"apiserver|kubelet|loft"}[5m]),
+ "vcluster_name",
+ "$1",
+ "job",
+ "(.*)"
+ )
+ * on (vcluster_name) group_left (project)
+ label_replace(instance_info, "vcluster_name", "$1", "name", "(.*)")
+ )
+ )
+)
+```
+
+**Why:** Shows the tail latency of API server requests broken down by operation
+type (GET, LIST, PUT, POST, PATCH, DELETE, WATCH). The p99 captures outliers
+that averages hide. WATCH is expected to show 60s (long-poll).
+
+#### kube-apiserver request latency (p95, non-WATCH) (virtual cluster only)
+
+```promql
+histogram_quantile(0.95,
+ sum by (le, verb, cluster, project, vcluster_name) (
+ (
+ rate(apiserver_request_duration_seconds_bucket{verb!~"WATCH|CONNECT", vcluster_name!=""}[5m])
+ * on (vcluster_name) group_left (cluster, project)
+ label_replace(instance_info, "vcluster_name", "$1", "name", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ rate(apiserver_request_duration_seconds_bucket{verb!~"WATCH|CONNECT", job!~"apiserver|kubelet|loft"}[5m]),
+ "vcluster_name",
+ "$1",
+ "job",
+ "(.*)"
+ )
+ * on (vcluster_name) group_left (project)
+ label_replace(instance_info, "vcluster_name", "$1", "name", "(.*)")
+ )
+ )
+)
+```
+
+**Why:** Excludes long-running connections to focus on latency for synchronous
+API calls.
+
+#### etcd backend latency (p99, by operation) (virtual cluster only)
+
+```promql
+histogram_quantile(0.99,
+ sum by (le, operation, cluster, project, vcluster_name) (
+ (
+ rate(etcd_request_duration_seconds_bucket{vcluster_name!=""}[5m])
+ * on (vcluster_name) group_left (cluster, project)
+ label_replace(instance_info, "vcluster_name", "$1", "name", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ rate(etcd_request_duration_seconds_bucket{job!~"apiserver|kubelet|loft"}[5m]),
+ "vcluster_name",
+ "$1",
+ "job",
+ "(.*)"
+ )
+ * on (vcluster_name) group_left (project)
+ label_replace(instance_info, "vcluster_name", "$1", "name", "(.*)")
+ )
+ )
+)
+```
+
+**Why:** etcd is the persistence backend. High latencies here (especially for
+`get` and `list`) propagate to every API call.
+
+#### API gateway latency (p99, by route type) (vCluster Platform only)
+
+```promql
+histogram_quantile(0.99,
+ sum(rate(apigateway_kubernetes_request_duration_seconds_bucket[5m])) by (le, cluster)
+)
+```
+
+```promql
+histogram_quantile(0.99,
+ sum(rate(apigateway_auth_request_duration_seconds_bucket[5m])) by (le, cluster)
+)
+```
+
+```promql
+histogram_quantile(0.99,
+ sum(rate(apigateway_ui_request_duration_seconds_bucket[5m])) by (le, cluster)
+)
+```
+
+**Why:** The vCluster Platform apigateway proxies Kubernetes, auth, and UI
+requests. These three queries cover end-to-end latency as experienced by
+platform users.
+
+#### Kubelet pod start latency (p99) (vCluster Platform only)
+
+```promql
+histogram_quantile(0.99,
+ sum(rate(kubelet_pod_start_sli_duration_seconds_bucket[5m])) by (le, cluster)
+)
+```
+
+**Why:** Measures how quickly the kubelet can launch new pods. Critical for
+scaling responsiveness.
+
+#### Scheduler end-to-end scheduling latency (p99) (vCluster Platform only)
+
+```promql
+histogram_quantile(0.99,
+ sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket[5m])) by (le, result, cluster)
+)
+```
+
+**Why:** How long the scheduler takes to place a pod. Slow scheduling causes
+queuing and delays workload startup.
+
+### Traffic
+
+#### kube-apiserver request rate (by verb) (virtual cluster only)
+
+```promql
+sum by (verb, cluster, project, instance_name) (
+ (
+ rate(apiserver_request_total{instance_name!=""}[5m])
+ * on (instance_name) group_left (cluster, project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ rate(apiserver_request_total{job!~"apiserver|kubelet|loft"}[5m]),
+ "instance_name",
+ "$1",
+ "job",
+ "(.*)"
+ )
+ * on (instance_name) group_left (project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+)
+```
+
+**Why:** The most fundamental measure of cluster workload. Shows how many requests per second the API server handles, broken down by verb.
+
+#### kube-apiserver request rate (by resource) (virtual cluster only)
+
+```promql
+topk(10,
+ sum by (resource, cluster, project, instance_name) (
+ (
+ rate(apiserver_request_total{instance_name!=""}[5m])
+ * on (instance_name) group_left (cluster, project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ rate(apiserver_request_total{job!~"apiserver|kubelet|loft"}[5m]),
+ "instance_name",
+ "$1",
+ "job",
+ "(.*)"
+ )
+ * on (instance_name) group_left (project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+ )
+)
+```
+
+**Why:** Identifies which Kubernetes resources generate the most API traffic, revealing "hot" resource types.
+
+#### API gateway traffic (by route type) (vCluster Platform only)
+
+```promql
+sum(rate(apigateway_kubernetes_request_total[5m])) by (cluster)
+```
+
+```promql
+sum(rate(apigateway_auth_request_total[5m])) by (cluster)
+```
+
+```promql
+sum(rate(apigateway_ui_request_total[5m])) by (cluster)
+```
+
+**Why:** Measures platform-level traffic through the gateway, split by Kubernetes API proxy, auth, and UI.
+
+#### REST client outbound request rate (by code) (virtual cluster only)
+
+```promql
+sum by (code, cluster, project, instance_name) (
+ (
+ rate(rest_client_requests_total{instance_name!=""}[5m])
+ * on (instance_name) group_left (cluster, project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ rate(rest_client_requests_total{job!~"apiserver|kubelet|loft"}[5m]),
+ "instance_name",
+ "$1",
+ "job",
+ "(.*)"
+ )
+ * on (instance_name) group_left (project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+)
+```
+
+**Why:** How many outbound API calls the control-plane components make.
+
+#### Network I/O rate (by namespace) (virtual cluster only)
+
+```promql
+topk(10,
+ sum by (namespace, cluster, project, instance_name) (
+ (
+ label_replace(
+ rate(container_network_receive_bytes_total{instance_name=""}[5m])
+ * on (namespace) group_left (name, project)
+ (virtualcluster_info * on (name) group_left (project) instance_info),
+ "instance_name",
+ "$1",
+ "name",
+ "(.*)"
+ )
+ * on (pod, cluster) group_left (namespace)
+ label_replace(kube_pod_labels, "namespace", "$1", "label_vcluster_loft_sh_namespace", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ rate(container_network_receive_bytes_total{instance_name!=""}[5m]),
+ "name",
+ "$1",
+ "instance_name",
+ "(.*)"
+ )
+ * on (name) group_left (cluster, project)
+ instance_info
+ )
+ )
+)
+```
+
+```promql
+topk(10,
+ sum by (namespace, cluster, project, instance_name) (
+ (
+ label_replace(
+ rate(container_network_transmit_bytes_total{instance_name=""}[5m])
+ * on (namespace) group_left (name, project)
+ (virtualcluster_info * on (name) group_left (project) instance_info),
+ "instance_name",
+ "$1",
+ "name",
+ "(.*)"
+ )
+ * on (pod, cluster) group_left (namespace)
+ label_replace(kube_pod_labels, "namespace", "$1", "label_vcluster_loft_sh_namespace", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ rate(container_network_transmit_bytes_total{instance_name!=""}[5m]),
+ "name",
+ "$1",
+ "instance_name",
+ "(.*)"
+ )
+ * on (name) group_left (cluster, project)
+ instance_info
+ )
+ )
+)
+```
+
+**Why:** Measures network throughput per namespace, revealing which workloads generate the most network traffic.
+
+### Errors
+
+#### kube-apiserver error rate (4xx/5xx, by code) (virtual cluster only)
+
+```promql
+sum by (code, cluster, project, instance_name) (
+ (
+ rate(apiserver_request_total{code=~"[45]..", instance_name!=""}[5m])
+ * on (instance_name) group_left (cluster, project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ rate(apiserver_request_total{code=~"[45]..", job!~"apiserver|kubelet|loft"}[5m]),
+ "instance_name",
+ "$1",
+ "job",
+ "(.*)"
+ )
+ * on (instance_name) group_left (project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+)
+```
+
+**Why:** HTTP-level error rates.
+
+#### kube-apiserver error ratio (errors / total) (virtual cluster only)
+
+```promql
+sum by (cluster, project, instance_name) (
+ (
+ rate(apiserver_request_total{code=~"5..", instance_name!=""}[5m])
+ * on (instance_name) group_left (cluster, project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ rate(apiserver_request_total{code=~"5..", job!~"apiserver|kubelet|loft"}[5m]),
+ "instance_name",
+ "$1",
+ "job",
+ "(.*)"
+ )
+ * on (instance_name) group_left (project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+)
+/
+sum by (cluster, project, instance_name) (
+ (
+ rate(apiserver_request_total{instance_name!=""}[5m])
+ * on (instance_name) group_left (cluster, project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ rate(apiserver_request_total{job!~"apiserver|kubelet|loft"}[5m]),
+ "instance_name",
+ "$1",
+ "job",
+ "(.*)"
+ )
+ * on (instance_name) group_left (project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+)
+```
+
+**Why:** The fraction of server-side errors. A ratio above 1% is a red flag.
+
+#### etcd request errors (virtual cluster only)
+
+```promql
+sum by (operation, cluster, project, instance_name) (
+ (
+ rate(etcd_request_errors_total{instance_name!=""}[5m])
+ * on (instance_name) group_left (cluster, project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ rate(etcd_request_errors_total{job!~"apiserver|kubelet|loft"}[5m]),
+ "instance_name",
+ "$1",
+ "job",
+ "(.*)"
+ )
+ * on (instance_name) group_left (project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+)
+```
+
+**Why:** Backend storage errors directly impact cluster health.
+
+#### Container OOM kills (virtual cluster only)
+
+```promql
+sum by (namespace, pod, cluster, project, instance_name) (
+ (
+ label_replace(
+ rate(container_oom_events_total{instance_name=""}[5m])
+ * on (namespace) group_left (name, project)
+ (virtualcluster_info * on (name) group_left (project) instance_info),
+ "instance_name",
+ "$1",
+ "name",
+ "(.*)"
+ )
+ * on (pod, cluster) group_left (namespace)
+ label_replace(kube_pod_labels, "namespace", "$1", "label_vcluster_loft_sh_namespace", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ rate(container_oom_events_total{instance_name!=""}[5m]),
+ "name",
+ "$1",
+ "instance_name",
+ "(.*)"
+ )
+ * on (name) group_left (cluster, project)
+ instance_info
+ )
+)
+```
+
+**Why:** Out-of-memory kills indicate resource misconfiguration.
+
+#### Kubelet runtime operation errors (vCluster Platform only)
+
+```promql
+sum(rate(kubelet_runtime_operations_errors_total[5m])) by (operation_type, cluster)
+```
+
+**Why:** Container runtime failures (image pulls, container create/start failures).
+
+#### REST client error rate (outbound 5xx) (virtual cluster only)
+
+```promql
+sum by (host, cluster, project, instance_name) (
+ (
+ rate(rest_client_requests_total{code=~"5..", instance_name!=""}[5m])
+ * on (instance_name) group_left (cluster, project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ rate(rest_client_requests_total{code=~"5..", job!~"apiserver|kubelet|loft"}[5m]),
+ "instance_name",
+ "$1",
+ "job",
+ "(.*)"
+ )
+ * on (instance_name) group_left (project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+)
+```
+
+**Why:** 5xx responses received by control-plane components when they make outbound HTTP calls (primarily to the Kubernetes API server).
+
+#### Container restart rate (vCluster Platform only)
+
+```promql
+sum(rate(kubelet_restarted_pods_total[5m])) by (static, cluster)
+```
+
+**Why:** Frequent pod restarts indicate crashloops or unhealthy workloads.
+
+### Saturation
+
+#### Container CPU usage (top pods) (virtual cluster only)
+
+```promql
+topk(10,
+ sum by (namespace, pod, cluster, project, instance_name) (
+ (
+ label_replace(
+ rate(container_cpu_usage_seconds_total{instance_name=""}[5m])
+ * on (namespace) group_left (name, project)
+ (virtualcluster_info * on (name) group_left (project) instance_info),
+ "instance_name",
+ "$1",
+ "name",
+ "(.*)"
+ )
+ * on (pod, cluster) group_left (namespace)
+ label_replace(kube_pod_labels, "namespace", "$1", "label_vcluster_loft_sh_namespace", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ rate(container_cpu_usage_seconds_total{instance_name!=""}[5m]),
+ "name",
+ "$1",
+ "instance_name",
+ "(.*)"
+ )
+ * on (name) group_left (cluster, project)
+ instance_info
+ )
+ )
+)
+```
+
+**Why:** Shows the most CPU-hungry pods across the cluster.
+
+#### Container memory working set (top pods) (virtual cluster only)
+
+```promql
+topk(10,
+ sum by (namespace, pod, cluster, project, instance_name) (
+ (
+ label_replace(
+ container_memory_working_set_bytes{instance_name=""}
+ * on (namespace) group_left (name, project)
+ (virtualcluster_info * on (name) group_left (project) instance_info),
+ "instance_name",
+ "$1",
+ "name",
+ "(.*)"
+ )
+ * on (pod, cluster) group_left (namespace)
+ label_replace(kube_pod_labels, "namespace", "$1", "label_vcluster_loft_sh_namespace", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ container_memory_working_set_bytes{instance_name!=""},
+ "name",
+ "$1",
+ "instance_name",
+ "(.*)"
+ )
+ * on (name) group_left (cluster, project)
+ instance_info
+ )
+ )
+)
+```
+
+**Why:** Working set is the "real" memory usage that matters for OOM decisions.
+
+#### CPU throttling ratio (by pod) (virtual cluster only)
+
+```promql
+topk(10,
+ sum by (namespace, pod, cluster, project, instance_name) (
+ (
+ label_replace(
+ rate(container_cpu_cfs_throttled_periods_total{instance_name=""}[5m])
+ * on (namespace) group_left (name, project)
+ (virtualcluster_info * on (name) group_left (project) instance_info),
+ "instance_name",
+ "$1",
+ "name",
+ "(.*)"
+ )
+ * on (pod, cluster) group_left (namespace)
+ label_replace(kube_pod_labels, "namespace", "$1", "label_vcluster_loft_sh_namespace", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ rate(container_cpu_cfs_throttled_periods_total{instance_name!=""}[5m]),
+ "name",
+ "$1",
+ "instance_name",
+ "(.*)"
+ )
+ * on (name) group_left (cluster, project)
+ instance_info
+ )
+ )
+ /
+ sum by (namespace, pod, cluster, project, instance_name) (
+ (
+ label_replace(
+ rate(container_cpu_cfs_periods_total{instance_name=""}[5m])
+ * on (namespace) group_left (name, project)
+ (virtualcluster_info * on (name) group_left (project) instance_info),
+ "instance_name",
+ "$1",
+ "name",
+ "(.*)"
+ )
+ * on (pod, cluster) group_left (namespace)
+ label_replace(kube_pod_labels, "namespace", "$1", "label_vcluster_loft_sh_namespace", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ rate(container_cpu_cfs_periods_total{instance_name!=""}[5m]),
+ "name",
+ "$1",
+ "instance_name",
+ "(.*)"
+ )
+ * on (name) group_left (cluster, project)
+ instance_info
+ )
+ )
+)
+```
+
+**Why:** Shows which pods are being throttled by cgroup CPU limits.
+
+#### kube-apiserver inflight requests (virtual cluster only)
+
+```promql
+(
+ apiserver_current_inflight_requests{instance_name!=""}
+ * on (instance_name) group_left (cluster, project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+)
+or
+(
+ label_replace(
+ apiserver_current_inflight_requests{job!~"apiserver|kubelet|loft"},
+ "instance_name",
+ "$1",
+ "job",
+ "(.*)"
+ )
+ * on (instance_name) group_left (project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+)
+```
+
+**Why:** Shows current request concurrency for mutating vs read-only. When this approaches flow control limits, requests start queuing.
+
+#### kube-apiserver flow-control queue depth (virtual cluster only)
+
+```promql
+sum by (priority_level, cluster, project, instance_name) (
+ (
+ apiserver_flowcontrol_current_inqueue_requests{instance_name!=""}
+ * on (instance_name) group_left (cluster, project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ apiserver_flowcontrol_current_inqueue_requests{job!~"apiserver|kubelet|loft"},
+ "instance_name",
+ "$1",
+ "job",
+ "(.*)"
+ )
+ * on (instance_name) group_left (project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+)
+```
+
+**Why:** Requests waiting in priority-level queues. Non-zero means the API server is saturated for that priority level.
+
+#### Scheduler pending pods (by queue) (vCluster Platform only)
+
+```promql
+scheduler_pending_pods
+```
+
+**Why:** Pods waiting for scheduling. A non-zero count in the `unschedulable` queue means pods cannot be placed — due to exhausted cluster capacity or unsatisfiable scheduling constraints (taints, affinity, resource requests).
+
+#### Workqueue depth (by queue name) (virtual cluster only)
+
+```promql
+topk(10,
+ (
+ workqueue_depth{instance_name!=""}
+ * on (instance_name) group_left (cluster, project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ workqueue_depth{job!~"apiserver|kubelet|loft"},
+ "instance_name",
+ "$1",
+ "job",
+ "(.*)"
+ )
+ * on (instance_name) group_left (project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+)
+```
+
+**Why:** Controller work queues. A steadily growing depth means controllers cannot keep up with the incoming event rate.
+
+#### WATCH connection count (long-running requests) (virtual cluster only)
+
+```promql
+sum by (verb, cluster, project, instance_name) (
+ (
+ apiserver_longrunning_requests{instance_name!=""}
+ * on (instance_name) group_left (cluster, project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+ or
+ (
+ label_replace(
+ apiserver_longrunning_requests{job!~"apiserver|kubelet|loft"},
+ "instance_name",
+ "$1",
+ "job",
+ "(.*)"
+ )
+ * on (instance_name) group_left (project)
+ label_replace(instance_info, "instance_name", "$1", "name", "(.*)")
+ )
+)
+```
+
+**Why:** A proxy for how many controllers/informers are active. Sudden spikes may indicate reconnect storms.
+
+#### Node CPU usage rate (vCluster Platform only)
+
+```promql
+rate(node_cpu_usage_seconds_total[5m])
+```
+
+**Why:** Overall node-level CPU consumption from kubelet resource metrics.
+
+#### Node memory working set (vCluster Platform only)
+
+```promql
+node_memory_working_set_bytes
+```
+
+**Why:** Overall node memory pressure.
+
+#### Filesystem usage (by PVC / volume) (vCluster Platform only)
+
+```promql
+kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes
+```
+
+**Why:** Volume fill percentage. Approaching 100% means workloads will fail.