Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
220 changes: 42 additions & 178 deletions deploy/helm/AGENTS.md

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions deploy/helm/moai-inference-framework/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ Moreh Inference Framework

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| alerts.heimdall.enabled | bool | `false` | Enable Heimdall alert provisioning. Requires `slack.webhookUrl` or `slack.existingSecret`. Set `prometheus-stack.grafana.grafana.ini.server.root_url` for clickable links in Slack messages. |
| alerts.heimdall.slack.existingSecret | string | `""` | Externally-managed Secret holding the webhook URL. Resolved by Helm `lookup` at install/upgrade time and takes precedence over `webhookUrl`; renders empty under `helm template`/`--dry-run` (no cluster access). |
| alerts.heimdall.slack.secretKeys.webhookUrlKey | string | `"webhook-url"` | Data key inside `existingSecret` that stores the webhook URL. |
| alerts.heimdall.slack.webhookUrl | string | `""` | Slack webhook URL (inline). Used only when `existingSecret` is empty. SECRET — pass via `--set-file` or an external secrets operator; never commit. |
| commonLabels | object | `{}` | Labels applied to all resources. |
| ecrTokenRefresher.aws.accessKeyId | string | `""` | AWS_ACCESS_KEY_ID |
| ecrTokenRefresher.aws.region | string | `"ap-northeast-2"` | AWS Region. |
Expand Down Expand Up @@ -131,6 +135,7 @@ Moreh Inference Framework
| prometheus-stack.defaultRules.create | bool | `false` | |
| prometheus-stack.enabled | bool | `true` | Enable prometheus-community/kube-prometheus-stack. Set to false if already deployed. |
| prometheus-stack.grafana.enabled | bool | `true` | |
| prometheus-stack.grafana.sidecar.alerts.enabled | bool | `true` | |
| prometheus-stack.grafana.sidecar.dashboards.enabled | bool | `true` | |
| prometheus-stack.kubeApiServer.enabled | bool | `false` | |
| prometheus-stack.kubeControllerManager.enabled | bool | `false` | |
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Grafana notification-policy provisioning for org 1.
# Alerts carrying the label `component=heimdall` are routed to the
# `heimdall-slack` receiver; everything else stays on the root receiver
# `grafana-default-email`.
# NOTE(review): Grafana treats the notification policy tree as a single
# resource, so provisioning this file presumably replaces any existing tree
# for the org — confirm before enabling on a shared Grafana.
apiVersion: 1
policies:
- orgId: 1
# Root (fallback) receiver for alerts that match no child route.
receiver: grafana-default-email
group_by:
- grafana_folder
- alertname
routes:
# The receiver name must match the name of a provisioned contact point.
- receiver: heimdall-slack
# Matcher triple: [label, operator, value].
object_matchers:
- - component
- "="
- heimdall
# Notification timing for this route: initial wait, interval between updates
# for a group, and re-notification interval.
group_wait: 30s
group_interval: 5m
repeat_interval: 4h
# Stop evaluating sibling routes once this one matches.
continue: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Grafana alert-rule provisioning: one rule group in folder "Heimdall",
# evaluated every minute, containing a single Loki-based error-log rule.
apiVersion: 1
groups:
- orgId: 1
name: Heimdall Error Alerts
folder: Heimdall
interval: 1m
rules:
# The uid is also embedded in the `ruleURL` annotation below; keep them in sync.
- uid: heimdall-error-log-burst
title: Heimdall Error Log Burst
# The rule fires on the result of expression B (threshold over query A).
condition: B
data:
# Query A (LogQL, instant, last 5 minutes): parses each error-level line as
# JSON and builds an `error_summary` label from `message` — plus `error` when
# present — truncating each part (180 chars each, or 300 for `message` alone).
# Lines are then counted per (instance, namespace, error_summary). Those
# labels propagate to the alert labels so Slack messages can render them.
- refId: A
relativeTimeRange:
from: 300
to: 0
# Must match the `uid` of the provisioned Loki datasource.
datasourceUid: loki
model:
refId: A
datasource:
type: loki
uid: loki
expr: |
sum by (instance, namespace, error_summary) (
count_over_time(
{app="heimdall-inference-scheduler", level="error"}
| json
| label_format error_summary=`{{ if .error }}{{ printf "%.180s" .message }}: {{ printf "%.180s" .error }}{{ else }}{{ printf "%.300s" .message }}{{ end }}`
[5m]
)
)
queryType: instant
intervalMs: 1000
maxDataPoints: 43200
# Expression B: threshold on A — true for every series whose count is > 0.
- refId: B
datasourceUid: __expr__
model:
refId: B
datasource:
type: __expr__
uid: __expr__
type: threshold
expression: A
conditions:
- evaluator:
type: gt
params: [0]
operator:
type: and
query:
params: []
reducer:
type: last
params: []
type: query
intervalMs: 1000
maxDataPoints: 43200
# Pending period: the condition must hold for 1m before the alert fires.
for: 1m
# No matching error logs yields no series; treat that as healthy.
noDataState: OK
execErrState: Error
annotations:
summary: Heimdall error logs detected
description: '{{ $values.A.Value }} error log entries detected in the last 5 minutes.'
# Grafana Explore deep link, pre-filled with a LogQL selector for Heimdall
# error logs over the last 30 minutes (URL-encoded `panes` JSON).
# `{{ externalURL }}` is substituted by Grafana at alert-evaluation time
# from the configured `server.root_url`, so the link resolves to whichever
# Grafana instance is fronting the cluster (no chart-side override needed).
# Note: Grafana's substituted `externalURL` always carries a trailing slash,
# so the path here must NOT start with one to avoid `//path` in the final URL.
# The Slack template renders this annotation as a clickable link.
exploreURL: '{{ externalURL }}explore?schemaVersion=1&orgId=1&panes=%7B%22h1%22%3A%7B%22datasource%22%3A%22loki%22%2C%22queries%22%3A%5B%7B%22refId%22%3A%22A%22%2C%22datasource%22%3A%7B%22type%22%3A%22loki%22%2C%22uid%22%3A%22loki%22%7D%2C%22expr%22%3A%22%7Bapp%3D%5C%22heimdall-inference-scheduler%5C%22%2Clevel%3D%5C%22error%5C%22%7D%22%7D%5D%2C%22range%22%3A%7B%22from%22%3A%22now-30m%22%2C%22to%22%3A%22now%22%7D%7D%7D'
# Direct link to this alert rule's view page (same `externalURL` source);
# the path segment is the rule uid declared above.
ruleURL: '{{ externalURL }}alerting/grafana/heimdall-error-log-burst/view'
labels:
severity: warning
# Matched by the notification policy to route this alert to Slack.
component: heimdall
isPaused: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Grafana notification-template provisioning for Heimdall Slack messages.
# Defines two named templates used by the `heimdall-slack` contact point:
#   heimdall-slack.title - "[STATUS:<firing count>] <alertname>"
#   heimdall-slack.body  - per-alert summary, error message, context and links
apiVersion: 1
templates:
  - orgId: 1
    name: heimdall-slack-templates
    # Trim markers (`{{-` / `-}}`) keep the rendered title on a single line
    # (no leading/trailing newline) and stop absent optional labels from
    # leaving blank lines in the Context list.
    template: |
      {{ define "heimdall-slack.title" -}}
      [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }}
      {{- end }}

      {{ define "heimdall-slack.body" }}
      {{ range .Alerts }}
      :pushpin: *Summary*: {{ .Annotations.summary }}
      :memo: *Description*: {{ .Annotations.description }}

      {{ if .Labels.error_summary }}:warning: *Error message*:
      ```{{ .Labels.error_summary }}```{{ end }}

      *Context*
      {{- if .Labels.namespace }}
      • Namespace: `{{ .Labels.namespace }}`
      {{- end }}
      {{- if .Labels.instance }}
      • Instance: `{{ .Labels.instance }}`
      {{- end }}
      • Severity: `{{ .Labels.severity }}`
      • Component: `{{ .Labels.component }}`
      {{- if .Labels.environment }}
      • Environment: `{{ .Labels.environment }}`
      {{- end }}

      {{ if .Annotations.exploreURL }}:mag: <{{ .Annotations.exploreURL }}|View error logs in Grafana>{{ end }}
      {{ if .Annotations.ruleURL }}:bell: <{{ .Annotations.ruleURL }}|View alert rule>{{ end }}
      {{ end }}
      {{ end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{{- /*
Renders one ConfigMap per provisioning file matching `files/alerts/*.yaml`,
labelled so the Grafana alerts sidecar picks it up. Only rendered when the
prometheus-stack, its Grafana, the alerts sidecar and `alerts.heimdall` are
all enabled.

File contents are embedded verbatim via `.Files.Get` (not passed through
`tpl`), so any `{{ ... }}` inside them is left for Grafana to evaluate.
*/ -}}
{{- $ps := index .Values "prometheus-stack" }}
{{- if and $ps.enabled $ps.grafana.enabled $ps.grafana.sidecar.alerts.enabled .Values.alerts.heimdall.enabled }}
{{- /* Ranging over an empty glob renders nothing, so no separate emptiness guard is needed. */ -}}
{{- range $path, $_ := .Files.Glob "files/alerts/*.yaml" }}
{{- $fileName := base $path }}
---
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: {{ include "common.names.namespace" $ }}
  name: {{ include "common.names.name" $ }}-alert-{{ trimSuffix ".yaml" $fileName }}
  {{- /* Emit the key only when annotations exist; a bare `annotations:` renders as null. */}}
  {{- with $ps.grafana.sidecar.alerts.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
  labels:
    {{ tpl $ps.grafana.sidecar.alerts.label $ }}: {{ ((tpl $ps.grafana.sidecar.alerts.labelValue $) | default 1) | quote }}
    {{- include "mif.labels" $ | nindent 4 }}
data:
  {{ $fileName }}: |-
    {{- $.Files.Get $path | nindent 4 }}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ data:
apiVersion: 1
datasources:
- name: Loki
# Stable UID so other provisioned resources (e.g. the Heimdall alert
# rules under `files/alerts/`) can reference this datasource by a
# known identifier instead of the random UID Grafana would otherwise
# assign.
uid: loki
type: loki
access: proxy
url: http://{{ .Release.Name }}-loki-gateway.{{ include "common.names.namespace" . }}.svc.cluster.local
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
{{- /*
Provisions the `heimdall-slack` Grafana contact point as a ConfigMap picked up
by the alerts sidecar. The Slack webhook URL comes from `existingSecret`
(resolved with `lookup`) when set, otherwise from the inline `webhookUrl`.
If no non-empty URL can be resolved, nothing is rendered.

NOTE(review): the resolved webhook URL is written in clear text into a
ConfigMap. If the sidecar is configured to watch Secrets as well, emitting a
Secret here would avoid exposing it — confirm and consider.
NOTE(review): when this ConfigMap is skipped, any provisioned policy that
routes to `heimdall-slack` refers to a contact point that does not exist —
verify how Grafana handles that.
*/ -}}
{{- $ps := index .Values "prometheus-stack" }}
{{- if and $ps.enabled $ps.grafana.enabled $ps.grafana.sidecar.alerts.enabled .Values.alerts.heimdall.enabled }}
{{- $slack := .Values.alerts.heimdall.slack }}
{{- $webhookUrl := "" }}
{{- if $slack.existingSecret }}
  {{- /*
  Resolve the URL from an externally-managed Secret at key
  `secretKeys.webhookUrlKey`. `lookup` returns an empty result during
  `helm template` and `helm install --dry-run` (no cluster access), so the
  rendered URL is empty there — the ConfigMap is then skipped and alerts will
  not deliver until applied against a real cluster. Takes precedence over
  `webhookUrl`.

  Nested `with` blocks are used instead of `and ... (index ...)` so that
  `index` is never evaluated on a missing `data` map, regardless of whether
  the template engine short-circuits `and`.
  */ -}}
  {{- $existing := lookup "v1" "Secret" (include "common.names.namespace" .) $slack.existingSecret }}
  {{- $key := $slack.secretKeys.webhookUrlKey }}
  {{- with $existing.data }}
    {{- with index . $key }}
      {{- $webhookUrl = b64dec . }}
    {{- end }}
  {{- end }}
{{- else }}
  {{- /* `default ""` keeps `trim` below from failing when the value is null. */ -}}
  {{- $webhookUrl = $slack.webhookUrl | default "" }}
{{- end }}
{{- /*
Trim surrounding whitespace, including trailing newlines that creep in
when operators load the URL with `--set-file` or from a Secret whose
data was stored from a file. Grafana's contact-point provisioning
rejects the URL otherwise (treats `https://...\n` as an invalid URL).
*/ -}}
{{- $webhookUrl = trim $webhookUrl }}
{{- if $webhookUrl }}
---
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: {{ include "common.names.namespace" . }}
  name: {{ include "common.names.name" . }}-alert-heimdall-slack-contact-points
  {{- /* Emit the key only when annotations exist; a bare `annotations:` renders as null. */}}
  {{- with $ps.grafana.sidecar.alerts.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
  labels:
    {{ tpl $ps.grafana.sidecar.alerts.label . }}: {{ ((tpl $ps.grafana.sidecar.alerts.labelValue .) | default 1) | quote }}
    {{- include "mif.labels" . | nindent 4 }}
data:
  heimdall-slack-contact-points.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: heimdall-slack
        receivers:
          - uid: heimdall-slack
            type: slack
            disableResolveMessage: false
            settings:
              url: {{ $webhookUrl | quote }}
              # Backtick-quoted so Helm emits the Grafana template calls literally.
              title: '{{`{{ template "heimdall-slack.title" . }}`}}'
              text: '{{`{{ template "heimdall-slack.body" . }}`}}'
{{- end }}
{{- end }}
26 changes: 26 additions & 0 deletions deploy/helm/moai-inference-framework/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ prometheus-stack:
sidecar:
dashboards:
enabled: true
alerts:
enabled: true
kubernetesServiceMonitors:
enabled: true
kubeApiServer:
Expand Down Expand Up @@ -59,6 +61,30 @@ prometheus-stack:
thanosRuler:
enabled: false

# Heimdall alert provisioning consumed by the grafana-sc-alerts sidecar.
# Rules, templates, policies, and the `heimdall-slack` contact point are all
# delivered as ConfigMaps carrying the sidecar's alert label (`grafana_alert=1`
# unless `prometheus-stack.grafana.sidecar.alerts.label`/`labelValue` are
# overridden). Routing (`component=heimdall` -> `heimdall-slack`) is fixed in
# the provisioned policy file and is not configurable through values.
alerts:
heimdall:
# -- Enable Heimdall alert provisioning. Requires `slack.webhookUrl` or
# `slack.existingSecret`. Set `prometheus-stack.grafana.grafana.ini.server.root_url`
# for clickable links in Slack messages.
enabled: false

# Source of the Slack webhook URL. If neither option below yields a non-empty
# URL (after trimming whitespace), the contact-point ConfigMap is not rendered.
slack:
# -- Slack webhook URL (inline). Used only when `existingSecret` is empty.
# SECRET — pass via `--set-file` or an external secrets operator; never commit.
webhookUrl: ""
# -- Externally-managed Secret holding the webhook URL. Resolved by Helm
# `lookup` at install/upgrade time and takes precedence over `webhookUrl`;
# renders empty under `helm template`/`--dry-run` (no cluster access).
existingSecret: ""
# Mirrors the Bitnami `secretKeys.<role>Key` pattern so operators can
# match the field name to the secret's actual data key.
secretKeys:
# -- Data key inside `existingSecret` that stores the webhook URL.
webhookUrlKey: webhook-url
Comment thread
hhk7734 marked this conversation as resolved.
Comment thread
seongsu-dev marked this conversation as resolved.

lws:
# -- Enable kubernetes-sigs/lws. Set to false if already deployed.
enabled: true
Expand Down
Loading