diff --git a/.tekton/integration-tests/pipeline/lightspeed-stack-integration-test.yaml b/.tekton/integration-tests/pipeline/lightspeed-stack-integration-test.yaml index 3110ccea7..266ea1b96 100644 --- a/.tekton/integration-tests/pipeline/lightspeed-stack-integration-test.yaml +++ b/.tekton/integration-tests/pipeline/lightspeed-stack-integration-test.yaml @@ -167,6 +167,8 @@ spec: echo "========== End parameters ==========" - name: lightspeed-stack-integration-tests description: Task to run integration tests from lightspeed-stack repository + # Full Behave suite (proxy + tls) can exceed 2h; needs PipelineRun timeouts >= this value. + timeout: 3h params: - name: SNAPSHOT value: $(params.SNAPSHOT) diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/e2e-mock-tls-inference.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/e2e-mock-tls-inference.yaml new file mode 100644 index 000000000..6797de24a --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/e2e-mock-tls-inference.yaml @@ -0,0 +1,104 @@ +# Mock HTTPS OpenAI API for tls.feature (Konflux / Prow; no Docker Compose). 
+# Llama Stack run.yaml uses https://e2e-mock-tls-inference.<namespace>.svc.cluster.local:8443|8444|8445/v1 +apiVersion: v1 +kind: Pod +metadata: + name: e2e-mock-tls-inference + labels: + app: e2e-mock-tls-inference +spec: + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + containers: + - name: e2e-mock-tls-inference + image: python:3.12-slim + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsNonRoot: true + runAsUser: 1000 + seccompProfile: + type: RuntimeDefault + env: + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: PYTHONPATH + value: /app:/tmp/pydeps + command: + - /bin/sh + - -c + - | + set -e + pip install --quiet --no-cache-dir --target /tmp/pydeps 'trustme>=1.2.1' 'cryptography>=42.0.0' + NS="${POD_NAMESPACE:-default}" + export TLS_CERT_DNS_NAMES="mock-tls-inference,localhost,127.0.0.1,e2e-mock-tls-inference,e2e-mock-tls-inference.${NS}.svc.cluster.local" + exec python /app/server.py + ports: + - containerPort: 8443 + name: tls + - containerPort: 8444 + name: mtls + - containerPort: 8445 + name: mismatch + volumeMounts: + - name: server-script + mountPath: /app/server.py + subPath: server.py + readOnly: true + - name: certs-work + mountPath: /certs + readinessProbe: + exec: + command: + - python3 + - -c + - | + import ssl, urllib.request + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + urllib.request.urlopen("https://localhost:8443/health", context=ctx) + initialDelaySeconds: 8 + periodSeconds: 5 + livenessProbe: + exec: + command: + - python3 + - -c + - | + import ssl, urllib.request + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + urllib.request.urlopen("https://localhost:8443/health", context=ctx) + initialDelaySeconds: 15 + periodSeconds: 20 + volumes: + - name: server-script + configMap: + name: e2e-mock-tls-inference-script + - name: 
certs-work + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: e2e-mock-tls-inference +spec: + selector: + app: e2e-mock-tls-inference + ports: + - name: tls + port: 8443 + targetPort: tls + - name: mtls + port: 8444 + targetPort: mtls + - name: mismatch + port: 8445 + targetPort: mismatch diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-openai.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-openai.yaml index db166d0eb..3a8d56674 100644 --- a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-openai.yaml +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-openai.yaml @@ -201,6 +201,10 @@ spec: mountPath: /tmp/interception-proxy-ca.pem subPath: ca.pem readOnly: true + # tls.feature: client/CA PEMs from Secret e2e-mock-tls-certs (optional). + - name: mock-tls-certs + mountPath: /certs + readOnly: true volumes: - name: app-root emptyDir: {} @@ -217,3 +221,7 @@ spec: secret: secretName: e2e-interception-proxy-ca optional: true + - name: mock-tls-certs + secret: + secretName: e2e-mock-tls-certs + optional: true diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index 7f7b3d9a4..b80b41914 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -25,6 +25,9 @@ # disrupt-llama-stack - Delete llama-stack pod to disrupt connection # deploy-e2e-tunnel-proxy - Deploy in-cluster tunnel proxy (proxy.feature step) # deploy-e2e-interception-proxy - Deploy in-cluster interception proxy (proxy.feature step) +# deploy-e2e-mock-tls-inference - Deploy mock HTTPS inference server (tls.feature step) +# sync-mock-tls-certs-secret - Publish /certs PEMs to Secret for llama-stack mount +# dump-pod-logs <pod> [container] - Print events, describe, init + container logs (on failure) set -e @@ -40,21 +43,94 @@ E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks- # Helper functions # 
============================================================================ +# Print diagnostics to stdout (captured by Behave as CAPTURED STDOUT). +e2e_ops_dump_pod_logs() { + local pod_name="${1:?pod name required}" + local preferred_container="${2:-}" + local log_tail="${3:-200}" + local prefix="[e2e-ops] " + local init_ctr ctr restart_count phase + + echo "${prefix}========== failure logs: pod/$pod_name (namespace $NAMESPACE) ==========" + + echo "${prefix}--- events for pod/$pod_name ---" + oc get events -n "$NAMESPACE" --field-selector "involvedObject.name=${pod_name}" \ + --sort-by='.lastTimestamp' 2>&1 | tail -50 | sed "s/^/${prefix}/" \ + || echo "${prefix}(could not list events)" + + if ! oc get pod "$pod_name" -n "$NAMESPACE" &>/dev/null; then + echo "${prefix}pod/$pod_name not found (deleted or never created)" + oc get pods -n "$NAMESPACE" -o wide 2>&1 | sed "s/^/${prefix}/" || true + echo "${prefix}========== end failure logs: pod/$pod_name ==========" + return 0 + fi + + phase=$(oc get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "?") + echo "${prefix}pod phase=$phase" + oc get pod "$pod_name" -n "$NAMESPACE" -o wide 2>&1 | sed "s/^/${prefix}/" || true + oc describe pod "$pod_name" -n "$NAMESPACE" 2>&1 | sed "s/^/${prefix}/" || true + + for init_ctr in $(oc get pod "$pod_name" -n "$NAMESPACE" \ + -o jsonpath='{.spec.initContainers[*].name}' 2>/dev/null); do + [[ -n "$init_ctr" ]] || continue + echo "${prefix}--- logs pod/$pod_name -c $init_ctr (init, tail $log_tail) ---" + oc logs "$pod_name" -n "$NAMESPACE" -c "$init_ctr" --tail="$log_tail" 2>&1 \ + | sed "s/^/${prefix}/" || echo "${prefix}(no init logs for $init_ctr)" + done + + for ctr in $(oc get pod "$pod_name" -n "$NAMESPACE" \ + -o jsonpath='{.spec.containers[*].name}' 2>/dev/null); do + [[ -n "$ctr" ]] || continue + echo "${prefix}--- logs pod/$pod_name -c $ctr (tail $log_tail) ---" + oc logs "$pod_name" -n "$NAMESPACE" -c "$ctr" --tail="$log_tail" 2>&1 \ 
+ | sed "s/^/${prefix}/" || echo "${prefix}(no logs for $ctr)" + restart_count=$(oc get pod "$pod_name" -n "$NAMESPACE" \ + -o jsonpath="{.status.containerStatuses[?(@.name==\"${ctr}\")].restartCount}" \ + 2>/dev/null) || restart_count="0" + if [[ "${restart_count:-0}" -gt 0 ]]; then + echo "${prefix}--- logs pod/$pod_name -c $ctr --previous (tail $log_tail) ---" + oc logs "$pod_name" -n "$NAMESPACE" -c "$ctr" --previous --tail="$log_tail" 2>&1 \ + | sed "s/^/${prefix}/" || true + fi + done + + if [[ -n "$preferred_container" ]]; then + echo "${prefix}--- logs pod/$pod_name -c $preferred_container (preferred, tail $log_tail) ---" + oc logs "$pod_name" -n "$NAMESPACE" -c "$preferred_container" --tail="$log_tail" 2>&1 \ + | sed "s/^/${prefix}/" || true + fi + + echo "${prefix}========== end failure logs: pod/$pod_name ==========" +} + wait_for_pod() { local pod_name="$1" local max_attempts="${2:-24}" - + local attempt + local ready + local phase + for ((attempt=1; attempt<=max_attempts; attempt++)); do - local ready - ready=$(oc get pod "$pod_name" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false") - if [[ "$ready" == "true" ]]; then - echo "✓ Pod $pod_name ready" - return 0 + if ! oc get pod "$pod_name" -n "$NAMESPACE" &>/dev/null; then + phase="Missing" + else + phase=$(oc get pod "$pod_name" -n "$NAMESPACE" \ + -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown") + ready=$(oc get pod "$pod_name" -n "$NAMESPACE" \ + -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false") + if [[ "$ready" == "true" ]]; then + echo "✓ Pod $pod_name ready (attempt $attempt/$max_attempts)" + return 0 + fi + fi + if [[ $((attempt % 10)) -eq 0 ]]; then + echo "[e2e-ops] $pod_name not ready yet (attempt $attempt/$max_attempts, phase=${phase:-?})..." 
fi sleep 3 done - - echo "Pod $pod_name not ready after $((max_attempts * 3))s" + + echo "Pod $pod_name not ready after $((max_attempts * 3))s (last phase: ${phase:-unknown})" + e2e_ops_dump_pod_logs "$pod_name" "" 250 return 1 } @@ -272,9 +348,7 @@ wait_for_llama_stack_http_health() { fi done echo "ERROR: Llama Stack did not respond on http://127.0.0.1:8321/v1/health inside the pod" - oc get pod llama-stack-service -n "$NAMESPACE" -o wide 2>&1 || true - oc describe pod llama-stack-service -n "$NAMESPACE" 2>&1 | tail -40 || true - oc logs llama-stack-service -n "$NAMESPACE" -c llama-stack-container --tail=120 2>&1 || true + e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 250 return 1 } @@ -310,15 +384,23 @@ cmd_restart_lightspeed() { # Don't let a timeout here abort the function — still attempt port-forward # and diagnostics so later scenarios have a chance to recover. local pod_ready=true - if ! wait_for_pod "lightspeed-stack-service" 40; then + local lcs_pod_wait=40 + if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]]; then + # readinessProbe: 20s + 30*5s; LCS + Llama handshake can exceed 195s on Konflux (TLS suite). + lcs_pod_wait=100 + fi + echo "[e2e-ops] Waiting for lightspeed-stack-service Ready (max ${lcs_pod_wait} attempts, $((lcs_pod_wait * 3))s)..." + if ! 
wait_for_pod "lightspeed-stack-service" "$lcs_pod_wait"; then pod_ready=false - echo "⚠️ Pod not ready within 120s — dumping diagnostics:" - oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -30 || true - oc logs lightspeed-stack-service -n "$NAMESPACE" --tail=40 2>&1 || true + echo "⚠️ Pod not ready within $((lcs_pod_wait * 3))s" fi - # Re-label pod for service discovery - oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite + # Re-label pod for service discovery (ignore if pod was deleted / not created yet) + if oc get pod lightspeed-stack-service -n "$NAMESPACE" &>/dev/null; then + oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite + else + echo "⚠️ Cannot label lightspeed-stack-service — pod missing" + fi # Re-establish port-forwards (may succeed even if readiness was slow) cmd_restart_port_forward @@ -326,6 +408,7 @@ cmd_restart_lightspeed() { if [[ "$pod_ready" == "false" ]]; then echo "⚠️ Lightspeed restart completed but pod was slow to become ready" + e2e_ops_dump_pod_logs "lightspeed-stack-service" "lightspeed-stack-container" 150 return 1 fi echo "✓ Lightspeed restart complete" @@ -333,6 +416,7 @@ cmd_restart_lightspeed() { cmd_restart_llama_stack() { echo "===== Restoring llama-stack service =====" + echo "[e2e-ops] restart-llama-stack env: E2E_KONFLUX_E2E=${E2E_KONFLUX_E2E:-0} E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA=${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" # Pod.spec is largely immutable; delete so apply creates a pod with current volumes/env. echo "Deleting llama-stack pod (if any) before apply..." timeout 45 oc delete pod llama-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || { @@ -350,22 +434,52 @@ cmd_restart_llama_stack() { exit 1 fi fi + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + echo "[e2e-ops] Syncing e2e-mock-tls-certs secret before llama-stack apply..." + if ! 
cmd_sync_mock_tls_certs_secret; then + echo "===== Llama-stack restore FAILED (mock TLS certs secret sync) =====" + e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 120 + e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 120 + exit 1 + fi + fi _LLAMA_SVC_FQDN="llama-stack-service-svc.${NAMESPACE}.svc.cluster.local" oc create secret generic llama-stack-ip-secret \ --from-literal=key="$_LLAMA_SVC_FQDN" \ -n "$NAMESPACE" \ --dry-run=client -o yaml | oc apply -f - oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/llama-stack-openai.yaml" - wait_for_pod "llama-stack-service" 90 + local llama_pod_wait=90 + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + # readinessProbe: 20s + 36*5s = 200s; clone/enrich/RAG on Konflux often needs 400s+ total. + llama_pod_wait=180 + fi + echo "[e2e-ops] Waiting for llama-stack-service Ready (max ${llama_pod_wait} attempts, $((llama_pod_wait * 3))s)..." + if ! wait_for_pod "llama-stack-service" "$llama_pod_wait"; then + echo "===== Llama-stack restore FAILED (pod not Ready within $((llama_pod_wait * 3))s) =====" + exit 1 + fi echo "Labeling pod for service..." oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite if [[ "${E2E_COPY_INTERCEPTION_CA_TO_LLAMA:-0}" == "1" ]]; then if ! _verify_interception_ca_mounted_in_llama; then echo "===== Llama-stack restore FAILED (interception CA not mounted) =====" + e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 150 + exit 1 + fi + fi + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + if ! _verify_mock_tls_certs_mounted_in_llama; then + echo "===== Llama-stack restore FAILED (mock TLS certs not mounted) =====" + e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 150 exit 1 fi fi - if ! wait_for_llama_stack_http_health 50; then + local llama_health_attempts=50 + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + llama_health_attempts=100 + fi + if ! 
wait_for_llama_stack_http_health "$llama_health_attempts"; then echo "===== Llama-stack restore FAILED (HTTP not healthy) =====" exit 1 fi @@ -381,6 +495,7 @@ cmd_restart_llama_stack() { if ! cmd_restart_llama_port_forward; then echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed" + e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 150 exit 1 fi @@ -474,20 +589,23 @@ cmd_restart_llama_port_forward() { local local_port="${LOCAL_LLAMA_PORT:-8321}" local remote_port="${REMOTE_LLAMA_PORT:-8321}" local max_attempts=6 + if [[ "${E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA:-0}" == "1" ]]; then + max_attempts=10 + fi local pf_pid local pf_resource local llama_pf_log="/tmp/port-forward-llama.log" - echo "Re-establishing Llama Stack port-forward on $local_port:$remote_port..." + echo "Re-establishing Llama Stack port-forward on $local_port:$remote_port (max $max_attempts attempts)..." for ((attempt=1; attempt<=max_attempts; attempt++)); do kill_stale_llama_forward "$local_port" sleep 3 - if [[ $attempt -le 2 ]]; then - pf_resource="svc/llama-stack-service-svc" - else + if [[ "${E2E_KONFLUX_E2E:-0}" == "1" ]] || [[ $attempt -ge 3 ]]; then pf_resource="pod/llama-stack-service" + else + pf_resource="svc/llama-stack-service-svc" fi echo "Llama port-forward attempt $attempt/$max_attempts -> $pf_resource" @@ -525,8 +643,10 @@ cmd_restart_llama_port_forward() { echo "Failed to establish Llama Stack port-forward on :$local_port" if [[ -s "$llama_pf_log" ]]; then + echo "[e2e-ops] $llama_pf_log (tail 30):" tail -30 "$llama_pf_log" 2>/dev/null | sed 's/^/[e2e-ops] /' || true fi + e2e_ops_dump_pod_logs "llama-stack-service" "llama-stack-container" 150 return 1 } @@ -709,6 +829,145 @@ cmd_copy_interception_proxy_ca_to_llama() { cmd_sync_interception_proxy_ca_secret } +_MOCK_TLS_CERT_FILES=( + ca.crt + client.crt + client.key + untrusted-ca.crt + expired-ca.crt + untrusted-client.crt + untrusted-client.key + expired-client.crt +) + 
+_mock_tls_secret_is_complete() { + local f b64 + for f in "${_MOCK_TLS_CERT_FILES[@]}"; do + b64=$(oc get secret e2e-mock-tls-certs -n "$NAMESPACE" \ + -o "go-template={{index .data \"${f}\"}}" 2>/dev/null) || return 1 + [[ -n "$b64" ]] || return 1 + done + return 0 +} + +_get_mock_tls_inference_pod_name() { + oc get pod -n "$NAMESPACE" -l app=e2e-mock-tls-inference \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true +} + +_wait_for_mock_tls_inference_pod() { + if ! oc wait pod -l app=e2e-mock-tls-inference -n "$NAMESPACE" \ + --for=condition=Ready --timeout=120s 2>/dev/null; then + echo "ERROR: e2e-mock-tls-inference pod not Ready" >&2 + oc get pods -n "$NAMESPACE" -l app=e2e-mock-tls-inference -o wide 2>&1 || true + return 1 + fi + return 0 +} + +_copy_mock_tls_cert_from_pod() { + local mock_pod_name="$1" + local cert_file="$2" + local dest="$3" + local attempt + + for ((attempt=1; attempt<=4; attempt++)); do + if oc exec --request-timeout=90 -n "$NAMESPACE" "$mock_pod_name" \ + -c e2e-mock-tls-inference -- cat "/certs/$cert_file" >"$dest" 2>/dev/null \ + && [[ -s "$dest" ]]; then + return 0 + fi + echo "[e2e-ops] WARN: read /certs/$cert_file from mock pod failed (attempt $attempt/4)" + sleep 5 + done + return 1 +} + +_recycle_mock_tls_inference_pod() { + echo "[e2e-ops] Recycling e2e-mock-tls-inference pod (stale or unresponsive)..." + oc delete pod e2e-mock-tls-inference -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || true + sleep 3 + if ! _wait_for_mock_tls_inference_pod; then + return 1 + fi + # Certs are written at container start; allow trustme + pip to finish. 
+ sleep 10 + return 0 +} + +cmd_sync_mock_tls_certs_secret() { + local mock_pod_name tmpdir f recycle_attempt + + if _mock_tls_secret_is_complete; then + echo "✓ Secret e2e-mock-tls-certs already complete (${#_MOCK_TLS_CERT_FILES[@]} keys); skipping sync" + return 0 + fi + + for recycle_attempt in 1 2; do + mock_pod_name=$(_get_mock_tls_inference_pod_name) + if [[ -z "$mock_pod_name" ]]; then + echo "ERROR: no e2e-mock-tls-inference pod in namespace $NAMESPACE" >&2 + echo " Run: e2e-ops.sh deploy-e2e-mock-tls-inference" >&2 + return 1 + fi + + if ! _wait_for_mock_tls_inference_pod; then + if [[ $recycle_attempt -lt 2 ]] && _recycle_mock_tls_inference_pod; then + continue + fi + e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 120 + return 1 + fi + + tmpdir=$(mktemp -d) + local sync_ok=true + for f in "${_MOCK_TLS_CERT_FILES[@]}"; do + if ! _copy_mock_tls_cert_from_pod "$mock_pod_name" "$f" "$tmpdir/$f"; then + echo "ERROR: failed to read /certs/$f from e2e-mock-tls-inference pod" >&2 + e2e_ops_dump_pod_logs "$mock_pod_name" "e2e-mock-tls-inference" 120 + sync_ok=false + break + fi + done + + if [[ "$sync_ok" == "true" ]]; then + if ! 
oc create secret generic e2e-mock-tls-certs \ + --from-file="$tmpdir" \ + -n "$NAMESPACE" \ + --dry-run=client -o yaml | oc apply -f -; then + echo "ERROR: failed to apply e2e-mock-tls-certs secret" >&2 + rm -rf "$tmpdir" + return 1 + fi + rm -rf "$tmpdir" + echo "✓ Secret e2e-mock-tls-certs updated (${#_MOCK_TLS_CERT_FILES[@]} files)" + return 0 + fi + + rm -rf "$tmpdir" + if [[ $recycle_attempt -lt 2 ]]; then + _recycle_mock_tls_inference_pod || return 1 + fi + done + + e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 120 + return 1 +} + +_verify_mock_tls_certs_mounted_in_llama() { + local llama_pod_name="llama-stack-service" + if oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \ + sh -c 'test -s /certs/ca.crt && test -s /certs/client.crt && test -s /certs/client.key'; then + echo "✓ mock TLS certs present under /certs in llama-stack" + return 0 + fi + echo "ERROR: /certs missing or incomplete in llama-stack pod" >&2 + oc get secret e2e-mock-tls-certs -n "$NAMESPACE" 2>&1 || true + oc exec -n "$NAMESPACE" "$llama_pod_name" -c llama-stack-container -- \ + ls -la /certs 2>&1 || true + return 1 +} + _e2e_repo_root() { cd "$SCRIPT_DIR/../../../.." && pwd } @@ -745,6 +1004,34 @@ cmd_deploy_e2e_interception_proxy() { echo "✓ e2e-interception-proxy ready at http://e2e-interception-proxy.${NAMESPACE}.svc.cluster.local:8889" } +cmd_deploy_e2e_mock_tls_inference() { + local repo_root + repo_root="$(_e2e_repo_root)" + echo "Deploying e2e-mock-tls-inference in namespace $NAMESPACE..." + oc create configmap e2e-mock-tls-inference-script -n "$NAMESPACE" \ + --from-file=server.py="$repo_root/tests/e2e/mock_tls_inference_server/server.py" \ + --dry-run=client -o yaml | oc apply -f - + oc delete pod e2e-mock-tls-inference -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || true + oc apply -n "$NAMESPACE" -f "$MANIFEST_DIR/e2e-mock-tls-inference.yaml" + if ! 
oc wait pod/e2e-mock-tls-inference -n "$NAMESPACE" --for=condition=Ready --timeout=240s; then + echo "ERROR: e2e-mock-tls-inference failed to become ready" >&2 + e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 150 + return 1 + fi + echo "✓ e2e-mock-tls-inference ready at https://e2e-mock-tls-inference.${NAMESPACE}.svc.cluster.local:8443" + if ! cmd_sync_mock_tls_certs_secret; then + echo "WARNING: mock TLS server is up but e2e-mock-tls-certs secret sync failed" >&2 + e2e_ops_dump_pod_logs "e2e-mock-tls-inference" "e2e-mock-tls-inference" 150 + return 1 + fi +} + +cmd_dump_pod_logs() { + local pod_name="${1:?pod name required}" + local container="${2:-}" + e2e_ops_dump_pod_logs "$pod_name" "$container" 200 +} + cmd_disrupt_llama_stack() { local pod_name="llama-stack-service" @@ -815,6 +1102,15 @@ case "$COMMAND" in deploy-e2e-interception-proxy) cmd_deploy_e2e_interception_proxy ;; + deploy-e2e-mock-tls-inference) + cmd_deploy_e2e_mock_tls_inference + ;; + sync-mock-tls-certs-secret) + cmd_sync_mock_tls_certs_secret + ;; + dump-pod-logs) + cmd_dump_pod_logs "$@" + ;; *) echo "Usage: $0 [args...]" echo "" @@ -833,6 +1129,9 @@ case "$COMMAND" in echo " sync-interception-proxy-ca-secret - Publish trustme CA to Secret for llama mount" echo " deploy-e2e-tunnel-proxy - Deploy in-cluster tunnel proxy pod" echo " deploy-e2e-interception-proxy - Deploy in-cluster interception proxy pod" + echo " deploy-e2e-mock-tls-inference - Deploy mock HTTPS inference server (tls.feature)" + echo " sync-mock-tls-certs-secret - Publish mock TLS /certs to Secret for llama mount" + echo " dump-pod-logs [container] - Events, describe, init + container logs" exit 1 ;; esac diff --git a/tests/e2e/configuration/server-mode/lightspeed-stack-tls.yaml b/tests/e2e/configuration/server-mode/lightspeed-stack-tls.yaml index babdc2b99..fd45ea744 100644 --- a/tests/e2e/configuration/server-mode/lightspeed-stack-tls.yaml +++ 
b/tests/e2e/configuration/server-mode/lightspeed-stack-tls.yaml @@ -8,7 +8,7 @@ service: access_log: true llama_stack: use_as_library_client: false - url: http://llama-stack:8321 + url: http://${env.E2E_LLAMA_HOSTNAME}:8321 api_key: xyzzy user_data_collection: feedback_enabled: true diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index fdca1247c..7ee711494 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -26,6 +26,10 @@ reset_llama_stack_disrupt_once_tracking, reset_llama_stack_was_running, ) +from tests.e2e.features.steps.tls import ( + prepare_tls_feature_entry_on_prow, + reset_tls_prow_state, +) from tests.e2e.utils.llama_stack_utils import register_shield from tests.e2e.utils.prow_utils import ( restart_pod, @@ -237,24 +241,26 @@ def before_scenario(context: Context, scenario: Scenario) -> None: delattr(context, _attr) -def _dump_pod_logs_on_failure(scenario: Scenario, namespace: str) -> None: - """Dump llama-stack and lightspeed-stack pod logs when a scenario fails in Prow.""" +def _dump_pod_logs_on_failure( + context: Context, scenario: Scenario, namespace: str +) -> None: + """Dump pod diagnostics when a scenario fails in Prow (init + main container logs).""" if scenario.status != "failed": return - for pod in ("llama-stack-service", "lightspeed-stack-service"): - print(f"--- {pod} logs (scenario failed: {scenario.name}) ---") + pods: tuple[str, ...] 
= ("llama-stack-service", "lightspeed-stack-service") + feature = getattr(context, "feature", None) + feat_file = getattr(feature, "filename", "") or "" if feature else "" + if "tls.feature" in feat_file: + pods = (*pods, "e2e-mock-tls-inference") + print(f"--- scenario failed: {scenario.name!r} — dumping pod logs ---", flush=True) + for pod in pods: try: - r = subprocess.run( - ["oc", "logs", pod, "-n", namespace, "--tail=100"], - capture_output=True, - text=True, - timeout=15, - check=False, - ) - print(r.stdout or r.stderr or "(no output)") + result = run_e2e_ops("dump-pod-logs", [pod], timeout=90) + print(result.stdout, end="") + if result.stderr: + print(result.stderr, end="") except subprocess.TimeoutExpired: - print("(timed out fetching logs)") - print(f"--- end {pod} logs ---") + print(f"(timed out dumping logs for {pod})") def after_scenario(context: Context, scenario: Scenario) -> None: @@ -288,7 +294,7 @@ def after_scenario(context: Context, scenario: Scenario) -> None: """ if is_prow_environment(): _dump_pod_logs_on_failure( - scenario, os.environ.get("NAMESPACE", "e2e-rhoai-dsc") + context, scenario, os.environ.get("NAMESPACE", "e2e-rhoai-dsc") ) if getattr(context, "scenario_lightspeed_override_active", False): @@ -451,6 +457,9 @@ def before_feature(context: Context, feature: Feature) -> None: context.active_lightspeed_stack_config_basename = None # One real Llama disruption per feature (module-level flag; survives context resets) reset_llama_stack_disrupt_once_tracking() + if feature.filename and "tls.feature" in feature.filename: + reset_tls_prow_state() + prepare_tls_feature_entry_on_prow() try: max_flaky = int(os.getenv("E2E_FLAKY_MAX_ATTEMPTS", _E2E_FLAKY_MAX_ATTEMPTS)) diff --git a/tests/e2e/features/proxy.feature b/tests/e2e/features/proxy.feature index 907c4317d..00fde258a 100644 --- a/tests/e2e/features/proxy.feature +++ b/tests/e2e/features/proxy.feature @@ -1,4 +1,4 @@ -@e2e_group_3 @skip-in-library-mode +@e2e_group_3 @skip-in-library-mode 
@skip-in-prow Feature: Proxy and TLS networking tests for Llama Stack providers Verify that the Lightspeed Stack works correctly when Llama Stack's diff --git a/tests/e2e/features/steps/proxy.py b/tests/e2e/features/steps/proxy.py index 3fb29b270..7755cca91 100644 --- a/tests/e2e/features/steps/proxy.py +++ b/tests/e2e/features/steps/proxy.py @@ -305,6 +305,7 @@ def restore_if_modified(context: Context) -> None: _stop_proxy(context, "tunnel_proxy", "proxy_loop") _stop_proxy(context, "interception_proxy", "interception_proxy_loop") os.environ.pop("E2E_COPY_INTERCEPTION_CA_TO_LLAMA", None) + os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None) if hasattr(context, "needs_interception_ca_on_llama"): delattr(context, "needs_interception_ca_on_llama") @@ -318,6 +319,14 @@ def restore_if_modified(context: Context) -> None: @given("Llama Stack is restarted") def restart_llama_stack(context: Context) -> None: """Restart the Llama Stack container.""" + from tests.e2e.features.steps.tls import ( + is_tls_configuration_feature, + restart_llama_for_tls_feature, + ) + + if is_tls_configuration_feature(context): + restart_llama_for_tls_feature(context) + return restart_container("llama-stack") diff --git a/tests/e2e/features/steps/tls.py b/tests/e2e/features/steps/tls.py index 66d56adcc..67f5fa360 100644 --- a/tests/e2e/features/steps/tls.py +++ b/tests/e2e/features/steps/tls.py @@ -9,6 +9,7 @@ """ import copy +import os from typing import Any, Optional from behave import given # pyright: ignore[reportAttributeAccessIssue] @@ -16,19 +17,17 @@ from tests.e2e.utils.llama_config_utils import ( backup_llama_config, + clear_llama_config_backup, load_llama_config, + reset_llama_run_config_to_pipeline_default, write_llama_config, ) +from tests.e2e.utils.prow_utils import get_namespace, restart_pod, run_e2e_ops +from tests.e2e.utils.utils import is_prow_environment -_TLS_PROVIDER_BASE: dict[str, Any] = { - "provider_id": "tls-openai", - "provider_type": "remote::openai", - "config": 
{ - "api_key": "test-key", - "base_url": "https://mock-tls-inference:8443/v1", - "allowed_models": ["mock-tls-model"], - }, -} +_MOCK_TLS_PORT_TLS = 8443 +_MOCK_TLS_PORT_MTLS = 8444 +_MOCK_TLS_PORT_HOSTNAME_MISMATCH = 8445 _TLS_MODEL_RESOURCE: dict[str, str] = { "model_id": "mock-tls-model", @@ -36,6 +35,126 @@ "provider_model_id": "mock-tls-model", } +_mock_tls_cluster_deploy_state: dict[str, bool] = {"done": False} + + +def reset_tls_prow_state() -> None: + """Reset per-feature Prow state (call from ``before_feature``).""" + _mock_tls_cluster_deploy_state["done"] = False + os.environ.pop("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA", None) + clear_llama_config_backup() + + +def prepare_tls_feature_entry_on_prow() -> None: + """Baseline cluster state when tls.feature runs after other features in test_list. + + Earlier features (disrupted, MCP) delete or reconfigure Llama without mock TLS + certs. Isolated tls.feature runs skip that churn, which is why the same Gherkin + passes alone but flakes mid-feature in the full suite. 
+ """ + if not is_prow_environment(): + return + print("[tls.feature] Prow/Konflux entry: reset run.yaml and warm Llama + mock TLS...") + reset_llama_run_config_to_pipeline_default() + result = run_e2e_ops("deploy-e2e-mock-tls-inference", timeout=300) + print(result.stdout, end="") + if result.returncode != 0: + raise RuntimeError( + "tls.feature entry: deploy-e2e-mock-tls-inference failed: " + f"{result.stderr or result.stdout}" + ) + _mock_tls_cluster_deploy_state["done"] = True + _prepare_tls_prow_llama_restart_env() + os.environ.setdefault( + "E2E_MOCK_TLS_INFERENCE_HOST", + _cluster_mock_tls_inference_host(), + ) + restart_pod("llama-stack") + print("[tls.feature] Prow/Konflux entry baseline complete", flush=True) + + +def is_tls_configuration_feature(context: Context) -> bool: + """Return True when the active Behave feature is ``tls.feature``.""" + feature = getattr(context, "feature", None) + if feature is None: + return False + name = getattr(feature, "name", "") or "" + return "TLS configuration" in name + + +def _prepare_tls_prow_llama_restart_env() -> None: + """Set env for full llama pod recreate with mock TLS certs mounted.""" + os.environ["E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA"] = "1" + + +def restart_llama_for_tls_feature(context: Context) -> None: + """Restart Llama for TLS tests (full pod recreate on Prow/Konflux).""" + from tests.e2e.utils.utils import restart_container + + if not is_prow_environment(): + restart_container("llama-stack") + return + + _prepare_tls_prow_llama_restart_env() + scenario = getattr(getattr(context, "scenario", None), "name", "") or "?" 
+ print( + f"[tls.feature] Llama Stack restart: full recreate scenario={scenario!r}", + flush=True, + ) + restart_container("llama-stack") + + +def _cluster_mock_tls_inference_host() -> str: + """DNS name of the in-cluster mock TLS inference server (Konflux / Prow).""" + explicit = os.getenv("E2E_MOCK_TLS_INFERENCE_HOST", "").strip() + if explicit: + return explicit + return f"e2e-mock-tls-inference.{get_namespace()}.svc.cluster.local" + + +def _mock_tls_base_url(port: int) -> str: + """OpenAI-compatible base URL for the mock TLS inference server.""" + if is_prow_environment(): + host = _cluster_mock_tls_inference_host() + else: + host = "mock-tls-inference" + return f"https://{host}:{port}/v1" + + +def _tls_provider_base() -> dict[str, Any]: + """Default tls-openai provider dict with environment-appropriate base_url.""" + return { + "provider_id": "tls-openai", + "provider_type": "remote::openai", + "config": { + "api_key": "test-key", + "base_url": _mock_tls_base_url(_MOCK_TLS_PORT_TLS), + "allowed_models": ["mock-tls-model"], + "refresh_models": False, + }, + } + + +def _deploy_cluster_mock_tls_inference() -> None: + """Deploy the in-cluster mock TLS inference pod (Konflux / Prow).""" + if _mock_tls_cluster_deploy_state["done"]: + print("Using existing e2e-mock-tls-inference deployment") + return + + result = run_e2e_ops("deploy-e2e-mock-tls-inference", timeout=300) + print(result.stdout, end="") + if result.returncode != 0: + raise AssertionError( + "Failed to deploy e2e-mock-tls-inference: " + f"{result.stderr or result.stdout}" + ) + _prepare_tls_prow_llama_restart_env() + os.environ.setdefault( + "E2E_MOCK_TLS_INFERENCE_HOST", + _cluster_mock_tls_inference_host(), + ) + _mock_tls_cluster_deploy_state["done"] = True + def _ensure_tls_provider(config: dict[str, Any]) -> dict[str, Any]: """Find or create the tls-openai inference provider in the config. 
@@ -59,7 +178,7 @@ def _ensure_tls_provider(config: dict[str, Any]) -> dict[str, Any]: return provider # Provider not found — add it - provider = copy.deepcopy(_TLS_PROVIDER_BASE) + provider = copy.deepcopy(_tls_provider_base()) inference.append(provider) # Also register the model resource @@ -85,8 +204,13 @@ def _configure_tls(tls_config: dict[str, Any], base_url: Optional[str] = None) - provider.setdefault("config", {}).setdefault("network", {}) if base_url is not None: provider["config"]["base_url"] = base_url + else: + provider["config"]["base_url"] = _mock_tls_base_url(_MOCK_TLS_PORT_TLS) + provider.setdefault("config", {})["refresh_models"] = False provider["config"]["network"]["tls"] = tls_config write_llama_config(config) + if is_prow_environment(): + _prepare_tls_prow_llama_restart_env() # --- Background Steps --- @@ -94,6 +218,15 @@ def _configure_tls(tls_config: dict[str, Any], base_url: Optional[str] = None) - # run.yaml (see proxy.py). Restart steps are listed in tls.feature / proxy.feature. 
+@given("The mock TLS inference server is deployed") +def deploy_mock_tls_inference_server(context: Context) -> None: + """Ensure mock TLS inference is reachable (Compose locally, pod in Prow).""" + if is_prow_environment(): + _deploy_cluster_mock_tls_inference() + return + print("Using docker-compose mock-tls-inference service") + + # --- TLS Configuration Steps --- @@ -124,7 +257,7 @@ def configure_tls_mtls(context: Context) -> None: "client_cert": "/certs/client.crt", "client_key": "/certs/client.key", }, - base_url="https://mock-tls-inference:8444/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS), ) @@ -139,7 +272,7 @@ def configure_mtls_no_client_cert(context: Context) -> None: """Configure run.yaml for mTLS port without client cert (should fail).""" _configure_tls( {"verify": "/certs/ca.crt"}, - base_url="https://mock-tls-inference:8444/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS), ) @@ -152,7 +285,7 @@ def configure_mtls_wrong_client_cert(context: Context) -> None: "client_cert": "/certs/ca.crt", "client_key": "/certs/client.key", }, - base_url="https://mock-tls-inference:8444/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS), ) @@ -165,7 +298,7 @@ def configure_mtls_untrusted_client_cert(context: Context) -> None: "client_cert": "/certs/untrusted-client.crt", "client_key": "/certs/untrusted-client.key", }, - base_url="https://mock-tls-inference:8444/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS), ) @@ -178,7 +311,7 @@ def configure_mtls_expired_client_cert(context: Context) -> None: "client_cert": "/certs/expired-client.crt", "client_key": "/certs/client.key", }, - base_url="https://mock-tls-inference:8444/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_MTLS), ) @@ -187,7 +320,7 @@ def configure_tls_hostname_mismatch(context: Context) -> None: """Configure run.yaml to connect to hostname-mismatch server (should fail).""" _configure_tls( {"verify": "/certs/ca.crt"}, - base_url="https://mock-tls-inference:8445/v1", + 
base_url=_mock_tls_base_url(_MOCK_TLS_PORT_HOSTNAME_MISMATCH), ) @@ -200,7 +333,7 @@ def configure_mtls_hostname_mismatch(context: Context) -> None: "client_cert": "/certs/client.crt", "client_key": "/certs/client.key", }, - base_url="https://mock-tls-inference:8445/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_HOSTNAME_MISMATCH), ) @@ -211,7 +344,7 @@ def configure_tls_min_version_hostname_mismatch(context: Context, version: str) """Configure run.yaml with TLS min version against hostname-mismatch server.""" _configure_tls( {"verify": "/certs/ca.crt", "min_version": version}, - base_url="https://mock-tls-inference:8445/v1", + base_url=_mock_tls_base_url(_MOCK_TLS_PORT_HOSTNAME_MISMATCH), ) diff --git a/tests/e2e/features/tls.feature b/tests/e2e/features/tls.feature index a900b1c0f..97c089067 100644 --- a/tests/e2e/features/tls.feature +++ b/tests/e2e/features/tls.feature @@ -1,16 +1,19 @@ -@e2e_group_1 @skip-in-library-mode @skip-in-prow +@e2e_group_1 @skip-in-library-mode Feature: TLS configuration for remote inference providers Validate that Llama Stack's NetworkConfig.tls settings are applied correctly when connecting to a remote inference provider over HTTPS. + # Only Llama run.yaml changes per scenario; LCS uses lightspeed-stack-tls.yaml throughout. 
+ Background: Given The service is started locally And The system is in default state And REST API service prefix is /v1 And the Lightspeed stack configuration directory is "tests/e2e/configuration" + And The original Llama Stack config is restored if modified + And The mock TLS inference server is deployed And The service uses the lightspeed-stack-tls.yaml configuration And The service is restarted - And The original Llama Stack config is restored if modified Scenario: Inference succeeds with TLS verification disabled Given Llama Stack is configured with TLS verification disabled diff --git a/tests/e2e/mock_tls_inference_server/server.py b/tests/e2e/mock_tls_inference_server/server.py index bfb4cbae5..25bd23a0c 100644 --- a/tests/e2e/mock_tls_inference_server/server.py +++ b/tests/e2e/mock_tls_inference_server/server.py @@ -13,6 +13,7 @@ import datetime import json +import os import ssl import threading import time @@ -29,6 +30,25 @@ MTLS_PORT = 8444 HOSTNAME_MISMATCH_PORT = 8445 +_DEFAULT_SERVER_CERT_DNS_NAMES: tuple[str, ...] = ( + "mock-tls-inference", + "localhost", + "127.0.0.1", +) + + +def _server_cert_dns_names() -> tuple[str, ...]: + """Return DNS identities for the main server certificate. + + Reads comma-separated ``TLS_CERT_DNS_NAMES`` (set in Konflux/Prow manifest). + Falls back to Docker Compose defaults when unset. 
+ """ + raw = os.environ.get("TLS_CERT_DNS_NAMES", "").strip() + if not raw: + return _DEFAULT_SERVER_CERT_DNS_NAMES + names = tuple(name.strip() for name in raw.split(",") if name.strip()) + return names or _DEFAULT_SERVER_CERT_DNS_NAMES + class OpenAIHandler(BaseHTTPRequestHandler): """Handles OpenAI-compatible API requests over HTTPS.""" @@ -221,8 +241,9 @@ def main() -> None: # Generate CA and certificates ca = trustme.CA() - # Server cert with SANs for Docker service name and localhost - server_cert = ca.issue_cert("mock-tls-inference", "localhost", "127.0.0.1") + server_dns_names = _server_cert_dns_names() + print(f" Server cert DNS names: {', '.join(server_dns_names)}") + server_cert = ca.issue_cert(*server_dns_names) # Client cert for mTLS testing (use a simple hostname without spaces) client_cert = ca.issue_cert("tls-e2e-test-client") diff --git a/tests/e2e/test_list.txt b/tests/e2e/test_list.txt index 34e1b8647..26926a81f 100644 --- a/tests/e2e/test_list.txt +++ b/tests/e2e/test_list.txt @@ -26,4 +26,4 @@ features/mcp_servers_api_auth.feature features/mcp_servers_api_no_config.feature features/proxy.feature features/tls.feature -features/opentelemetry.feature +features/opentelemetry.feature \ No newline at end of file diff --git a/tests/e2e/utils/llama_config_utils.py b/tests/e2e/utils/llama_config_utils.py index eb5f67b9d..e8fdf4832 100644 --- a/tests/e2e/utils/llama_config_utils.py +++ b/tests/e2e/utils/llama_config_utils.py @@ -3,6 +3,7 @@ import os import shutil import tempfile +from pathlib import Path from typing import Any, Optional import yaml @@ -20,6 +21,25 @@ _llama_config_backup_key: dict[str, Optional[str]] = {"value": None} +def clear_llama_config_backup() -> None: + """Drop in-memory run.yaml backup (e.g. 
at start of tls.feature).""" + _llama_config_backup_key["value"] = None + + +def reset_llama_run_config_to_pipeline_default() -> None: + """Reset llama-stack-config run.yaml to Konflux/Prow pipeline seed (run-ci.yaml).""" + if not is_prow_environment(): + return + run_ci = ( + Path(__file__).resolve().parents[1] / "configs" / "run-ci.yaml" + ) + if not run_ci.is_file(): + print(f"WARN: pipeline run.yaml seed not found at {run_ci}", flush=True) + return + print(f"Resetting llama-stack-config from {run_ci.name}...", flush=True) + update_llama_run_configmap(str(run_ci)) + + def _local_llama_config_path() -> str: """Return local run.yaml path for Docker/local e2e execution.""" return os.getenv("E2E_LLAMA_CONFIG_PATH", _DEFAULT_LOCAL_LLAMA_CONFIG_PATH) diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index ff771904b..48056b243 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -93,11 +93,18 @@ def restart_pod(container_name: str) -> None: """ if container_name in _LLAMA_RESTART_NAMES: op = "restart-llama-stack" - timeout = 420 + # Subprocess cap must exceed e2e-ops internal waits (pod + in-pod health + port-forward). + # Konflux TLS full recreate: ~6–12 min typical, 15+ min under load (user-reported 400s+). + if os.environ.get("E2E_COPY_MOCK_TLS_CERTS_TO_LLAMA") == "1": + timeout = 1200 + elif os.environ.get("E2E_KONFLUX_E2E") == "1": + timeout = 720 + else: + timeout = 420 elif container_name in _LIGHTSPEED_RESTART_NAMES: op = "restart-lightspeed" - # Pod wait (up to ~120s) + port-forward retries + slow Konflux/Prow clusters. - timeout = 320 + # Konflux LCS: TCP readiness + Llama handshake; TLS suite often needs 200–300s. 
+ timeout = 480 if os.environ.get("E2E_KONFLUX_E2E") == "1" else 320 else: print( f"Warning: restart_pod({container_name!r}) unknown; " @@ -110,11 +117,17 @@ def restart_pod(container_name: str) -> None: print(result.stdout, end="") if result.returncode != 0: print(result.stderr, end="") - detail = (result.stderr or result.stdout or "").strip() + combined = f"{result.stdout or ''}\n{result.stderr or ''}".strip() + # Prefer full e2e-ops output when diagnostics were printed (TLS/Llama failures). + if "========== failure logs:" in combined: + detail = combined + else: + detail = "\n".join(combined.splitlines()[-40:]) if combined else "" + detail = detail or f"exit {result.returncode}" raise subprocess.CalledProcessError( result.returncode, op, - detail or None, + detail, ) except subprocess.TimeoutExpired as e: print(f"Failed to restart pod {container_name}: {e}") @@ -128,8 +141,11 @@ def restore_llama_stack_pod() -> None: subprocess.CalledProcessError: If oc/e2e-ops restore fails. subprocess.TimeoutExpired: If the operation times out. """ - # wait_for_pod (up to ~180s) + in-pod /v1/health polling (~105s) — allow headroom. - result = run_e2e_ops("restart-llama-stack", timeout=420) + if os.environ.get("E2E_KONFLUX_E2E") == "1": + timeout = 720 + else: + timeout = 420 + result = run_e2e_ops("restart-llama-stack", timeout=timeout) print(result.stdout, end="") if result.returncode != 0: print(result.stderr, end="")