From fa0df02db77233443423ea3ef13052e576268246 Mon Sep 17 00:00:00 2001 From: Cristian Magherusan-Stanciu Date: Thu, 28 May 2026 21:14:28 +0200 Subject: [PATCH 1/2] audit(aws): surface OpenSearch RI tag-failure for ops monitoring (closes #250) Emit a structured OPENSEARCH_TAG_FAILED log line when the best-effort AddTags call after RI purchase fails, so operator dashboards can alert on untagged reservations. Add runbooks/opensearch-untagged-ri.md with manual remediation steps and background on the unsupported ARN type. Update the PurchaseCommitment comment to reference the runbook. --- providers/aws/services/opensearch/client.go | 11 ++- .../aws/services/opensearch/client_test.go | 66 ++++++++++++++++ runbooks/opensearch-untagged-ri.md | 77 +++++++++++++++++++ 3 files changed, 151 insertions(+), 3 deletions(-) create mode 100644 runbooks/opensearch-untagged-ri.md diff --git a/providers/aws/services/opensearch/client.go b/providers/aws/services/opensearch/client.go index e4071aed..cce486dd 100644 --- a/providers/aws/services/opensearch/client.go +++ b/providers/aws/services/opensearch/client.go @@ -131,14 +131,17 @@ func (c *Client) GetExistingCommitments(ctx context.Context) ([]common.Commitmen // PurchaseCommitment purchases an OpenSearch Reserved Instance. // -// PurchaseReservedInstanceOfferingInput has no Tags field — tagging happens +// PurchaseReservedInstanceOfferingInput has no Tags field -- tagging happens // post-purchase via opensearch:AddTags with a reserved-instance ARN // (arn:aws:es:::reserved-instance/). AWS hasn't // explicitly documented reserved-instance as a supported resource type for // AddTags (only domain/data-source/application), so the call may return a -// validation error — in which case retry.ErrPermanent short-circuits and the +// validation error -- in which case retry.ErrPermanent short-circuits and the // failure is logged without blocking the purchase. If AWS ever adds support, // this will start working with no code change. +// On tagging failure a structured line is emitted (OPENSEARCH_TAG_FAILED) so +// operator dashboards can alert on untagged RIs; see +// runbooks/opensearch-untagged-ri.md for the manual remediation steps. func (c *Client) PurchaseCommitment(ctx context.Context, rec common.Recommendation, opts common.PurchaseOptions) (common.PurchaseResult, error) { result := common.PurchaseResult{ Recommendation: rec, @@ -211,7 +214,9 @@ func (c *Client) PurchaseCommitment(ctx context.Context, rec common.Recommendati } if err := c.tagReservedInstance(ctx, result.CommitmentID, rec, opts.Source); err != nil { - log.Printf("WARNING: failed to tag OpenSearch RI %s after purchase (RI is bought; tag missing, source recorded in purchase_history): %v", result.CommitmentID, err) + // Structured line for operator dashboards / alerting. The RI is + // purchased; only the tag is missing. Follow runbooks/opensearch-untagged-ri.md. + log.Printf("OPENSEARCH_TAG_FAILED commitment_id=%s error=%v", result.CommitmentID, err) } return result, nil diff --git a/providers/aws/services/opensearch/client_test.go b/providers/aws/services/opensearch/client_test.go index 8edcb4a0..3dfa9bc4 100644 --- a/providers/aws/services/opensearch/client_test.go +++ b/providers/aws/services/opensearch/client_test.go @@ -1,8 +1,10 @@ package opensearch import ( + "bytes" "context" "fmt" + "log" "strings" "testing" "time" @@ -949,3 +951,67 @@ func TestClient_PurchaseCommitment_NoToken_RichReservationName(t *testing.T) { assert.Contains(t, capturedName, "2x-3yr", "count and term must be embedded: %q", capturedName) assert.LessOrEqual(t, len(capturedName), 60, "must fit AWS reservation-ID cap") } + +// TestPurchaseCommitment_TagFailure_StructuredLog asserts that when AddTags +// returns an error after a successful purchase: +// - the purchase result is still Success=true (tag failure is non-fatal) +// - a line containing "OPENSEARCH_TAG_FAILED" is emitted to the log +// - the commitment ID is present in that log line (for operator lookup) +// - no AWS account ID or other PII appears in the log line +func TestPurchaseCommitment_TagFailure_StructuredLog(t *testing.T) { + mockOS := &MockOpenSearchClient{} + mockSTS := &MockOpenSearchSTSClient{} + t.Cleanup(func() { mockOS.AssertExpectations(t); mockSTS.AssertExpectations(t) }) + + client := &Client{client: mockOS, stsClient: mockSTS, region: "us-east-1"} + + rec := common.Recommendation{ + Service: common.ServiceSearch, + ResourceType: "m5.large.search", + Count: 1, + Term: "1yr", + Region: "us-east-1", + PaymentOption: "all-upfront", + } + + mockOS.On("DescribeReservedInstanceOfferings", mock.Anything, mock.Anything). + Return(&opensearch.DescribeReservedInstanceOfferingsOutput{ + ReservedInstanceOfferings: []types.ReservedInstanceOffering{{ + ReservedInstanceOfferingId: aws.String("off-tag-fail"), + InstanceType: types.OpenSearchPartitionInstanceTypeM5LargeSearch, + Duration: 31536000, + PaymentOption: types.ReservedInstancePaymentOptionAllUpfront, + }}, + }, nil) + + const riID = "ri-tag-fail-abc123" + mockOS.On("PurchaseReservedInstanceOffering", mock.Anything, mock.Anything). + Return(&opensearch.PurchaseReservedInstanceOfferingOutput{ + ReservedInstanceId: aws.String(riID), + }, nil) + + mockSTS.On("GetCallerIdentity", mock.Anything, mock.Anything). + Return(&sts.GetCallerIdentityOutput{Account: aws.String("000000000000")}, nil) + + mockOS.On("AddTags", mock.Anything, mock.Anything). + Return(nil, fmt.Errorf("ValidationException: invalid resource type")) + + // Redirect log output to capture the structured line. + var buf bytes.Buffer + origWriter := log.Writer() + log.SetOutput(&buf) + t.Cleanup(func() { log.SetOutput(origWriter) }) + + result, err := client.PurchaseCommitment(context.Background(), rec, common.PurchaseOptions{Source: common.PurchaseSourceCLI}) + + assert.NoError(t, err, "tag failure must not surface as a purchase error") + assert.True(t, result.Success, "purchase must remain successful when tagging fails") + assert.Equal(t, riID, result.CommitmentID) + + logOut := buf.String() + assert.Contains(t, logOut, "OPENSEARCH_TAG_FAILED", "structured sentinel must appear in log") + assert.Contains(t, logOut, "commitment_id="+riID, "commitment ID must be present for operator lookup") + // The account ID (000000000000) must not be logged -- it is not PII but + // the structured line should stay minimal and scoped to the commitment. + assert.NotContains(t, logOut, "000000000000", "account ID must not appear in the tag-failure log line") +} diff --git a/runbooks/opensearch-untagged-ri.md b/runbooks/opensearch-untagged-ri.md new file mode 100644 index 00000000..9d796836 --- /dev/null +++ b/runbooks/opensearch-untagged-ri.md @@ -0,0 +1,77 @@ +# Runbook: OpenSearch RI purchased but not tagged + +## Alert + +Log pattern (grep / CloudWatch Logs Insights): + +``` +OPENSEARCH_TAG_FAILED commitment_id= error= +``` + +Emitted by `providers/aws/services/opensearch/client.go` when +`opensearch:AddTags` fails after a successful RI purchase. The RI is active; +only the CUDly source tag is absent. + +## Background + +AWS's `PurchaseReservedInstanceOffering` API has no inline `Tags` field. +CUDly attempts a best-effort `AddTags` call after purchase using a +constructed ARN of the form: + +``` +arn:aws:es:::reserved-instance/ +``` + +AWS has not officially documented `reserved-instance` as a supported ARN +type for `opensearch:AddTags` (only `domain`, `data-source`, and +`application` are listed). The call is wrapped in `retry.ErrPermanent` so +the retry budget is not exhausted on calls AWS will never accept. If AWS +extends support, the tag call will start succeeding with no code change. + +## Impact + +The RI is purchased and active. Cost attribution and audit queries that +rely on the CUDly source tag (key: `cudly:purchase-source`) will not find +this reservation unless it is manually tagged. + +## Remediation + +1. Identify the untagged RI from the log line: + + ``` + OPENSEARCH_TAG_FAILED commitment_id= error=... + ``` + +2. Tag it manually via the AWS CLI: + + ```bash + REGION= + ACCOUNT= + RI_ID= + SOURCE= # e.g. cudly-cli or cudly-web + + aws opensearch add-tags \ + --arn "arn:aws:es:${REGION}:${ACCOUNT}:reserved-instance/${RI_ID}" \ + --tag-list \ + Key=Purpose,Value="Reserved Instance Purchase" \ + Key=Tool,Value=CUDly \ + "Key=cudly:purchase-source,Value=${SOURCE}" + ``` + + If the call returns a `ValidationException` the ARN type is still + unsupported by AWS; proceed to the fallback below. + +3. **Fallback (AWS still rejects reserved-instance ARN):** Tag the parent + OpenSearch domain instead, or record the RI ID in your cost-allocation + spreadsheet until AWS adds native support. + +## Follow-up + +If this alert fires repeatedly (not just on ValidationException but on +transient errors), open an issue to add a retry with backoff instead of +the current permanent-error short-circuit. Reference issue #250. + +If AWS releases documentation confirming `reserved-instance` support for +`AddTags`, remove the `retry.ErrPermanent` wrapper in +`providers/aws/services/opensearch/client.go:tagReservedInstance` and +update this runbook. From c35a3152079efc23dc412f32425da4d85f0d1931 Mon Sep 17 00:00:00 2001 From: Cristian Magherusan-Stanciu Date: Mon, 1 Jun 2026 19:26:06 +0200 Subject: [PATCH 2/2] fix(ci): re-trigger pre-commit after transient setup-go SHA fetch failure (#831) The prior run failed because GitHub's CDN returned a 404 for the pinned actions/setup-go SHA at download time (transient infrastructure issue). No code changes are needed; this empty commit re-triggers CI.