Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions providers/aws/services/opensearch/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,14 +131,17 @@ func (c *Client) GetExistingCommitments(ctx context.Context) ([]common.Commitmen

// PurchaseCommitment purchases an OpenSearch Reserved Instance.
//
// PurchaseReservedInstanceOfferingInput has no Tags field tagging happens
// PurchaseReservedInstanceOfferingInput has no Tags field -- tagging happens
// post-purchase via opensearch:AddTags with a reserved-instance ARN
// (arn:aws:es:<region>:<account>:reserved-instance/<uuid>). AWS hasn't
// explicitly documented reserved-instance as a supported resource type for
// AddTags (only domain/data-source/application), so the call may return a
// validation error in which case retry.ErrPermanent short-circuits and the
// validation error -- in which case retry.ErrPermanent short-circuits and the
// failure is logged without blocking the purchase. If AWS ever adds support,
// this will start working with no code change.
// On tagging failure a structured line is emitted (OPENSEARCH_TAG_FAILED) so
// operator dashboards can alert on untagged RIs; see
// runbooks/opensearch-untagged-ri.md for the manual remediation steps.
func (c *Client) PurchaseCommitment(ctx context.Context, rec common.Recommendation, opts common.PurchaseOptions) (common.PurchaseResult, error) {
result := common.PurchaseResult{
Recommendation: rec,
Expand Down Expand Up @@ -211,7 +214,9 @@ func (c *Client) PurchaseCommitment(ctx context.Context, rec common.Recommendati
}

if err := c.tagReservedInstance(ctx, result.CommitmentID, rec, opts.Source); err != nil {
log.Printf("WARNING: failed to tag OpenSearch RI %s after purchase (RI is bought; tag missing, source recorded in purchase_history): %v", result.CommitmentID, err)
// Structured line for operator dashboards / alerting. The RI is
// purchased; only the tag is missing. Follow runbooks/opensearch-untagged-ri.md.
log.Printf("OPENSEARCH_TAG_FAILED commitment_id=%s error=%v", result.CommitmentID, err)
}

return result, nil
Expand Down
66 changes: 66 additions & 0 deletions providers/aws/services/opensearch/client_test.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package opensearch

import (
"bytes"
"context"
"fmt"
"log"
"strings"
"testing"
"time"
Expand Down Expand Up @@ -949,3 +951,67 @@ func TestClient_PurchaseCommitment_NoToken_RichReservationName(t *testing.T) {
assert.Contains(t, capturedName, "2x-3yr", "count and term must be embedded: %q", capturedName)
assert.LessOrEqual(t, len(capturedName), 60, "must fit AWS reservation-ID cap")
}

// TestPurchaseCommitment_TagFailure_StructuredLog asserts that when AddTags
// returns an error after a successful purchase:
// - the purchase result is still Success=true (tag failure is non-fatal)
// - a line containing "OPENSEARCH_TAG_FAILED" is emitted to the log
// - the commitment ID is present in that log line (for operator lookup)
// - no AWS account ID or other PII appears in the log line
func TestPurchaseCommitment_TagFailure_StructuredLog(t *testing.T) {
mockOS := &MockOpenSearchClient{}
mockSTS := &MockOpenSearchSTSClient{}
t.Cleanup(func() { mockOS.AssertExpectations(t); mockSTS.AssertExpectations(t) })

client := &Client{client: mockOS, stsClient: mockSTS, region: "us-east-1"}

rec := common.Recommendation{
Service: common.ServiceSearch,
ResourceType: "m5.large.search",
Count: 1,
Term: "1yr",
Region: "us-east-1",
PaymentOption: "all-upfront",
}

mockOS.On("DescribeReservedInstanceOfferings", mock.Anything, mock.Anything).
Return(&opensearch.DescribeReservedInstanceOfferingsOutput{
ReservedInstanceOfferings: []types.ReservedInstanceOffering{{
ReservedInstanceOfferingId: aws.String("off-tag-fail"),
InstanceType: types.OpenSearchPartitionInstanceTypeM5LargeSearch,
Duration: 31536000,
PaymentOption: types.ReservedInstancePaymentOptionAllUpfront,
}},
}, nil)

const riID = "ri-tag-fail-abc123"
mockOS.On("PurchaseReservedInstanceOffering", mock.Anything, mock.Anything).
Return(&opensearch.PurchaseReservedInstanceOfferingOutput{
ReservedInstanceId: aws.String(riID),
}, nil)

mockSTS.On("GetCallerIdentity", mock.Anything, mock.Anything).
Return(&sts.GetCallerIdentityOutput{Account: aws.String("000000000000")}, nil)

mockOS.On("AddTags", mock.Anything, mock.Anything).
Return(nil, fmt.Errorf("ValidationException: invalid resource type"))

// Redirect log output to capture the structured line.
var buf bytes.Buffer
origWriter := log.Writer()
log.SetOutput(&buf)
t.Cleanup(func() { log.SetOutput(origWriter) })

result, err := client.PurchaseCommitment(context.Background(), rec, common.PurchaseOptions{Source: common.PurchaseSourceCLI})

assert.NoError(t, err, "tag failure must not surface as a purchase error")
assert.True(t, result.Success, "purchase must remain successful when tagging fails")
assert.Equal(t, riID, result.CommitmentID)

logOut := buf.String()
assert.Contains(t, logOut, "OPENSEARCH_TAG_FAILED", "structured sentinel must appear in log")
assert.Contains(t, logOut, "commitment_id="+riID, "commitment ID must be present for operator lookup")
// The account ID (000000000000) must not be logged -- it is not PII but
// the structured line should stay minimal and scoped to the commitment.
assert.NotContains(t, logOut, "000000000000", "account ID must not appear in the tag-failure log line")
}
77 changes: 77 additions & 0 deletions runbooks/opensearch-untagged-ri.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Runbook: OpenSearch RI purchased but not tagged

## Alert

Log pattern (grep / CloudWatch Logs Insights):

```
OPENSEARCH_TAG_FAILED commitment_id=<id> error=<msg>
```

Emitted by `providers/aws/services/opensearch/client.go` when
`opensearch:AddTags` fails after a successful RI purchase. The RI is active;
only the CUDly source tag is absent.

## Background

AWS's `PurchaseReservedInstanceOffering` API has no inline `Tags` field.
CUDly attempts a best-effort `AddTags` call after purchase using a
constructed ARN of the form:

```
arn:aws:es:<region>:<account>:reserved-instance/<uuid>
```

AWS has not officially documented `reserved-instance` as a supported ARN
type for `opensearch:AddTags` (only `domain`, `data-source`, and
`application` are listed). The call is wrapped in `retry.ErrPermanent` so
the retry budget is not exhausted on calls AWS will never accept. If AWS
extends support, the tag call will start succeeding with no code change.

## Impact

The RI is purchased and active. Cost attribution and audit queries that
rely on the CUDly source tag (key: `cudly:purchase-source`) will not find
this reservation unless it is manually tagged.

## Remediation

1. Identify the untagged RI from the log line:

```
OPENSEARCH_TAG_FAILED commitment_id=<ri-uuid> error=...
```

2. Tag it manually via the AWS CLI:

```bash
REGION=<region>
ACCOUNT=<account-id>
RI_ID=<ri-uuid>
SOURCE=<purchase-source> # e.g. cudly-cli or cudly-web

aws opensearch add-tags \
--arn "arn:aws:es:${REGION}:${ACCOUNT}:reserved-instance/${RI_ID}" \
--tag-list \
Key=Purpose,Value="Reserved Instance Purchase" \
Key=Tool,Value=CUDly \
"Key=cudly:purchase-source,Value=${SOURCE}"
```

If the call returns a `ValidationException` the ARN type is still
unsupported by AWS; proceed to the fallback below.

3. **Fallback (AWS still rejects reserved-instance ARN):** Tag the parent
OpenSearch domain instead, or record the RI ID in your cost-allocation
spreadsheet until AWS adds native support.

## Follow-up

If this alert fires repeatedly (not just on ValidationException but on
transient errors), open an issue to add a retry with backoff instead of
the current permanent-error short-circuit. Reference issue #250.

If AWS releases documentation confirming `reserved-instance` support for
`AddTags`, remove the `retry.ErrPermanent` wrapper in
`providers/aws/services/opensearch/client.go:tagReservedInstance` and
update this runbook.
Loading