Skip to content
Open
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
800bd8d
Add design: Google Gemini embedding client
pmbrull May 7, 2026
9caae6c
Add implementation plan: Google Gemini embedding client
pmbrull May 7, 2026
4d389e9
feat(spec): add google embedding provider config block
pmbrull May 7, 2026
ed28632
feat(search): add GoogleEmbeddingClient with happy-path test
pmbrull May 7, 2026
e0aa250
refactor(search): extract MODELS_PREFIX constant in GoogleEmbeddingCl…
pmbrull May 7, 2026
544f8b1
test(search): add constructor validation tests for GoogleEmbeddingClient
pmbrull May 7, 2026
c2c04e8
test(search): add blank model id test and clarify null-modelId workar…
pmbrull May 7, 2026
0a5e281
test(search): add HTTP error and malformed response tests for GoogleE…
pmbrull May 7, 2026
bf39f6c
test(search): tighten empty values array assertion to check message
pmbrull May 7, 2026
257f92c
test(search): verify Google embedding request URL, headers, and body …
pmbrull May 7, 2026
399b3a7
test(search): extract endpoint constant and harden extractBody helper
pmbrull May 7, 2026
82342c9
feat(search): wire google embedding provider into SearchRepository sw…
pmbrull May 7, 2026
69912b7
test(search): cover null dimension and custom endpoint, drop redundan…
pmbrull May 7, 2026
a61bf0a
Update generated TypeScript types
github-actions[bot] May 7, 2026
e91c90f
Remove internal planning docs from PR
pmbrull May 7, 2026
7870e5c
Address PR review comments
pmbrull May 8, 2026
8174348
Update generated TypeScript types
github-actions[bot] May 8, 2026
69a5fb2
Wire google embedding provider into openmetadata.yaml defaults
pmbrull May 8, 2026
0b112df
Use gemini-embedding-001 default and pass outputDimensionality
pmbrull May 8, 2026
1581619
Update generated TypeScript types
github-actions[bot] May 8, 2026
a787b4c
Guard against missing google config in SystemRepository diagnostic
pmbrull May 8, 2026
e7c36ab
Validate google.endpoint contains :embedContent at construction
pmbrull May 8, 2026
30ddc3d
feat(spec): add modelId chat field to google block
pmbrull May 8, 2026
b00e1e2
Update generated TypeScript types
github-actions[bot] May 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion conf/openmetadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ elasticsearch:
naturalLanguageSearch:
enabled: ${NATURAL_LANGUAGE_SEARCH_ENABLED:-false}
semanticSearchEnabled: ${SEMANTIC_SEARCH_ENABLED:-false}
embeddingProvider: ${EMBEDDING_PROVIDER:-bedrock} # Options: "openai", "bedrock", "djl"
embeddingProvider: ${EMBEDDING_PROVIDER:-bedrock} # Options: "openai", "bedrock", "google", "djl"
maxConcurrentEmbeddingRequests: ${MAX_CONCURRENT_EMBEDDING_REQUESTS:-10}
providerClass: ${NATURAL_LANGUAGE_SEARCH_PROVIDER_CLASS:-org.openmetadata.service.search.nlq.NoOpNLQService}
bedrock:
Expand All @@ -515,6 +515,11 @@ elasticsearch:
apiVersion: ${OPENAI_API_VERSION:-"2024-02-01"} # Azure OpenAI API version
embeddingModelId: ${OPENAI_EMBEDDING_MODEL_ID:-"text-embedding-3-small"}
embeddingDimension: ${OPENAI_EMBEDDING_DIMENSION:-1536}
# Settings for the "google" embedding provider (consumed by GoogleEmbeddingClient).
# The client refuses to start unless apiKey and embeddingModelId are non-blank and
# embeddingDimension is a positive integer.
google:
apiKey: ${GOOGLE_API_KEY:-""} # API key from Google AI Studio
# Model id without the "models/" prefix; the client prepends that prefix itself.
embeddingModelId: ${GOOGLE_EMBEDDING_MODEL_ID:-"text-embedding-004"}
# Must be greater than zero.
embeddingDimension: ${GOOGLE_EMBEDDING_DIMENSION:-768}
# When left empty the client calls
# https://generativelanguage.googleapis.com/v1beta/models/<embeddingModelId>:embedContent
# When set, trailing slashes are removed and the value is used as the request URL.
endpoint: ${GOOGLE_API_ENDPOINT:-""} # Optional override; full :embedContent URL. Leave empty to use the default Generative Language API endpoint.
Comment thread
pmbrull marked this conversation as resolved.
djl:
embeddingModel: ${DJL_EMBEDDING_MODEL:-"ai.djl.huggingface.pytorch/sentence-transformers/all-MiniLM-L6-v2"}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@
import org.openmetadata.service.search.vector.client.BedrockEmbeddingClient;
import org.openmetadata.service.search.vector.client.DjlEmbeddingClient;
import org.openmetadata.service.search.vector.client.EmbeddingClient;
import org.openmetadata.service.search.vector.client.GoogleEmbeddingClient;
import org.openmetadata.service.search.vector.client.OpenAIEmbeddingClient;
import org.openmetadata.service.security.policyevaluator.SubjectContext;
import org.openmetadata.service.util.EntityUtil;
Expand Down Expand Up @@ -3227,6 +3228,13 @@ protected EmbeddingClient createEmbeddingClient(ElasticSearchConfiguration esCon
}
yield new OpenAIEmbeddingClient(esConfig);
}
case "google" -> {
if (config.getGoogle() == null) {
throw new IllegalStateException(
"Google configuration is required when using google provider");
}
yield new GoogleEmbeddingClient(esConfig);
}
case "djl" -> {
if (config.getDjl() == null) {
throw new IllegalStateException("DJL configuration is required when using djl provider");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
/*
* Copyright 2024 Collate
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package org.openmetadata.service.search.vector.client;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import java.io.IOException;
import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import lombok.extern.slf4j.Slf4j;
import org.openmetadata.schema.service.configuration.elasticsearch.ElasticSearchConfiguration;
import org.openmetadata.schema.service.configuration.elasticsearch.Google;
import org.openmetadata.schema.service.configuration.elasticsearch.NaturalLanguageSearchConfiguration;

@Slf4j
public final class GoogleEmbeddingClient extends EmbeddingClient {
private static final ObjectMapper MAPPER = new ObjectMapper();
private static final String MODELS_PREFIX = "models/";
private static final String DEFAULT_BASE_URL =
"https://generativelanguage.googleapis.com/v1beta/" + MODELS_PREFIX;

private final HttpClient httpClient;
private final String apiKey;
private final String modelId;
private final int dimension;
private final String endpoint;

public GoogleEmbeddingClient(ElasticSearchConfiguration config) {
super(resolveMaxConcurrent(config));
NaturalLanguageSearchConfiguration nlsCfg = config.getNaturalLanguageSearch();
Google googleCfg = nlsCfg.getGoogle();
if (googleCfg == null) {
throw new IllegalArgumentException("Google configuration is required");
}
if (googleCfg.getApiKey() == null || googleCfg.getApiKey().isBlank()) {
throw new IllegalArgumentException("Google API key is required");
}
if (googleCfg.getEmbeddingModelId() == null || googleCfg.getEmbeddingModelId().isBlank()) {
throw new IllegalArgumentException("Google embedding model ID is required");
}
if (googleCfg.getEmbeddingDimension() == null || googleCfg.getEmbeddingDimension() <= 0) {
throw new IllegalArgumentException("Google embedding dimension must be positive");
}

this.apiKey = googleCfg.getApiKey();
this.modelId = googleCfg.getEmbeddingModelId();
this.dimension = googleCfg.getEmbeddingDimension();
this.endpoint = resolveEndpoint(googleCfg);
this.httpClient = HttpClient.newBuilder().connectTimeout(Duration.ofSeconds(30)).build();

LOG.info(
"Initialized GoogleEmbeddingClient with model={}, dimension={}, endpoint={}",
modelId,
dimension,
endpoint);
}

GoogleEmbeddingClient(
HttpClient httpClient, String apiKey, String modelId, int dimension, String endpoint) {
this(httpClient, apiKey, modelId, dimension, endpoint, DEFAULT_MAX_CONCURRENT_REQUESTS);
}

GoogleEmbeddingClient(
HttpClient httpClient,
String apiKey,
String modelId,
int dimension,
String endpoint,
int maxConcurrentRequests) {
super(maxConcurrentRequests);
this.httpClient = httpClient;
this.apiKey = apiKey;
this.modelId = modelId;
this.dimension = dimension;
this.endpoint = endpoint;
}

private String resolveEndpoint(Google config) {
String configured = config.getEndpoint();
if (configured != null && !configured.isBlank()) {
return configured.replaceAll("/+$", "");
Comment thread
pmbrull marked this conversation as resolved.
Outdated
}
return DEFAULT_BASE_URL + config.getEmbeddingModelId() + ":embedContent";
}

@Override
protected float[] doEmbed(String text) {
if (text == null || text.isBlank()) {
throw new IllegalArgumentException("Input text must not be null or blank");
}

try {
String body = buildRequestBody(text);
HttpRequest request = buildRequest(body);
HttpResponse<String> response =
httpClient.send(request, HttpResponse.BodyHandlers.ofString());

if (response.statusCode() != 200) {
String errorMsg = extractErrorMessage(response.body());
throw new RuntimeException(
"Google API returned status " + response.statusCode() + ": " + errorMsg);
}

return parseEmbeddingResponse(response.body());
} catch (IOException e) {
LOG.error("IO error calling Google API: {}", e.getMessage(), e);
throw new RuntimeException("Google embedding generation failed due to IO error", e);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new RuntimeException("Google embedding generation was interrupted", e);
}
}

private String buildRequestBody(String text) throws IOException {
ObjectNode payload = MAPPER.createObjectNode();
payload.put("model", MODELS_PREFIX + modelId);
ObjectNode content = payload.putObject("content");
ArrayNode parts = content.putArray("parts");
ObjectNode part = parts.addObject();
part.put("text", text);
return MAPPER.writeValueAsString(payload);
}

private HttpRequest buildRequest(String body) {
// Google's Generative Language API requires the API key as a `key=` query parameter;
// it does not accept Bearer/Authorization headers for AI Studio keys.
String encodedKey = URLEncoder.encode(apiKey, StandardCharsets.UTF_8);
Comment on lines +151 to +154
String separator = endpoint.contains("?") ? "&" : "?";
String url = endpoint + separator + "key=" + encodedKey;
return HttpRequest.newBuilder()
.uri(URI.create(url))
.header("Content-Type", "application/json")
.timeout(Duration.ofSeconds(30))
.POST(HttpRequest.BodyPublishers.ofString(body))
.build();
Comment on lines +151 to +162
}

@Override
public int getDimension() {
return dimension;
}

@Override
public String getModelId() {
return modelId;
}

private float[] parseEmbeddingResponse(String responseBody) {
try {
JsonNode root = MAPPER.readTree(responseBody);
JsonNode embedding = root.get("embedding");
if (embedding == null || !embedding.isObject()) {
throw new RuntimeException("Invalid Google response: no embedding object found");
}
JsonNode values = embedding.get("values");
if (values == null || !values.isArray() || values.isEmpty()) {
throw new RuntimeException("Invalid Google response: no values array found");
}
float[] result = new float[values.size()];
for (int i = 0; i < values.size(); i++) {
result[i] = (float) values.get(i).asDouble();
}
return result;
} catch (IOException e) {
throw new RuntimeException("Failed to parse Google embedding response", e);
}
}

private String extractErrorMessage(String responseBody) {
try {
JsonNode root = MAPPER.readTree(responseBody);
JsonNode error = root.get("error");
if (error != null && error.has("message")) {
return error.get("message").asText();
}
} catch (Exception e) {
LOG.trace("Could not parse Google error envelope: {}", e.getMessage());
}
return responseBody;
}
}
Loading
Loading