diff --git a/distribution/offloaders/pom.xml b/distribution/offloaders/pom.xml index e94893d192366..20f9ec11aca4d 100644 --- a/distribution/offloaders/pom.xml +++ b/distribution/offloaders/pom.xml @@ -53,7 +53,7 @@ ${project.groupId} - tiered-storage-jcloud + tiered-storage-opendal ${project.version} pom provided diff --git a/distribution/offloaders/src/assemble/README b/distribution/offloaders/src/assemble/README index ebc1c28016e01..7b16de5854d89 100644 --- a/distribution/offloaders/src/assemble/README +++ b/distribution/offloaders/src/assemble/README @@ -8,3 +8,14 @@ contains: * META-INF/DEPEDENCIES file with licensing information for all transitive dependencies + +Included offloaders: + + * tiered-storage-opendal-<version>.nar (cloud offloaders: aws-s3/S3/aliyun-oss/google-cloud-storage/azureblob) + * tiered-storage-file-system-<version>.nar (filesystem offloader) + +Migration / rollback guidance (to avoid offloader discovery conflicts): + + * Migration: ensure ${PULSAR_HOME}/offloaders contains only the OpenDAL NAR + (tiered-storage-opendal-*.nar). Do not keep tiered-storage-jcloud-*.nar alongside it. + * Rollback: remove tiered-storage-opendal-*.nar and restore tiered-storage-jcloud-*.nar. 
diff --git a/distribution/offloaders/src/assemble/offloaders.xml b/distribution/offloaders/src/assemble/offloaders.xml index 38f7eee906064..318755d56c04b 100644 --- a/distribution/offloaders/src/assemble/offloaders.xml +++ b/distribution/offloaders/src/assemble/offloaders.xml @@ -40,7 +40,7 @@ - ${basedir}/../../tiered-storage/jcloud/target/tiered-storage-jcloud-${project.version}.nar + ${basedir}/../../tiered-storage/opendal/target/tiered-storage-opendal-${project.version}.nar offloaders 644 diff --git a/docker/README.md b/docker/README.md index 733384c68528e..a625cfe8f577d 100644 --- a/docker/README.md +++ b/docker/README.md @@ -52,14 +52,14 @@ FROM apachepulsar/pulsar:${VERSION} # Add the cassandra connector (also works with ScyllaDB) COPY --from=pulsar-all /pulsar/connectors/pulsar-io-cassandra-*.nar /pulsar/connectors -# Add the jcloud offloader -COPY --from=pulsar-all /pulsar/connectors/tiered-storage-jcloud-*.nar /pulsar/offloaders +# Add the OpenDAL offloader +COPY --from=pulsar-all /pulsar/connectors/tiered-storage-opendal-*.nar /pulsar/offloaders ``` NOTE: the above example uses a wildcard in the `COPY` commands because argument expansion does not work for `COPY`. Assuming that you have the above `Dockerfile` in your local directory and are running docker on your local host, you can -run the following command to build a custom image with the cassandra connector and the jcloud offloader. The cassandra connector is compatible with both Apache Cassandra and ScyllaDB. +run the following command to build a custom image with the cassandra connector and the OpenDAL offloader. The cassandra connector is compatible with both Apache Cassandra and ScyllaDB. ```shell docker build --build-arg VERSION=2.9.1 -t pulsar-custom:2.9.1 . @@ -106,4 +106,4 @@ argument, you can run as the root user. If you're running your container on kubernetes, you can override the container's default user by setting the pod's `securityContext`. 
-Bitnami provides a helpful guide here: https://engineering.bitnami.com/articles/running-non-root-containers-on-openshift.html. \ No newline at end of file +Bitnami provides a helpful guide here: https://engineering.bitnami.com/articles/running-non-root-containers-on-openshift.html. diff --git a/pulsar-bom/pom.xml b/pulsar-bom/pom.xml index 1a0a9366949e7..4e761b6e8049e 100644 --- a/pulsar-bom/pom.xml +++ b/pulsar-bom/pom.xml @@ -680,6 +680,11 @@ tiered-storage-file-system ${project.version} + + org.apache.pulsar + tiered-storage-opendal + ${project.version} + org.apache.pulsar tiered-storage-jcloud diff --git a/tests/integration/src/test/java/org/apache/pulsar/tests/integration/containers/AzuriteContainer.java b/tests/integration/src/test/java/org/apache/pulsar/tests/integration/containers/AzuriteContainer.java new file mode 100644 index 0000000000000..7066595185dbb --- /dev/null +++ b/tests/integration/src/test/java/org/apache/pulsar/tests/integration/containers/AzuriteContainer.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pulsar.tests.integration.containers; + +import lombok.extern.slf4j.Slf4j; + +/** + * Azure Blob Storage emulator container (Azurite). 
+ */ +@Slf4j +public class AzuriteContainer extends ChaosContainer { + + public static final String NAME = "azurite"; + public static final int BLOB_PORT = 10000; + private static final String IMAGE_NAME = "mcr.microsoft.com/azure-storage/azurite:latest"; + + private final String hostname; + + public AzuriteContainer(String clusterName, String hostname, String accountName, String accountKey) { + super(clusterName, IMAGE_NAME); + this.hostname = hostname; + // `AZURITE_ACCOUNTS` format: "<accountName>:<accountKey>[;<accountName>:<accountKey>...]" + this.withEnv("AZURITE_ACCOUNTS", accountName + ":" + accountKey); + this.withExposedPorts(BLOB_PORT); + } + + @Override + public String getContainerName() { + return clusterName + "-" + hostname; + } + + @Override + public void start() { + this.withCreateContainerCmdModifier(createContainerCmd -> { + createContainerCmd.withHostName(hostname); + createContainerCmd.withName(getContainerName()); + }); + + super.start(); + log.info("Start Azurite service"); + } +} + diff --git a/tests/integration/src/test/java/org/apache/pulsar/tests/integration/containers/GcsContainer.java b/tests/integration/src/test/java/org/apache/pulsar/tests/integration/containers/GcsContainer.java new file mode 100644 index 0000000000000..97981879faa9c --- /dev/null +++ b/tests/integration/src/test/java/org/apache/pulsar/tests/integration/containers/GcsContainer.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pulsar.tests.integration.containers; + +import lombok.extern.slf4j.Slf4j; + +/** + * Google Cloud Storage emulator container. + */ +@Slf4j +public class GcsContainer extends ChaosContainer { + + public static final String NAME = "gcs"; + public static final int PORT = 4443; + private static final String IMAGE_NAME = "fsouza/fake-gcs-server:latest"; + + private final String hostname; + + public GcsContainer(String clusterName, String hostname) { + super(clusterName, IMAGE_NAME); + this.hostname = hostname; + this.withExposedPorts(PORT); + // Use HTTP to avoid TLS certificate issues inside the Pulsar broker container. 
+ this.withCommand("-scheme", "http", "-backend", "memory"); + } + + @Override + public String getContainerName() { + return clusterName + "-" + hostname; + } + + @Override + public void start() { + this.withCreateContainerCmdModifier(createContainerCmd -> { + createContainerCmd.withHostName(hostname); + createContainerCmd.withName(getContainerName()); + }); + + super.start(); + log.info("Start GCS emulator service"); + } +} + diff --git a/tests/integration/src/test/java/org/apache/pulsar/tests/integration/offload/TestAzureBlobOffload.java b/tests/integration/src/test/java/org/apache/pulsar/tests/integration/offload/TestAzureBlobOffload.java new file mode 100644 index 0000000000000..d6a903f2e2813 --- /dev/null +++ b/tests/integration/src/test/java/org/apache/pulsar/tests/integration/offload/TestAzureBlobOffload.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pulsar.tests.integration.offload; + +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.nio.charset.StandardCharsets; +import java.security.InvalidKeyException; +import java.security.NoSuchAlgorithmException; +import java.time.ZoneOffset; +import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; +import java.util.Base64; +import java.util.HashMap; +import java.util.Map; +import java.util.function.Supplier; +import javax.crypto.Mac; +import javax.crypto.spec.SecretKeySpec; +import lombok.extern.slf4j.Slf4j; +import org.apache.pulsar.tests.integration.containers.AzuriteContainer; +import org.testng.annotations.AfterClass; +import org.testng.annotations.Test; + +@Slf4j +public class TestAzureBlobOffload extends TestBaseOffload { + + private static final String STORAGE_ACCOUNT = "pulsar"; + private static final String STORAGE_KEY_BASE64 = "cHVsc2FyLXRlc3Qta2V5LXB1bHNhci10ZXN0LWtleS0xMjM0"; + private static final String CONTAINER = "pulsar-integtest"; + private static final String AZURE_API_VERSION = "2020-10-02"; + + private AzuriteContainer azuriteContainer; + + @Override + protected void beforeStartCluster() throws Exception { + super.beforeStartCluster(); + + log.info("Azurite init"); + azuriteContainer = new AzuriteContainer(pulsarCluster.getClusterName(), AzuriteContainer.NAME, + STORAGE_ACCOUNT, STORAGE_KEY_BASE64) + .withNetwork(pulsarCluster.getNetwork()) + .withNetworkAliases(AzuriteContainer.NAME); + azuriteContainer.start(); + createContainerIfMissing(); + log.info("Azurite start finish."); + } + + @AfterClass(alwaysRun = true) + public void teardownAzurite() { + if (azuriteContainer != null) { + azuriteContainer.stop(); + } + } + + @Test(dataProvider = "ServiceAndAdminUrls") + public void testPublishOffloadAndConsumeViaCLI(Supplier serviceUrl, Supplier adminUrl) + throws Exception { + 
super.testPublishOffloadAndConsumeViaCLI(serviceUrl.get(), adminUrl.get()); + } + + @Test(dataProvider = "ServiceAndAdminUrls") + public void testPublishOffloadAndConsumeViaThreshold(Supplier serviceUrl, Supplier adminUrl) + throws Exception { + super.testPublishOffloadAndConsumeViaThreshold(serviceUrl.get(), adminUrl.get()); + } + + @Test(dataProvider = "ServiceAndAdminUrls") + public void testPublishOffloadAndConsumeDeletionLag(Supplier serviceUrl, Supplier adminUrl) + throws Exception { + super.testPublishOffloadAndConsumeDeletionLag(serviceUrl.get(), adminUrl.get()); + } + + @Override + protected Map getEnv() { + Map result = new HashMap<>(); + result.put("managedLedgerMaxEntriesPerLedger", String.valueOf(getNumEntriesPerLedger())); + result.put("managedLedgerMinLedgerRolloverTimeMinutes", "0"); + result.put("managedLedgerOffloadDriver", "azureblob"); + + // OpenDAL uses `managedLedgerOffloadBucket` as the container name for azblob. + result.put("managedLedgerOffloadBucket", CONTAINER); + // Azurite uses a path-style endpoint with the storage account as the first path segment: http://host:port/<account> + result.put("managedLedgerOffloadServiceEndpoint", + "http://" + AzuriteContainer.NAME + ":" + AzuriteContainer.BLOB_PORT + "/" + STORAGE_ACCOUNT); + + // Keep compatibility with tiered-storage-jcloud behavior (env based credentials). 
+ result.put("AZURE_STORAGE_ACCOUNT", STORAGE_ACCOUNT); + result.put("AZURE_STORAGE_ACCESS_KEY", STORAGE_KEY_BASE64); + + return result; + } + + private void createContainerIfMissing() throws Exception { + String host = azuriteContainer.getHost(); + int port = azuriteContainer.getMappedPort(AzuriteContainer.BLOB_PORT); + + String date = DateTimeFormatter.RFC_1123_DATE_TIME.format(ZonedDateTime.now(ZoneOffset.UTC)); + String canonicalizedHeaders = "x-ms-date:" + date + "\n" + + "x-ms-version:" + AZURE_API_VERSION + "\n"; + String canonicalizedResource = "/" + STORAGE_ACCOUNT + "/" + CONTAINER + "\nrestype:container"; + + // See https://learn.microsoft.com/en-us/rest/api/storageservices/authorize-with-shared-key + String stringToSign = "PUT\n" // VERB + + "\n" // Content-Encoding + + "\n" // Content-Language + + "0\n" // Content-Length + + "\n" // Content-MD5 + + "\n" // Content-Type + + "\n" // Date (empty because x-ms-date is used) + + "\n" // If-Modified-Since + + "\n" // If-Match + + "\n" // If-None-Match + + "\n" // If-Unmodified-Since + + "\n" // Range + + canonicalizedHeaders + + canonicalizedResource; + + String signature = buildAzureSignature(STORAGE_KEY_BASE64, stringToSign); + String authorization = "SharedKey " + STORAGE_ACCOUNT + ":" + signature; + + String uri = "http://" + host + ":" + port + "/" + STORAGE_ACCOUNT + "/" + CONTAINER + "?restype=container"; + HttpClient client = HttpClient.newHttpClient(); + HttpRequest request = HttpRequest.newBuilder() + .uri(URI.create(uri)) + .header("x-ms-date", date) + .header("x-ms-version", AZURE_API_VERSION) + .header("Authorization", authorization) + // .header("Content-Length", "0") + .PUT(HttpRequest.BodyPublishers.ofByteArray(new byte[0])) + .build(); + HttpResponse response = client.send(request, HttpResponse.BodyHandlers.ofString()); + + int code = response.statusCode(); + if (code != 201 && code != 202 && code != 409) { + throw new RuntimeException("Failed to create Azurite container. 
status=" + code + + ", body=" + response.body()); + } + } + + private static String buildAzureSignature(String base64Key, String stringToSign) + throws NoSuchAlgorithmException, InvalidKeyException { + byte[] key = Base64.getDecoder().decode(base64Key); + Mac mac = Mac.getInstance("HmacSHA256"); + mac.init(new SecretKeySpec(key, "HmacSHA256")); + byte[] hmac = mac.doFinal(stringToSign.getBytes(StandardCharsets.UTF_8)); + return Base64.getEncoder().encodeToString(hmac); + } +} + diff --git a/tests/integration/src/test/java/org/apache/pulsar/tests/integration/offload/TestGcsOffload.java b/tests/integration/src/test/java/org/apache/pulsar/tests/integration/offload/TestGcsOffload.java new file mode 100644 index 0000000000000..fa750cad2e4c1 --- /dev/null +++ b/tests/integration/src/test/java/org/apache/pulsar/tests/integration/offload/TestGcsOffload.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pulsar.tests.integration.offload; + +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.util.HashMap; +import java.util.Map; +import java.util.function.Supplier; +import lombok.extern.slf4j.Slf4j; +import org.apache.pulsar.tests.integration.containers.GcsContainer; +import org.testng.annotations.AfterClass; +import org.testng.annotations.Test; + +@Slf4j +public class TestGcsOffload extends TestBaseOffload { + + private static final String BUCKET = "pulsar-integtest"; + private GcsContainer gcsContainer; + + @Override + protected void beforeStartCluster() throws Exception { + super.beforeStartCluster(); + + log.info("GCS emulator init"); + gcsContainer = new GcsContainer(pulsarCluster.getClusterName(), GcsContainer.NAME) + .withNetwork(pulsarCluster.getNetwork()) + .withNetworkAliases(GcsContainer.NAME); + gcsContainer.start(); + createBucketIfMissing(); + log.info("GCS emulator start finish."); + } + + @AfterClass(alwaysRun = true) + public void teardownGcs() { + if (gcsContainer != null) { + gcsContainer.stop(); + } + } + + @Test(dataProvider = "ServiceAndAdminUrls") + public void testPublishOffloadAndConsumeViaCLI(Supplier serviceUrl, Supplier adminUrl) + throws Exception { + super.testPublishOffloadAndConsumeViaCLI(serviceUrl.get(), adminUrl.get()); + } + + @Test(dataProvider = "ServiceAndAdminUrls") + public void testPublishOffloadAndConsumeViaThreshold(Supplier serviceUrl, Supplier adminUrl) + throws Exception { + super.testPublishOffloadAndConsumeViaThreshold(serviceUrl.get(), adminUrl.get()); + } + + @Test(dataProvider = "ServiceAndAdminUrls") + public void testPublishOffloadAndConsumeDeletionLag(Supplier serviceUrl, Supplier adminUrl) + throws Exception { + super.testPublishOffloadAndConsumeDeletionLag(serviceUrl.get(), adminUrl.get()); + } + + @Override + protected Map getEnv() { + Map result = new HashMap<>(); + 
result.put("managedLedgerMaxEntriesPerLedger", String.valueOf(getNumEntriesPerLedger())); + result.put("managedLedgerMinLedgerRolloverTimeMinutes", "0"); + result.put("managedLedgerOffloadDriver", "google-cloud-storage"); + + // Keep compatibility with legacy GCS keys. + result.put("gcsManagedLedgerOffloadBucket", BUCKET); + result.put("gcsManagedLedgerOffloadServiceEndpoint", + "http://" + GcsContainer.NAME + ":" + GcsContainer.PORT); + + return result; + } + + private void createBucketIfMissing() throws Exception { + String host = gcsContainer.getHost(); + int port = gcsContainer.getMappedPort(GcsContainer.PORT); + + HttpClient client = HttpClient.newHttpClient(); + String uri = "http://" + host + ":" + port + "/storage/v1/b?project=pulsar-it"; + HttpRequest request = HttpRequest.newBuilder() + .uri(URI.create(uri)) + .header("Content-Type", "application/json") + .POST(HttpRequest.BodyPublishers.ofString("{\"name\":\"" + BUCKET + "\"}")) + .build(); + HttpResponse response = client.send(request, HttpResponse.BodyHandlers.ofString()); + int code = response.statusCode(); + if (code != 200 && code != 409) { + throw new RuntimeException("Failed to create GCS bucket via emulator. 
status=" + code + + ", body=" + response.body()); + } + } +} diff --git a/tests/integration/src/test/java/org/apache/pulsar/tests/integration/suites/PulsarTieredStorageTestSuite.java b/tests/integration/src/test/java/org/apache/pulsar/tests/integration/suites/PulsarTieredStorageTestSuite.java index 6bf4061760b01..c668eb7edcfbc 100644 --- a/tests/integration/src/test/java/org/apache/pulsar/tests/integration/suites/PulsarTieredStorageTestSuite.java +++ b/tests/integration/src/test/java/org/apache/pulsar/tests/integration/suites/PulsarTieredStorageTestSuite.java @@ -19,18 +19,34 @@ package org.apache.pulsar.tests.integration.suites; import static java.util.stream.Collectors.joining; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.util.Comparator; +import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import java.util.stream.Stream; import lombok.extern.slf4j.Slf4j; import org.apache.pulsar.tests.integration.containers.BrokerContainer; +import org.apache.pulsar.tests.integration.containers.PulsarContainer; import org.apache.pulsar.tests.integration.topologies.PulsarClusterSpec; import org.apache.pulsar.tests.integration.topologies.PulsarClusterTestBase; +import org.testcontainers.containers.BindMode; import org.testng.annotations.AfterClass; import org.testng.annotations.BeforeClass; @Slf4j public abstract class PulsarTieredStorageTestSuite extends PulsarClusterTestBase { + private static final String IT_OFFLOADER_IMPL_PROPERTY = "pulsar.it.offloader"; + private static final String IT_OFFLOADER_IMPL_OPENDAL = "opendal"; + private static final String PULSAR_OFFLOADERS_DIR = "/pulsar/offloaders"; + + private Path overriddenOffloadersDir; + protected int getNumEntriesPerLedger() { return 1024; } @@ -55,6 +71,7 @@ public final void setupCluster() throws Exception { @Override public final void tearDownCluster() throws 
Exception { super.tearDownCluster(); + cleanupOverriddenOffloadersDir(); } protected abstract Map getEnv(); @@ -62,8 +79,111 @@ public final void tearDownCluster() throws Exception { @Override protected void beforeStartCluster() throws Exception { super.beforeStartCluster(); + maybeOverrideOffloadersWithOpenDAL(); for (BrokerContainer brokerContainer : pulsarCluster.getBrokers()) { getEnv().forEach(brokerContainer::withEnv); } } + + private void maybeOverrideOffloadersWithOpenDAL() throws IOException { + String offloaderImpl = System.getProperty(IT_OFFLOADER_IMPL_PROPERTY, "").trim(); + if (!IT_OFFLOADER_IMPL_OPENDAL.equalsIgnoreCase(offloaderImpl)) { + return; + } + + Path openDalNar = findOpenDalOffloaderNar(); + overriddenOffloadersDir = Files.createTempDirectory("pulsar-offloaders-opendal-"); + Files.copy(openDalNar, overriddenOffloadersDir.resolve(openDalNar.getFileName()), + StandardCopyOption.REPLACE_EXISTING); + + log.info("Overriding broker offloaders with OpenDAL NAR: {} -> {}", openDalNar, overriddenOffloadersDir); + for (BrokerContainer brokerContainer : pulsarCluster.getBrokers()) { + brokerContainer.withFileSystemBind(overriddenOffloadersDir.toString(), PULSAR_OFFLOADERS_DIR, + BindMode.READ_ONLY); + } + } + + private static Path findOpenDalOffloaderNar() throws IOException { + Path root = findRepoRoot(); + Path targetDir = root.resolve("tiered-storage").resolve("opendal").resolve("target"); + if (!Files.isDirectory(targetDir)) { + throw new IOException("OpenDAL offloader target dir not found: " + targetDir + + " (build it first: mvn -pl tiered-storage/opendal -DskipTests package)"); + } + + try (Stream stream = Files.list(targetDir)) { + List nars = stream + .filter(p -> p.getFileName().toString().startsWith("tiered-storage-opendal-")) + .filter(p -> p.getFileName().toString().endsWith(".nar")) + .sorted(Comparator.comparingLong(PulsarTieredStorageTestSuite::safeLastModifiedMillis)) + .collect(Collectors.toList()); + if (nars.isEmpty()) { + throw new 
IOException("No OpenDAL offloader NAR found under " + targetDir + + " (build it first: mvn -pl tiered-storage/opendal -DskipTests package)"); + } + return nars.get(nars.size() - 1); + } + } + + private static Path findRepoRoot() throws IOException { + // Allow explicit override for custom layouts. + String explicit = System.getProperty("pulsar.repo.root"); + if (explicit != null && !explicit.isBlank()) { + Path root = Paths.get(explicit).toAbsolutePath().normalize(); + if (!Files.isDirectory(root)) { + throw new IOException("pulsar.repo.root is not a directory: " + root); + } + return root; + } + + // Prefer a Maven-provided path we explicitly export in `tests/integration/pom.xml`. + // Example value: `${project.build.directory}` = `/tests/integration/target`. + String buildDir = System.getProperty("maven.buildDirectory"); + Path start = (buildDir != null && !buildDir.isBlank()) + ? Paths.get(buildDir) + : Paths.get(System.getProperty("user.dir", ".")); + + Path current = start.toAbsolutePath().normalize(); + for (int i = 0; i < 10 && current != null; i++) { + if (Files.isDirectory(current.resolve("tiered-storage").resolve("opendal"))) { + return current; + } + current = current.getParent(); + } + + throw new IOException("Failed to locate Pulsar repo root from " + start + + " (set -Dpulsar.repo.root=/path/to/pulsar if needed)"); + } + + private static long safeLastModifiedMillis(Path path) { + try { + return Files.getLastModifiedTime(path).toMillis(); + } catch (IOException e) { + return 0; + } + } + + private void cleanupOverriddenOffloadersDir() { + if (overriddenOffloadersDir == null) { + return; + } + if (PulsarContainer.PULSAR_CONTAINERS_LEAVE_RUNNING) { + log.warn("Not deleting overridden offloaders dir {} because PULSAR_CONTAINERS_LEAVE_RUNNING=true", + overriddenOffloadersDir); + return; + } + try (Stream walk = Files.walk(overriddenOffloadersDir)) { + walk.sorted(Comparator.reverseOrder()).forEach(p -> { + try { + Files.deleteIfExists(p); + } catch 
(IOException e) { + log.warn("Failed to delete {}", p, e); + } + }); + } catch (IOException e) { + log.warn("Failed to cleanup overridden offloaders dir {}", overriddenOffloadersDir, e); + } finally { + overriddenOffloadersDir = null; + } + } } diff --git a/tests/integration/src/test/resources/tiered-opendal-storage.xml b/tests/integration/src/test/resources/tiered-opendal-storage.xml new file mode 100644 index 0000000000000..b83308558f874 --- /dev/null +++ b/tests/integration/src/test/resources/tiered-opendal-storage.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + diff --git a/tiered-storage/opendal/README.md b/tiered-storage/opendal/README.md new file mode 100644 index 0000000000000..7bbb11268f43a --- /dev/null +++ b/tiered-storage/opendal/README.md @@ -0,0 +1,58 @@ +# Tiered Storage / OpenDAL offloader + +This module provides a Pulsar tiered storage (ledger offload) implementation based on the +Apache OpenDAL Java binding. + +The build produces the offloader NAR: + +- `tiered-storage-opendal-.nar` (Maven artifact: `org.apache.pulsar:tiered-storage-opendal`) + +## Supported driver names (no broker config change) + +The OpenDAL offloader is compatible with the existing driver names so that existing broker +configuration continues to work: + +- `aws-s3` / `S3` +- `aliyun-oss` +- `google-cloud-storage` +- `azureblob` +- `transient` (tests/debug only) + +## Distribution packaging + +The default offloader distribution is expected to include only one “cloud” offloader NAR to +avoid discovery conflicts (multiple NARs claiming the same driver names). + +## Migration / rollback (operational guidance) + +These steps only affect what NAR is present under `${PULSAR_HOME}/offloaders`: + +- Migration (jcloud → opendal): + - Stop the broker. + - Remove any `tiered-storage-jcloud-*.nar` from `${PULSAR_HOME}/offloaders`. + - Ensure `tiered-storage-opendal-*.nar` is present in `${PULSAR_HOME}/offloaders`. + - Start the broker. 
Existing `managedLedgerOffloadDriver=aws-s3|...` settings stay the same. + +- Rollback (opendal → jcloud): + - Stop the broker. + - Remove `tiered-storage-opendal-*.nar` from `${PULSAR_HOME}/offloaders`. + - Restore `tiered-storage-jcloud-*.nar` into `${PULSAR_HOME}/offloaders`. + - Start the broker. + +## Compatibility constraints (must not change) + +To keep historical offloaded data readable across implementations, the OpenDAL offloader is +required to preserve these invariants: + +- Object key naming stays the same as the legacy JCloud implementation: + - `uuid-ledger-<ledgerId>`, `uuid-ledger-<ledgerId>-index`, `uuid-index`, etc. +- Data object binary format is unchanged: + - `DataBlockHeader(128B) + entries + padding` + - Padding semantics must remain compatible with the existing “negative `readInt()` triggers seek fixup” behavior. +- Index object binary format is unchanged: + - Index block V1/V2: `magic word + length + metadata + sparse index entries` + - Serialization rules must match the existing implementation. +- Format version field stays unchanged: + - `S3ManagedLedgerOffloaderFormatVersion=1` + - Metadata key normalization differs across backends, so reads must tolerate key case differences and/or missing keys + with a safe fallback (without breaking reads of existing data). 
diff --git a/tiered-storage/opendal/pom.xml b/tiered-storage/opendal/pom.xml new file mode 100644 index 0000000000000..02b123e980cf6 --- /dev/null +++ b/tiered-storage/opendal/pom.xml @@ -0,0 +1,104 @@ + + + 4.0.0 + + + org.apache.pulsar + tiered-storage-parent + 4.2.0-SNAPSHOT + + + tiered-storage-opendal + Apache Pulsar :: Tiered Storage :: OpenDAL + + + + ${project.groupId} + managed-ledger + ${project.version} + provided + + + + org.apache.opendal + opendal + ${opendal.version} + + + + org.apache.opendal + opendal + ${opendal.version} + ${os.detected.classifier} + runtime + + + + ${project.groupId} + testmocks + ${project.version} + test + + + + + + + org.apache.nifi + nifi-nar-maven-plugin + + + + com.github.spotbugs + spotbugs-maven-plugin + ${spotbugs-maven-plugin.version} + + ${basedir}/src/main/resources/findbugsExclude.xml + + + + spotbugs + verify + + check + + + + + + + org.apache.maven.plugins + maven-checkstyle-plugin + + + checkstyle + verify + + check + + + + + + + diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/BackedInputStream.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/BackedInputStream.java new file mode 100644 index 0000000000000..34ee4ea0f13e2 --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/BackedInputStream.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bookkeeper.mledger.offload.jcloud; + +import java.io.IOException; +import java.io.InputStream; + +/** + * Abstract input stream class. + */ +public abstract class BackedInputStream extends InputStream { + public abstract void seek(long position); + public abstract void seekForward(long position) throws IOException; + public abstract long getCurrentPosition(); +} diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/BlockAwareSegmentInputStream.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/BlockAwareSegmentInputStream.java new file mode 100644 index 0000000000000..c25f162566f60 --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/BlockAwareSegmentInputStream.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bookkeeper.mledger.offload.jcloud; + +import java.io.InputStream; +import org.apache.bookkeeper.client.api.ReadHandle; + +/** + * The BlockAwareSegmentInputStream for each cold storage data block. + * Concrete streams extend this abstract class, which itself extends InputStream. + * It gets data from a ledger, and the content read out of it is one data block, laid out as: + * DataBlockHeader + entries(each with format[[entry_size -- int][entry_id -- long][entry_data]]) + padding + */ +public abstract class BlockAwareSegmentInputStream extends InputStream { + /** + * Get the ledger from which this InputStream reads data. + * + * @return the read handle of the ledger + */ + public abstract ReadHandle getLedger(); + + /** + * Get the start entry id contained in this InputStream. + * + * @return the start entry id + */ + public abstract long getStartEntryId(); + + /** + * Get the block size that can be read out from this InputStream. + * + * @return the block size + */ + public abstract int getBlockSize(); + + /** + * Get the count of entries that have been read out from this InputStream. + * + * @return the block entry count + */ + public abstract int getBlockEntryCount(); + + /** + * Get the end entry id contained in this InputStream. + * + * @return the end entry id + */ + public abstract long getEndEntryId(); + + /** + * Get the sum of the entry data sizes read from this InputStream. 
+ * + * @return the block entry bytes count + */ + public abstract int getBlockEntryBytesCount(); +} diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/DataBlockHeader.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/DataBlockHeader.java new file mode 100644 index 0000000000000..ce0d6222c1e3d --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/DataBlockHeader.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bookkeeper.mledger.offload.jcloud; + +import java.io.InputStream; +import org.apache.bookkeeper.common.annotation.InterfaceStability.Unstable; + +/** + * The data block header in tiered storage for each data block. + * + * <p>Currently, it is in the format: + * [ magic_word -- int ][ block_len -- int ][ first_entry_id -- long][padding] + * + * with the size: 4 + 4 + 8 + padding = 128 Bytes</p>
+ */ +@Unstable +public interface DataBlockHeader { + + /** + * Get the length of the block in bytes, including the header. + * + * @return the block length in bytes + */ + long getBlockLength(); + + /** + * Get the message entry id of the first message stored in this data block. + * + * @return the first entry id + */ + long getFirstEntryId(); + + /** + * Get the size of this DataBlockHeader. + * + * @return the header length + */ + long getHeaderLength(); + + /** + * Get the content of the data block header as an InputStream. + * The content is read out in the current format described on this interface. + * + * @return a stream over the serialized header + */ + InputStream toStream(); +} + diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/OffloadIndexBlock.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/OffloadIndexBlock.java new file mode 100644 index 0000000000000..077f16a2beb90 --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/OffloadIndexBlock.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.bookkeeper.mledger.offload.jcloud; + +import java.io.Closeable; +import java.io.FilterInputStream; +import java.io.IOException; +import java.io.InputStream; +import org.apache.bookkeeper.client.api.LedgerMetadata; +import org.apache.bookkeeper.common.annotation.InterfaceStability.Unstable; + +/** + * The index block abstraction used to offload a ledger to long term storage. + * + * This is the v1 view of the index: the ledgerId-taking methods declared by OffloadIndexBlockV2 + * get default implementations here that ignore the ledger id and delegate to the v1 methods. + */ +@Unstable +public interface OffloadIndexBlock extends Closeable, OffloadIndexBlockV2 { + + /** + * Get the content of the index block as InputStream. + * Read out in format: + * | index_magic_header | index_block_len | index_entry_count | + * | data_object_size | segment_metadata_length | segment metadata | index entries ... | + * + * @return a stream over the serialized index block that also reports its size + */ + IndexInputStream toStream() throws IOException; + + /** + * Get the related OffloadIndexEntry that contains the given messageEntryId. + * + * @param messageEntryId + * the entry id of message + * @return the offload index entry + */ + OffloadIndexEntry getIndexEntryForEntry(long messageEntryId) throws IOException; + + /** + * Get the count of entries contained in this index block. + * + * @return the entry count + */ + int getEntryCount(); + + /** + * Get the LedgerMetadata. + * + * @return the ledger metadata + */ + LedgerMetadata getLedgerMetadata(); + + /** + * Get the total size of the data object. + * + * @return the data object length + */ + long getDataObjectLength(); + + /** + * Get the length of the header in the blocks in the data object. + * + * @return the data block header length + */ + long getDataBlockHeaderLength(); + + /** + * An input stream which knows the size of the stream upfront. + * The size is supplied by the creator and is not derived from the wrapped stream. + */ + class IndexInputStream extends FilterInputStream { + final long streamSize; + + /** + * @param in the stream to wrap + * @param streamSize the number of bytes the creator declares the stream to contain + */ + public IndexInputStream(InputStream in, long streamSize) { + super(in); + this.streamSize = streamSize; + } + + /** + * @return the number of bytes in the stream. 
+ */ + public long getStreamSize() { + return streamSize; + } + } + + /** + * V2 adapter: ignores {@code ledgerId} and delegates to {@link #getIndexEntryForEntry(long)}. + */ + default OffloadIndexEntry getIndexEntryForEntry(long ledgerId, long messageEntryId) throws IOException { + return getIndexEntryForEntry(messageEntryId); + } + + /** + * V2 adapter: ignores {@code ledgerId} and always returns 0. + */ + default long getStartEntryId(long ledgerId) { + return 0; // Offload index block v1 always starts at entry 0. + } + + /** + * V2 adapter: ignores {@code ledgerId} and delegates to {@link #getLedgerMetadata()}. + */ + default LedgerMetadata getLedgerMetadata(long ledgerId) { + return getLedgerMetadata(); + } + +} + diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/OffloadIndexBlockBuilder.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/OffloadIndexBlockBuilder.java new file mode 100644 index 0000000000000..cf73cf7f0b632 --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/OffloadIndexBlockBuilder.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.bookkeeper.mledger.offload.jcloud; + +import java.io.IOException; +import java.io.InputStream; +import org.apache.bookkeeper.client.api.LedgerMetadata; +import org.apache.bookkeeper.common.annotation.InterfaceAudience.LimitedPrivate; +import org.apache.bookkeeper.common.annotation.InterfaceStability.Unstable; +import org.apache.bookkeeper.mledger.offload.jcloud.impl.OffloadIndexBlockV2BuilderImpl; + +/** + * Interface for the builder of the index block used to offload a ledger to long term storage. + */ +@Unstable +@LimitedPrivate +public interface OffloadIndexBlockBuilder { + + /** + * Build index block with the passed in ledger metadata. + * + * @param metadata the ledger metadata + * @return a builder, so calls can be chained + */ + OffloadIndexBlockBuilder withLedgerMetadata(LedgerMetadata metadata); + + /** + * Add the information of one payload block to the index block. + * It contains the first entryId in payload block, the payload block Id, + * and payload block size. + * This information will be used to build one index entry in OffloadIndexBlock. + * + * @param firstEntryId the first entryId in payload block + * @param partId the payload block Id + * @param blockSize the payload block size + * @return a builder, so calls can be chained + */ + OffloadIndexBlockBuilder addBlock(long firstEntryId, int partId, int blockSize); + + /** + * Specify the length of data object this index is associated with. + * @param dataObjectLength the length of the data object + * @return a builder, so calls can be chained + */ + OffloadIndexBlockBuilder withDataObjectLength(long dataObjectLength); + + /** + * Specify the length of the block headers in the data object. + * @param dataHeaderLength the length of the headers + * @return a builder, so calls can be chained + */ + OffloadIndexBlockBuilder withDataBlockHeaderLength(long dataHeaderLength); + + /** + * Finalize the immutable OffloadIndexBlock. + * + * @return the built index block + */ + OffloadIndexBlock build(); + + /** + * Construct an offload index from an InputStream. + * + * @param is the stream to read the serialized index from + * @return the index, typed as OffloadIndexBlockV2 + */ + OffloadIndexBlockV2 fromStream(InputStream is) throws IOException; + + /** + * Create an OffloadIndexBlockBuilder; the returned instance is an OffloadIndexBlockV2BuilderImpl. 
+ */ + static OffloadIndexBlockBuilder create() { + return new OffloadIndexBlockV2BuilderImpl(); + } +} diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/OffloadIndexBlockV2.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/OffloadIndexBlockV2.java new file mode 100644 index 0000000000000..9dc3f00771a23 --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/OffloadIndexBlockV2.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bookkeeper.mledger.offload.jcloud; + +import java.io.Closeable; +import java.io.IOException; +import org.apache.bookkeeper.client.api.LedgerMetadata; +import org.apache.bookkeeper.common.annotation.InterfaceStability.Unstable; + +/** + * The index block abstraction used to offload a ledger to long term storage. + * Unlike OffloadIndexBlock, the lookups declared here take a ledger id in addition to the entry id. + */ +@Unstable +public interface OffloadIndexBlockV2 extends Closeable { + + /** + * Get the content of the index block as InputStream. 
+ * Read out in format: + * | index_magic_header | index_block_len | index_entry_count | + * | data_object_size | segment_metadata_length | segment metadata | index entries ... | + * + * @return a stream over the serialized index block that also reports its size + */ + OffloadIndexBlock.IndexInputStream toStream() throws IOException; + + /** + * Get the related OffloadIndexEntry that contains the given messageEntryId. + * + * @param ledgerId + * the id of the ledger (presumably the ledger the entry belongs to - confirm against implementations) + * @param messageEntryId + * the entry id of message + * @return the offload index entry + */ + OffloadIndexEntry getIndexEntryForEntry(long ledgerId, long messageEntryId) throws IOException; + + /** + * Get the start entry id for the given ledger. + * + * @param ledgerId the id of the ledger + * @return the start entry id + */ + long getStartEntryId(long ledgerId); + + /** + * Get the count of entries contained in this index block. + * + * @return the entry count + */ + int getEntryCount(); + + /** + * Get the LedgerMetadata for the given ledger. + * @param ledgerId the id of the ledger + * @return the ledger metadata + */ + LedgerMetadata getLedgerMetadata(long ledgerId); + + /** + * Get the total size of the data object. + * + * @return the data object length + */ + long getDataObjectLength(); + + /** + * Get the length of the header in the blocks in the data object. + * + * @return the data block header length + */ + long getDataBlockHeaderLength(); +} + diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/OffloadIndexBlockV2Builder.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/OffloadIndexBlockV2Builder.java new file mode 100644 index 0000000000000..8373b7eb54125 --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/OffloadIndexBlockV2Builder.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bookkeeper.mledger.offload.jcloud; + +import java.io.IOException; +import java.io.InputStream; +import org.apache.bookkeeper.common.annotation.InterfaceAudience.LimitedPrivate; +import org.apache.bookkeeper.common.annotation.InterfaceStability.Unstable; +import org.apache.bookkeeper.mledger.offload.jcloud.impl.OffloadIndexBlockV2BuilderImpl; +import org.apache.bookkeeper.mledger.proto.MLDataFormats.ManagedLedgerInfo.LedgerInfo; + +/** + * Interface for the builder of the index block used to offload a ledger to long term storage. + * Every per-ledger call takes the ledger id explicitly. + */ +@Unstable +@LimitedPrivate +public interface OffloadIndexBlockV2Builder { + + /** + * Add the metadata of one ledger to the index block being built. + * + * @param ledgerId the id of the ledger the metadata belongs to + * @param metadata the ledger metadata + * @return a builder, so calls can be chained + */ + OffloadIndexBlockV2Builder addLedgerMeta(Long ledgerId, LedgerInfo metadata); + + /** + * Add the information of one payload block to the index block. + * It contains the first entryId in payload block, the payload block Id, + * and payload block size. + * This information will be used to build one index entry in the index block. + * + * @param ledgerId the id of the ledger (presumably the ledger the payload block's entries belong to - confirm) + * @param firstEntryId the first entryId in payload block + * @param partId the payload block Id + * @param blockSize the payload block size + * @return a builder, so calls can be chained + */ + OffloadIndexBlockV2Builder addBlock(long ledgerId, long firstEntryId, int partId, int blockSize); + + /** + * Specify the length of data object this index is associated with. 
+ * @param dataObjectLength the length of the data object + * @return a builder, so calls can be chained + */ + OffloadIndexBlockV2Builder withDataObjectLength(long dataObjectLength); + + /** + * Specify the length of the block headers in the data object. + * @param dataHeaderLength the length of the headers + * @return a builder, so calls can be chained + */ + OffloadIndexBlockV2Builder withDataBlockHeaderLength(long dataHeaderLength); + + /** + * Finalize the immutable OffloadIndexBlockV2. + * + * @return the built index block + */ + OffloadIndexBlockV2 buildV2(); + + /** + * Construct an offload index from an InputStream. + * + * @param is the stream to read the serialized index from + * @return the index read from the stream + */ + OffloadIndexBlockV2 fromStream(InputStream is) throws IOException; + + /** + * Create an OffloadIndexBlockV2Builder; the returned instance is an OffloadIndexBlockV2BuilderImpl. + */ + static OffloadIndexBlockV2Builder create() { + return new OffloadIndexBlockV2BuilderImpl(); + } +} diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/OffloadIndexEntry.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/OffloadIndexEntry.java new file mode 100644 index 0000000000000..935bb54f602c0 --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/OffloadIndexEntry.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.bookkeeper.mledger.offload.jcloud; + +import org.apache.bookkeeper.common.annotation.InterfaceAudience.LimitedPrivate; +import org.apache.bookkeeper.common.annotation.InterfaceStability.Unstable; + +/** + * The index entry in OffloadIndexBlock. + * It consists of the message entry id, the tiered storage block part id for this message entry, + * and the offset in tiered storage block for this message id. + */ +@Unstable +@LimitedPrivate +public interface OffloadIndexEntry { + + /** + * Get the entryId that this entry contains. + * + * @return the entry id + */ + long getEntryId(); + + /** + * Get the block part id of tiered storage. + * + * @return the part id + */ + int getPartId(); + + /** + * Get the offset of this block within the object. + * + * @return the offset of the block + */ + long getOffset(); + + /** + * Get the offset of the block's data within the object. + * NOTE(review): presumably getOffset() plus the data block header length - confirm against the implementation. + * + * @return the offset of the block's data + */ + long getDataOffset(); +} + diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/BlockAwareSegmentInputStreamImpl.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/BlockAwareSegmentInputStreamImpl.java new file mode 100644 index 0000000000000..06d7f2129ba31 --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/BlockAwareSegmentInputStreamImpl.java @@ -0,0 +1,349 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bookkeeper.mledger.offload.jcloud.impl; + +import static com.google.common.base.Preconditions.checkState; +import com.google.common.collect.Lists; +import com.google.common.primitives.Ints; +import io.netty.buffer.ByteBuf; +import io.netty.buffer.CompositeByteBuf; +import java.io.IOException; +import java.io.InputStream; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.bookkeeper.client.api.LedgerEntries; +import org.apache.bookkeeper.client.api.LedgerEntry; +import org.apache.bookkeeper.client.api.ReadHandle; +import org.apache.bookkeeper.mledger.LedgerOffloaderStats; +import org.apache.bookkeeper.mledger.offload.jcloud.BlockAwareSegmentInputStream; +import org.apache.pulsar.common.allocator.PulsarByteBufAllocator; +import org.apache.pulsar.common.naming.TopicName; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * The BlockAwareSegmentInputStreamImpl for each cold storage data block. + * It reads entries from the ledger and exposes them as the content of one data block, laid out as: 
+ * DataBlockHeader + entries(each with format[[entry_size -- int][entry_id -- long][entry_data]]) + padding + */ +public class BlockAwareSegmentInputStreamImpl extends BlockAwareSegmentInputStream { + private static final Logger log = LoggerFactory.getLogger(BlockAwareSegmentInputStreamImpl.class); + + static final int[] BLOCK_END_PADDING = new int[]{ 0xFE, 0xDC, 0xDE, 0xAD }; + static final byte[] BLOCK_END_PADDING_BYTES = Ints.toByteArray(0xFEDCDEAD); + + private final ByteBuf paddingBuf = PulsarByteBufAllocator.DEFAULT.buffer(128, 128); + + private final ReadHandle ledger; + private final long startEntryId; + private final int blockSize; + + // Number of message entries that have been read from the ledger and completely read out of this InputStream. + private int blockEntryCount; + + // Tracks the read status for both header and entries: + // the number of bytes that have already been read from this InputStream. + private int bytesReadOffset = 0; + // Every byte from this index onwards is a padding byte. + private int dataBlockFullOffset; + private final InputStream dataBlockHeaderStream; + + // How many entries to request from the ReadHandle each time. + private static final int ENTRIES_PER_READ = 100; + // Size of the per-entry header that holds the entry size and the entry id. + static final int ENTRY_HEADER_SIZE = 4 /* entry size */ + 8 /* entry id */; + // Pending entries; each ByteBuf is a composite of 2 bufs: entry header and entry content. 
+ private List<ByteBuf> entriesByteBuf = null; + private LedgerOffloaderStats offloaderStats; + private String managedLedgerName; + private String topicName; + private final AtomicBoolean close = new AtomicBoolean(false); + + /** + * Create a stream for one data block that does not record offloader stats. + * + * @param ledger the ledger to read entries from + * @param startEntryId the id of the first entry to put into this block + * @param blockSize the total number of bytes this stream produces, including header and padding + */ + public BlockAwareSegmentInputStreamImpl(ReadHandle ledger, long startEntryId, int blockSize) { + this.ledger = ledger; + this.startEntryId = startEntryId; + this.blockSize = blockSize; + this.dataBlockHeaderStream = DataBlockHeaderImpl.of(blockSize, startEntryId).toStream(); + this.blockEntryCount = 0; + this.dataBlockFullOffset = blockSize; + this.entriesByteBuf = Lists.newLinkedList(); + } + + /** + * Create a stream for one data block that records the ledger read latency. + * + * @param ledger the ledger to read entries from + * @param startEntryId the id of the first entry to put into this block + * @param blockSize the total number of bytes this stream produces, including header and padding + * @param offloaderStats the stats to record the read latency on + * @param ledgerName the managed ledger name; the topic name used for the stats is derived from it + */ + public BlockAwareSegmentInputStreamImpl(ReadHandle ledger, long startEntryId, int blockSize, + LedgerOffloaderStats offloaderStats, String ledgerName) { + this(ledger, startEntryId, blockSize); + this.offloaderStats = offloaderStats; + this.managedLedgerName = ledgerName; + this.topicName = TopicName.fromPersistenceNamingEncoding(ledgerName); + } + + /** + * Read up to {@code len} bytes of entry data, or of end padding once no further entry fits. + * At most the remainder of the first pending entry is returned per call. + * + * @param len the maximum number of bytes to return + * @return a retained buffer with the bytes read; the caller must release it + */ + private ByteBuf readEntries(int len) throws IOException { + checkState(bytesReadOffset >= DataBlockHeaderImpl.getDataStartOffset()); + checkState(bytesReadOffset < blockSize); + + // once reach the end of entry buffer, read more, if there is more + if (bytesReadOffset < dataBlockFullOffset + && entriesByteBuf.isEmpty() + && startEntryId + blockEntryCount <= ledger.getLastAddConfirmed()) { + entriesByteBuf = readNextEntriesFromLedger(startEntryId + blockEntryCount, ENTRIES_PER_READ); + } + + if (!entriesByteBuf.isEmpty() + && bytesReadOffset + entriesByteBuf.get(0).readableBytes() <= blockSize) { + // always read from the first ByteBuf in the list, once read all of its content remove it. 
+ ByteBuf entryByteBuf = entriesByteBuf.get(0); + int readableBytes = entryByteBuf.readableBytes(); + int read = Math.min(readableBytes, len); + // Slice relative to readerIndex instead of a separately tracked offset, so that this path + // stays consistent with the single-byte readEntries(), which advances readerIndex directly. + // readRetainedSlice() retains the slice and advances readerIndex in one step. + ByteBuf buf = entryByteBuf.readRetainedSlice(read); + bytesReadOffset += read; + + if (entryByteBuf.readableBytes() == 0) { + entryByteBuf.release(); + entriesByteBuf.remove(0); + blockEntryCount++; + } + + return buf; + } else { + // no space for a new entry or there are no more entries + // set data block full, return end padding + if (dataBlockFullOffset == blockSize) { + dataBlockFullOffset = bytesReadOffset; + } + paddingBuf.clear(); + for (int i = 0; i < Math.min(len, paddingBuf.capacity()); i++) { + paddingBuf.writeByte(BLOCK_END_PADDING_BYTES[(bytesReadOffset++ - dataBlockFullOffset) + % BLOCK_END_PADDING_BYTES.length]); + } + return paddingBuf.retain(); + } + } + + /** + * Read a single byte of entry data, or of end padding once no further entry fits. + * + * @return the byte read, as an unsigned value + */ + private int readEntries() throws IOException { + checkState(bytesReadOffset >= DataBlockHeaderImpl.getDataStartOffset()); + checkState(bytesReadOffset < blockSize); + + // once reach the end of entry buffer, read more, if there is more + if (bytesReadOffset < dataBlockFullOffset + && entriesByteBuf.isEmpty() + && startEntryId + blockEntryCount <= ledger.getLastAddConfirmed()) { + entriesByteBuf = readNextEntriesFromLedger(startEntryId + blockEntryCount, ENTRIES_PER_READ); + } + + if (!entriesByteBuf.isEmpty() && bytesReadOffset + entriesByteBuf.get(0).readableBytes() <= blockSize) { + // always read from the first ByteBuf in the list, once read all of its content remove it. 
+ ByteBuf entryByteBuf = entriesByteBuf.get(0); + int ret = entryByteBuf.readUnsignedByte(); + bytesReadOffset++; + + if (entryByteBuf.readableBytes() == 0) { + entryByteBuf.release(); + entriesByteBuf.remove(0); + blockEntryCount++; + } + + return ret; + } else { + // no space for a new entry or there are no more entries + // set data block full, return end padding + if (dataBlockFullOffset == blockSize) { + dataBlockFullOffset = bytesReadOffset; + } + return BLOCK_END_PADDING[(bytesReadOffset++ - dataBlockFullOffset) % BLOCK_END_PADDING.length]; + } + } + + /** + * Read a batch of entries from the ledger and wrap each one as [entry header][entry content]. + * The batch ends at the ledger's last add confirmed entry if that comes first. + * + * @param start the id of the first entry to read + * @param maxNumberEntries the maximum number of entries to read + * @return the wrapped entries, in ledger order + * @throws IOException if the ledger read fails or the thread is interrupted (the interrupt flag is restored) + */ + private List<ByteBuf> readNextEntriesFromLedger(long start, long maxNumberEntries) throws IOException { + long end = Math.min(start + maxNumberEntries - 1, ledger.getLastAddConfirmed()); + long startTime = System.nanoTime(); + try (LedgerEntries ledgerEntriesOnce = ledger.readAsync(start, end).get()) { + if (log.isDebugEnabled()) { + log.debug("read ledger entries. start: {}, end: {} cost {}", start, end, + TimeUnit.NANOSECONDS.toMicros(System.nanoTime() - startTime)); + } + if (offloaderStats != null && managedLedgerName != null) { + offloaderStats.recordReadLedgerLatency(topicName, System.nanoTime() - startTime, + TimeUnit.NANOSECONDS); + } + + List<ByteBuf> entries = Lists.newLinkedList(); + Iterator<LedgerEntry> iterator = ledgerEntriesOnce.iterator(); + while (iterator.hasNext()) { + LedgerEntry entry = iterator.next(); + ByteBuf buf = entry.getEntryBuffer().retain(); + int entryLength = buf.readableBytes(); + long entryId = entry.getEntryId(); + + CompositeByteBuf entryBuf = PulsarByteBufAllocator.DEFAULT.compositeBuffer(2); + ByteBuf entryHeaderBuf = PulsarByteBufAllocator.DEFAULT.buffer(ENTRY_HEADER_SIZE, ENTRY_HEADER_SIZE); + + entryHeaderBuf.writeInt(entryLength).writeLong(entryId); + entryBuf.addComponents(true, entryHeaderBuf, buf); + + entries.add(entryBuf); + } + return entries; + } catch (InterruptedException | ExecutionException e) { + log.error("Exception when get CompletableFuture. 
", e); + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new IOException(e); + } + } + + /** + * Read header bytes first, then entry bytes, then end padding, until {@code blockSize} bytes were produced. + * A single call may return fewer than {@code len} bytes even when more data is available. + */ + @Override + public int read(byte[] b, int off, int len) throws IOException { + if (b == null) { + throw new NullPointerException("The given bytes are null"); + } else if (off < 0 || len < 0 || len > b.length - off) { + throw new IndexOutOfBoundsException("off=" + off + ", len=" + len + ", b.length=" + b.length); + } else if (len == 0) { + return 0; + } + + int offset = off; + int readLen = len; + int readBytes = 0; + // reading header + if (dataBlockHeaderStream.available() > 0) { + int read = dataBlockHeaderStream.read(b, off, len); + offset += read; + readLen -= read; + readBytes += read; + bytesReadOffset += read; + } + if (readLen == 0) { + return readBytes; + } + + // reading ledger entries + if (bytesReadOffset < blockSize) { + readLen = Math.min(readLen, blockSize - bytesReadOffset); + ByteBuf readEntries = readEntries(readLen); + int read = readEntries.readableBytes(); + readEntries.readBytes(b, offset, read); + readEntries.release(); + readBytes += read; + return readBytes; + } + + // reached end + return -1; + } + + /** + * Read the next byte of the block: header first, then entries, then end padding. + * + * @return the next byte, or -1 once {@code blockSize} bytes were produced + */ + @Override + public int read() throws IOException { + // reading header + if (dataBlockHeaderStream.available() > 0) { + bytesReadOffset++; + return dataBlockHeaderStream.read(); + } + + // reading Ledger entries. + if (bytesReadOffset < blockSize) { + return readEntries(); + } + + // reached end + return -1; + } + + @Override + public void close() throws IOException { + // The close method will be triggered twice in the BlobStoreManagedLedgerOffloader#offload method: + // once by the try-with-resources block that owns the stream, + // and, as observed while debugging, once by writeBlobStore.uploadMultipartPart in the offload method. + // So the close flag is used to avoid releasing paddingBuf twice. 
+ if (close.compareAndSet(false, true)) { + super.close(); + dataBlockHeaderStream.close(); + if (!entriesByteBuf.isEmpty()) { + entriesByteBuf.forEach(buf -> buf.release()); + entriesByteBuf.clear(); + } + paddingBuf.clear(); + paddingBuf.release(); + } + } + + @Override + public ReadHandle getLedger() { + return ledger; + } + + @Override + public long getStartEntryId() { + return startEntryId; + } + + @Override + public int getBlockSize() { + return blockSize; + } + + /** + * @return the offset from which on only padding is produced; equals the block size until padding has started + */ + public int getDataBlockFullOffset() { + return dataBlockFullOffset; + } + + @Override + public int getBlockEntryCount() { + return blockEntryCount; + } + + @Override + public long getEndEntryId() { + // return -1 when no entry contained + if (blockEntryCount == 0) { + return -1; + } + return startEntryId + blockEntryCount - 1; + } + + @Override + public int getBlockEntryBytesCount() { + return dataBlockFullOffset - DataBlockHeaderImpl.getDataStartOffset() - ENTRY_HEADER_SIZE * blockEntryCount; + } + + /** + * @return the size of the data block header, i.e. the offset at which entry data starts + */ + public static long getHeaderSize() { + return DataBlockHeaderImpl.getDataStartOffset(); + } + + /** + * Calculate the block size after `entryBytesAlreadyWritten` bytes have been uploaded: + * the space needed for all remaining entries plus one block header, capped at {@code maxBlockSize}. + * + * @param maxBlockSize the upper bound for the block size + * @param readHandle the ledger being offloaded + * @param firstEntryToWrite the id of the first entry that has not been written yet + * @param entryBytesAlreadyWritten the number of entry payload bytes written so far + * @return the size to use for the next block + */ + public static int calculateBlockSize(int maxBlockSize, ReadHandle readHandle, + long firstEntryToWrite, long entryBytesAlreadyWritten) { + return (int) Math.min( + maxBlockSize, + (readHandle.getLastAddConfirmed() - firstEntryToWrite + 1) * ENTRY_HEADER_SIZE + + (readHandle.getLength() - entryBytesAlreadyWritten) + + DataBlockHeaderImpl.getDataStartOffset()); + } + +} + diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/BufferedOffloadStream.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/BufferedOffloadStream.java new file mode 100644 index 0000000000000..d4ff404d64942 --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/BufferedOffloadStream.java @@ -0,0 +1,132 @@ +/* + * Licensed to 
the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bookkeeper.mledger.offload.jcloud.impl; + +import io.netty.buffer.ByteBuf; +import io.netty.buffer.CompositeByteBuf; +import java.io.IOException; +import java.io.InputStream; +import java.util.LinkedList; +import java.util.List; +import lombok.extern.slf4j.Slf4j; +import org.apache.bookkeeper.mledger.Entry; +import org.apache.pulsar.common.allocator.PulsarByteBufAllocator; + +/** + * An input stream that produces one streaming-offload data block from a list of in-memory entries: + * block header + entries (each as [entry_size -- int][entry_id -- long][entry_data]) + end padding. + * All entries must belong to the ledger given at construction time. + */ +@Slf4j +public class BufferedOffloadStream extends InputStream { + static final int[] BLOCK_END_PADDING = BlockAwareSegmentInputStreamImpl.BLOCK_END_PADDING; + + private final long ledgerId; + private final long beginEntryId; + + /** + * @param blockSize the total number of bytes this stream produces, including header and padding + * @param entries the entries to write; the list is copied, and each entry is released once it has been wrapped + * @param ledgerId the ledger every entry must belong to + * @param beginEntryId the id of the first entry of this block + */ + public BufferedOffloadStream(int blockSize, List<Entry> entries, long ledgerId, long beginEntryId) { + this.ledgerId = ledgerId; + this.beginEntryId = beginEntryId; + this.endEntryId = beginEntryId; + this.blockSize = blockSize; + this.entryBuffer = new LinkedList<>(entries); + this.blockHead = StreamingDataBlockHeaderImpl.of(blockSize, ledgerId, beginEntryId) + .toStream(); + } + + /** + * @return the id of the last entry that reading has started on; the begin entry id before any entry was read + */ + public long getEndEntryId() { + return endEntryId; + } + + private volatile long endEntryId; + static final int ENTRY_HEADER_SIZE = 4 /* entry size */ + 8 /* entry id */; + private final long 
blockSize; + private final List<Entry> entryBuffer; + private final InputStream blockHead; + int offset = 0; + static final int NOT_INITIALIZED = -1; + int validDataOffset = NOT_INITIALIZED; + CompositeByteBuf currentEntry; + + public long getLedgerId() { + return ledgerId; + } + + public long getBeginEntryId() { + return beginEntryId; + } + + public long getBlockSize() { + return blockSize; + } + + /** + * Read the next byte of the block: header first, then the wrapped entries, then end padding. + * + * @return the next byte, or -1 once {@code blockSize} bytes were produced + * @throws RuntimeException if an entry belongs to a different ledger than this block + */ + @Override + public int read() throws IOException { + if (blockHead.available() > 0) { + offset++; + return blockHead.read(); + } + if (currentEntry != null) { + if (currentEntry.readableBytes() > 0) { + offset += 1; + return currentEntry.readUnsignedByte(); + } else { + currentEntry.release(); + currentEntry = null; + } + } + + if (blockSize <= offset) { + return -1; + } else if (validDataOffset != NOT_INITIALIZED) { + return BLOCK_END_PADDING[(offset++ - validDataOffset) % BLOCK_END_PADDING.length]; + } + + if (entryBuffer.isEmpty()) { + validDataOffset = offset; + return read(); + } + + Entry headEntry = entryBuffer.remove(0); + + if (headEntry.getLedgerId() != this.ledgerId) { + throw new RuntimeException( + String.format("there should not be multi ledger in a block %s %s", headEntry.getLedgerId(), + this.ledgerId)); + } + + final int entryLength = headEntry.getLength(); + final long entryId = headEntry.getEntryId(); + CompositeByteBuf entryBuf = PulsarByteBufAllocator.DEFAULT.compositeBuffer(2); + ByteBuf entryHeaderBuf = PulsarByteBufAllocator.DEFAULT.buffer(ENTRY_HEADER_SIZE, ENTRY_HEADER_SIZE); + entryHeaderBuf.writeInt(entryLength).writeLong(entryId); + entryBuf.addComponents(true, entryHeaderBuf, headEntry.getDataBuffer().retain()); + endEntryId = headEntry.getEntryId(); + headEntry.release(); + currentEntry = entryBuf; + return read(); + } + + /** + * Close the header stream and release the entry buffer that is currently being read, if any. + * read() only releases a drained entry on the call after its last byte, so a consumer that stops + * after exactly {@code blockSize} bytes, or closes early, would otherwise leak that buffer. + */ + @Override + public void close() throws IOException { + if (currentEntry != null) { + currentEntry.release(); + // Clear the reference so a second close() or a later read() cannot release it again. + currentEntry = null; + } + // NOTE(review): entries still queued in entryBuffer are not released here; whether this stream + // or its caller owns them after an early close is not visible from this class - confirm. + blockHead.close(); + } + + /** + * Calculate the size of a streaming block: the space needed for the given entries plus the block + * header, but never less than {@code streamingBlockSize}. + * + * @param streamingBlockSize the minimum block size + * @param entryCount the number of entries in the block + * @param entrySize the total payload size of those entries in bytes + * @return the block size to use + */ + public static int calculateBlockSize(int streamingBlockSize, int entryCount, int entrySize) { + int validDataSize = (entryCount * 
ENTRY_HEADER_SIZE + + entrySize + + StreamingDataBlockHeaderImpl.getDataStartOffset()); + return Math.max(streamingBlockSize, validDataSize); + } +} diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/DataBlockHeaderImpl.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/DataBlockHeaderImpl.java new file mode 100644 index 0000000000000..d7631d01dd199 --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/DataBlockHeaderImpl.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bookkeeper.mledger.offload.jcloud.impl; + +import com.google.common.io.CountingInputStream; +import io.netty.buffer.ByteBuf; +import io.netty.buffer.ByteBufInputStream; +import java.io.DataInputStream; +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; +import org.apache.bookkeeper.mledger.offload.jcloud.DataBlockHeader; +import org.apache.pulsar.common.allocator.PulsarByteBufAllocator; + +/** + * The data block header in tiered storage for each data block.
+ */ +public class DataBlockHeaderImpl implements DataBlockHeader { + // Magic Word for data block. + // It is a sequence of bytes used to identify the start of a block. + static final int MAGIC_WORD = 0xFBDBABCB; + // This is bigger than header size. Leaving some place for alignment and future enhancement. + // Payload use this as the start offset. + private static final int HEADER_MAX_SIZE = 128; + private static final int HEADER_BYTES_USED = 4 /* magic */ + + 8 /* header len */ + + 8 /* block len */ + + 8 /* first entry id */; + private static final byte[] PADDING = new byte[HEADER_MAX_SIZE - HEADER_BYTES_USED]; + + public static DataBlockHeaderImpl of(int blockLength, long firstEntryId) { + return new DataBlockHeaderImpl(HEADER_MAX_SIZE, blockLength, firstEntryId); + } + + // Construct DataBlockHeader from InputStream, which contains `HEADER_MAX_SIZE` bytes readable. + public static DataBlockHeader fromStream(InputStream stream) throws IOException { + CountingInputStream countingStream = new CountingInputStream(stream); + DataInputStream dis = new DataInputStream(countingStream); + int magic = dis.readInt(); + if (magic != MAGIC_WORD) { + throw new IOException("Data block header magic word not match. 
read: " + magic + + " expected: " + MAGIC_WORD); + } + + long headerLen = dis.readLong(); + long blockLen = dis.readLong(); + long firstEntryId = dis.readLong(); + long toSkip = headerLen - countingStream.getCount(); + if (dis.skip(toSkip) != toSkip) { + throw new EOFException("Header was too small"); + } + + return new DataBlockHeaderImpl(headerLen, blockLen, firstEntryId); + } + + private final long headerLength; + private final long blockLength; + private final long firstEntryId; + + public static int getBlockMagicWord() { + return MAGIC_WORD; + } + + public static int getDataStartOffset() { + return HEADER_MAX_SIZE; + } + + @Override + public long getBlockLength() { + return this.blockLength; + } + + @Override + public long getHeaderLength() { + return this.headerLength; + } + + @Override + public long getFirstEntryId() { + return this.firstEntryId; + } + + public DataBlockHeaderImpl(long headerLength, long blockLength, long firstEntryId) { + this.headerLength = headerLength; + this.blockLength = blockLength; + this.firstEntryId = firstEntryId; + } + + /** + * Get the content of the data block header as InputStream. 
+ * Read out in format: + * [ magic_word -- int ][ block_len -- int ][ first_entry_id -- long] [padding zeros] + */ + @Override + public InputStream toStream() { + ByteBuf out = PulsarByteBufAllocator.DEFAULT.buffer(HEADER_MAX_SIZE, HEADER_MAX_SIZE); + out.writeInt(MAGIC_WORD) + .writeLong(headerLength) + .writeLong(blockLength) + .writeLong(firstEntryId) + .writeBytes(PADDING); + + // true means the input stream will release the ByteBuf on close + return new ByteBufInputStream(out, true); + } + + @Override + public String toString() { + return String.format("DataBlockHeader(len:%d,hlen:%d,firstEntry:%d)", + blockLength, headerLength, firstEntryId); + } +} + diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/OffloadIndexBlockImpl.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/OffloadIndexBlockImpl.java new file mode 100644 index 0000000000000..ef7c84e917268 --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/OffloadIndexBlockImpl.java @@ -0,0 +1,362 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.bookkeeper.mledger.offload.jcloud.impl; + +import static com.google.common.base.Preconditions.checkState; +import static org.apache.bookkeeper.mledger.offload.OffloadUtils.buildLedgerMetadataFormat; +import com.google.common.collect.Maps; +import io.netty.buffer.ByteBuf; +import io.netty.buffer.ByteBufInputStream; +import io.netty.util.Recycler; +import io.netty.util.Recycler.Handle; +import java.io.DataInputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.TreeMap; +import org.apache.bookkeeper.client.api.DigestType; +import org.apache.bookkeeper.client.api.LedgerMetadata; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexBlock; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexEntry; +import org.apache.bookkeeper.net.BookieId; +import org.apache.bookkeeper.proto.DataFormats; +import org.apache.bookkeeper.proto.DataFormats.LedgerMetadataFormat; +import org.apache.pulsar.common.allocator.PulsarByteBufAllocator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class OffloadIndexBlockImpl implements OffloadIndexBlock { + private static final Logger log = LoggerFactory.getLogger(OffloadIndexBlockImpl.class); + + private static final int INDEX_MAGIC_WORD = 0xDE47DE47; + + private LedgerMetadata segmentMetadata; + private long dataObjectLength; + private long dataHeaderLength; + private TreeMap indexEntries; + + private final Handle recyclerHandle; + + private static final Recycler RECYCLER = new Recycler() { + @Override + protected OffloadIndexBlockImpl newObject(Recycler.Handle handle) { + return new OffloadIndexBlockImpl(handle); + } + }; + + private OffloadIndexBlockImpl(Handle recyclerHandle) { + this.recyclerHandle = recyclerHandle; + } + + public static OffloadIndexBlockImpl get(LedgerMetadata metadata, long dataObjectLength, + long dataHeaderLength, + List entries) { + 
OffloadIndexBlockImpl block = RECYCLER.get(); + block.indexEntries = Maps.newTreeMap(); + entries.forEach(entry -> block.indexEntries.putIfAbsent(entry.getEntryId(), entry)); + checkState(entries.size() == block.indexEntries.size()); + block.segmentMetadata = metadata; + block.dataObjectLength = dataObjectLength; + block.dataHeaderLength = dataHeaderLength; + return block; + } + + public static OffloadIndexBlockImpl get(int magic, DataInputStream stream) throws IOException { + if (magic != INDEX_MAGIC_WORD) { + throw new IOException(String.format("Invalid MagicWord. read: 0x%x expected: 0x%x", + magic, INDEX_MAGIC_WORD)); + } + OffloadIndexBlockImpl block = RECYCLER.get(); + block.indexEntries = Maps.newTreeMap(); + block.fromStream(stream); + return block; + } + + public void recycle() { + dataObjectLength = -1; + dataHeaderLength = -1; + segmentMetadata = null; + indexEntries.clear(); + indexEntries = null; + if (recyclerHandle != null) { + recyclerHandle.recycle(this); + } + } + + @Override + public OffloadIndexEntry getIndexEntryForEntry(long messageEntryId) throws IOException { + if (messageEntryId > segmentMetadata.getLastEntryId()) { + log.warn("Try to get entry: {}, which beyond lastEntryId {}, return null", + messageEntryId, segmentMetadata.getLastEntryId()); + throw new IndexOutOfBoundsException("Entry index: " + messageEntryId + + " beyond lastEntryId: " + segmentMetadata.getLastEntryId()); + } + // find the greatest mapping Id whose entryId <= messageEntryId + return this.indexEntries.floorEntry(messageEntryId).getValue(); + } + + @Override + public int getEntryCount() { + return this.indexEntries.size(); + } + + @Override + public LedgerMetadata getLedgerMetadata() { + return this.segmentMetadata; + } + + @Override + public long getDataObjectLength() { + return this.dataObjectLength; + } + + @Override + public long getDataBlockHeaderLength() { + return this.dataHeaderLength; + } + + /** + * Get the content of the index block as InputStream. 
+ * Read out in format: + * | index_magic_header | index_block_len | data_object_len | data_header_len | + * | index_entry_count | segment_metadata_len | segment metadata | index entries... | + */ + @Override + public OffloadIndexBlock.IndexInputStream toStream() throws IOException { + int indexEntryCount = this.indexEntries.size(); + byte[] ledgerMetadataByte = buildLedgerMetadataFormat(this.segmentMetadata); + int segmentMetadataLength = ledgerMetadataByte.length; + + int indexBlockLength = 4 /* magic header */ + + 4 /* index block length */ + + 8 /* data object length */ + + 8 /* data header length */ + + 4 /* index entry count */ + + 4 /* segment metadata length */ + + segmentMetadataLength + + indexEntryCount * (8 + 4 + 8); /* messageEntryId + blockPartId + blockOffset */ + + ByteBuf out = PulsarByteBufAllocator.DEFAULT.buffer(indexBlockLength, indexBlockLength); + + out.writeInt(INDEX_MAGIC_WORD) + .writeInt(indexBlockLength) + .writeLong(dataObjectLength) + .writeLong(dataHeaderLength) + .writeInt(indexEntryCount) + .writeInt(segmentMetadataLength); + // write metadata + out.writeBytes(ledgerMetadataByte); + + // write entries + this.indexEntries.entrySet().forEach(entry -> + out.writeLong(entry.getValue().getEntryId()) + .writeInt(entry.getValue().getPartId()) + .writeLong(entry.getValue().getOffset())); + + return new OffloadIndexBlock.IndexInputStream(new ByteBufInputStream(out, true), indexBlockLength); + } + + private static class InternalLedgerMetadata implements LedgerMetadata { + + private int ensembleSize; + private int writeQuorumSize; + private int ackQuorumSize; + private long lastEntryId; + private long length; + private DataFormats.LedgerMetadataFormat.DigestType digestType; + private long ctime; + private State state; + private Map customMetadata = Maps.newHashMap(); + private TreeMap> ensembles = + new TreeMap<>(); + + InternalLedgerMetadata(LedgerMetadataFormat ledgerMetadataFormat) { + this.ensembleSize = 
ledgerMetadataFormat.getEnsembleSize(); + this.writeQuorumSize = ledgerMetadataFormat.getQuorumSize(); + this.ackQuorumSize = ledgerMetadataFormat.getAckQuorumSize(); + this.lastEntryId = ledgerMetadataFormat.getLastEntryId(); + this.length = ledgerMetadataFormat.getLength(); + this.digestType = ledgerMetadataFormat.getDigestType(); + this.ctime = ledgerMetadataFormat.getCtime(); + this.state = org.apache.bookkeeper.client.api.LedgerMetadata.State.valueOf( + ledgerMetadataFormat.getState().toString()); + + if (ledgerMetadataFormat.getCustomMetadataCount() > 0) { + ledgerMetadataFormat.getCustomMetadataList().forEach( + entry -> this.customMetadata.put(entry.getKey(), entry.getValue().toByteArray())); + } + + ledgerMetadataFormat.getSegmentList().forEach(segment -> { + ArrayList addressArrayList = new ArrayList<>(); + segment.getEnsembleMemberList().forEach(address -> { + try { + addressArrayList.add(BookieId.parse(address)); + } catch (IllegalArgumentException e) { + log.error("Exception when create BookieSocketAddress. 
", e); + } + }); + this.ensembles.put(segment.getFirstEntryId(), addressArrayList); + }); + } + + @Override + public long getLedgerId() { + throw new UnsupportedOperationException(); + } + + @Override + public int getEnsembleSize() { + return this.ensembleSize; + } + + @Override + public int getWriteQuorumSize() { + return this.writeQuorumSize; + } + + @Override + public int getAckQuorumSize() { + return this.ackQuorumSize; + } + + @Override + public long getLastEntryId() { + return this.lastEntryId; + } + + @Override + public long getLength() { + return this.length; + } + + @Override + public DigestType getDigestType() { + switch (this.digestType) { + case HMAC: + return DigestType.MAC; + case CRC32: + return DigestType.CRC32; + case CRC32C: + return DigestType.CRC32C; + case DUMMY: + return DigestType.DUMMY; + default: + throw new IllegalArgumentException("Unable to convert digest type " + digestType); + } + } + + @Override + public long getCtime() { + return this.ctime; + } + + @Override + public boolean isClosed() { + return this.state == State.CLOSED; + } + + @Override + public Map getCustomMetadata() { + return this.customMetadata; + } + + @Override + public List getEnsembleAt(long entryId) { + return ensembles.get(ensembles.headMap(entryId + 1).lastKey()); + } + + @Override + public NavigableMap> getAllEnsembles() { + return this.ensembles; + } + + @Override + public long getCToken() { + // TODO Auto-generated method stub + return 0; + } + + @Override + public int getMetadataFormatVersion() { + // TODO Auto-generated method stub + return 0; + } + + @Override + public byte[] getPassword() { + // TODO Auto-generated method stub + return null; + } + + @Override + public State getState() { + return this.state; + } + + @Override + public boolean hasPassword() { + // TODO Auto-generated method stub + return false; + } + + @Override + public String toSafeString() { + // TODO Auto-generated method stub + return null; + } + } + + private static LedgerMetadata 
parseLedgerMetadata(byte[] bytes) throws IOException { + LedgerMetadataFormat.Builder builder = LedgerMetadataFormat.newBuilder(); + builder.mergeFrom(bytes); + return new InternalLedgerMetadata(builder.build()); + } + + private OffloadIndexBlock fromStream(DataInputStream dis) throws IOException { + dis.readInt(); // no used index block length + this.dataObjectLength = dis.readLong(); + this.dataHeaderLength = dis.readLong(); + int indexEntryCount = dis.readInt(); + int segmentMetadataLength = dis.readInt(); + + byte[] metadataBytes = new byte[segmentMetadataLength]; + dis.readFully(metadataBytes); + this.segmentMetadata = parseLedgerMetadata(metadataBytes); + + for (int i = 0; i < indexEntryCount; i++) { + long entryId = dis.readLong(); + this.indexEntries.putIfAbsent(entryId, OffloadIndexEntryImpl.of(entryId, dis.readInt(), + dis.readLong(), dataHeaderLength)); + } + return this; + } + + public static int getIndexMagicWord() { + return INDEX_MAGIC_WORD; + } + + @Override + public void close() { + recycle(); + } + +} + diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/OffloadIndexBlockV2BuilderImpl.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/OffloadIndexBlockV2BuilderImpl.java new file mode 100644 index 0000000000000..d5761fa0e4f72 --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/OffloadIndexBlockV2BuilderImpl.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bookkeeper.mledger.offload.jcloud.impl; + +import static com.google.common.base.Preconditions.checkState; +import com.google.common.collect.Lists; +import java.io.DataInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; +import org.apache.bookkeeper.client.api.LedgerMetadata; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexBlock; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexBlockBuilder; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexBlockV2; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexBlockV2Builder; +import org.apache.bookkeeper.mledger.proto.MLDataFormats.ManagedLedgerInfo.LedgerInfo; + +/** + * Interface for builder of index block used for offload a ledger to long term storage. 
+ */ +public class OffloadIndexBlockV2BuilderImpl implements OffloadIndexBlockBuilder, OffloadIndexBlockV2Builder { + + private final Map ledgerMetadataMap; + private LedgerMetadata ledgerMetadata; + private long dataObjectLength; + private long dataHeaderLength; + private List entries; + private int lastBlockSize; + private int lastStreamingBlockSize; + private long streamingOffset = 0; + private final SortedMap> entryMap = new TreeMap<>(); + + + public OffloadIndexBlockV2BuilderImpl() { + this.entries = Lists.newArrayList(); + this.ledgerMetadataMap = new HashMap<>(); + } + + @Override + public OffloadIndexBlockV2BuilderImpl withDataObjectLength(long dataObjectLength) { + this.dataObjectLength = dataObjectLength; + return this; + } + + @Override + public OffloadIndexBlockV2BuilderImpl withDataBlockHeaderLength(long dataHeaderLength) { + this.dataHeaderLength = dataHeaderLength; + return this; + } + + @Override + public OffloadIndexBlockV2BuilderImpl withLedgerMetadata(LedgerMetadata metadata) { + this.ledgerMetadata = metadata; + return this; + } + + @Override + public OffloadIndexBlockV2BuilderImpl addLedgerMeta(Long ledgerId, LedgerInfo metadata) { + this.ledgerMetadataMap.put(ledgerId, metadata); + return this; + } + + @Override + public OffloadIndexBlockBuilder addBlock(long firstEntryId, int partId, int blockSize) { + checkState(dataHeaderLength > 0); + + // we should added one by one. 
+ long offset; + if (firstEntryId == 0) { + checkState(entries.size() == 0); + offset = 0; + } else { + checkState(entries.size() > 0); + offset = entries.get(entries.size() - 1).getOffset() + lastBlockSize; + } + lastBlockSize = blockSize; + + this.entries.add(OffloadIndexEntryImpl.of(firstEntryId, partId, offset, dataHeaderLength)); + return this; + } + + @Override + public OffloadIndexBlockV2Builder addBlock(long ledgerId, long firstEntryId, int partId, int blockSize) { + checkState(dataHeaderLength > 0); + + streamingOffset = streamingOffset + lastStreamingBlockSize; + lastStreamingBlockSize = blockSize; + + final List list = entryMap.getOrDefault(ledgerId, new LinkedList<>()); + list.add(OffloadIndexEntryImpl.of(firstEntryId, partId, streamingOffset, dataHeaderLength)); + entryMap.put(ledgerId, list); + return this; + } + + @Override + public OffloadIndexBlockV2 fromStream(InputStream is) throws IOException { + final DataInputStream dataInputStream = new DataInputStream(is); + final int magic = dataInputStream.readInt(); + if (magic == OffloadIndexBlockImpl.getIndexMagicWord()) { + return OffloadIndexBlockImpl.get(magic, dataInputStream); + } else if (magic == OffloadIndexBlockV2Impl.getIndexMagicWord()) { + return OffloadIndexBlockV2Impl.get(magic, dataInputStream); + } else { + throw new IOException(String.format("Invalid MagicWord. 
read: 0x%x expected: 0x%x or 0x%x", + magic, OffloadIndexBlockImpl.getIndexMagicWord(), + OffloadIndexBlockV2Impl.getIndexMagicWord())); + } + } + + @Override + public OffloadIndexBlock build() { + checkState(ledgerMetadata != null); + checkState(!entries.isEmpty()); + checkState(dataObjectLength > 0); + checkState(dataHeaderLength > 0); + return OffloadIndexBlockImpl.get(ledgerMetadata, dataObjectLength, dataHeaderLength, entries); + } + + @Override + public OffloadIndexBlockV2 buildV2() { + checkState(!ledgerMetadataMap.isEmpty()); + checkState(true); + checkState(!entryMap.isEmpty()); + checkState(dataObjectLength > 0); + checkState(dataHeaderLength > 0); + return OffloadIndexBlockV2Impl.get(ledgerMetadataMap, dataObjectLength, dataHeaderLength, entryMap); + } + +} diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/OffloadIndexBlockV2Impl.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/OffloadIndexBlockV2Impl.java new file mode 100644 index 0000000000000..93ae53abce392 --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/OffloadIndexBlockV2Impl.java @@ -0,0 +1,379 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bookkeeper.mledger.offload.jcloud.impl; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Maps; +import io.netty.buffer.ByteBuf; +import io.netty.buffer.ByteBufInputStream; +import io.netty.util.Recycler; +import io.netty.util.Recycler.Handle; +import java.io.DataInputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Objects; +import java.util.TreeMap; +import org.apache.bookkeeper.client.api.DigestType; +import org.apache.bookkeeper.client.api.LedgerMetadata; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexBlock.IndexInputStream; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexBlockV2; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexEntry; +import org.apache.bookkeeper.mledger.proto.MLDataFormats.ManagedLedgerInfo.LedgerInfo; +import org.apache.bookkeeper.net.BookieId; +import org.apache.pulsar.common.allocator.PulsarByteBufAllocator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class OffloadIndexBlockV2Impl implements OffloadIndexBlockV2 { + private static final Logger log = LoggerFactory.getLogger(OffloadIndexBlockImpl.class); + + private static final int INDEX_MAGIC_WORD = 0x3D1FB0BC; + + private Map segmentMetadata; + private final Map compatibleMetadata = Maps.newTreeMap(); + private long dataObjectLength; + private long dataHeaderLength; + // private TreeMap indexEntries; + private Map> indexEntries; + + + private final Handle recyclerHandle; + + private static final Recycler RECYCLER = new Recycler() { + @Override + protected OffloadIndexBlockV2Impl newObject(Handle handle) { + return new OffloadIndexBlockV2Impl(handle); + } + }; + + private OffloadIndexBlockV2Impl(Handle recyclerHandle) { + 
this.recyclerHandle = recyclerHandle; + } + + public static OffloadIndexBlockV2Impl get(Map metadata, long dataObjectLength, + long dataHeaderLength, + Map> entries) { + OffloadIndexBlockV2Impl block = RECYCLER.get(); + block.indexEntries = new HashMap<>(); + entries.forEach((ledgerId, list) -> { + final TreeMap inLedger = block.indexEntries + .getOrDefault(ledgerId, new TreeMap<>()); + list.forEach(indexEntry -> { + inLedger.put(indexEntry.getEntryId(), indexEntry); + }); + block.indexEntries.put(ledgerId, inLedger); + }); + + block.segmentMetadata = metadata; + block.dataObjectLength = dataObjectLength; + block.dataHeaderLength = dataHeaderLength; + return block; + } + + public static OffloadIndexBlockV2Impl get(int magic, DataInputStream stream) throws IOException { + OffloadIndexBlockV2Impl block = RECYCLER.get(); + block.indexEntries = Maps.newTreeMap(); + block.segmentMetadata = Maps.newTreeMap(); + if (magic != INDEX_MAGIC_WORD) { + throw new IOException(String.format("Invalid MagicWord. 
read: 0x%x expected: 0x%x", + magic, INDEX_MAGIC_WORD)); + } + block.fromStream(stream); + return block; + } + + public void recycle() { + dataObjectLength = -1; + dataHeaderLength = -1; + segmentMetadata = null; + indexEntries.clear(); + indexEntries = null; + if (recyclerHandle != null) { + recyclerHandle.recycle(this); + } + } + + @Override + public OffloadIndexEntry getIndexEntryForEntry(long ledgerId, long messageEntryId) throws IOException { + if (messageEntryId > getLedgerMetadata(ledgerId).getLastEntryId()) { + log.warn("Try to get entry: {}, which beyond lastEntryId {}, return null", + messageEntryId, getLedgerMetadata(ledgerId).getLastEntryId()); + throw new IndexOutOfBoundsException("Entry index: " + messageEntryId + + " beyond lastEntryId: " + getLedgerMetadata(ledgerId).getLastEntryId()); + } + // find the greatest mapping Id whose entryId <= messageEntryId + return this.indexEntries.get(ledgerId).floorEntry(messageEntryId).getValue(); + } + + public long getStartEntryId(long ledgerId) { + return this.indexEntries.get(ledgerId).firstEntry().getValue().getEntryId(); + } + + @Override + public int getEntryCount() { + int ans = 0; + for (TreeMap v : this.indexEntries.values()) { + ans += v.size(); + } + + return ans; + } + + @Override + public LedgerMetadata getLedgerMetadata(long ledgerId) { + if (compatibleMetadata.containsKey(ledgerId)) { + return compatibleMetadata.get(ledgerId); + } else if (segmentMetadata.containsKey(ledgerId)) { + final CompatibleMetadata result = new CompatibleMetadata(segmentMetadata.get(ledgerId)); + compatibleMetadata.put(ledgerId, result); + return result; + } else { + return null; + } + } + + @Override + public long getDataObjectLength() { + return this.dataObjectLength; + } + + @Override + public long getDataBlockHeaderLength() { + return this.dataHeaderLength; + } + + /** + * Get the content of the index block as InputStream. 
+ * Read out in format: + * | index_magic_header | index_block_len | data_object_len | data_header_len | + * | index_entry_count | segment_metadata_len | segment metadata | index entries... | + */ + @Override + public IndexInputStream toStream() throws IOException { + + int indexBlockLength = 4 /* magic header */ + + 4 /* index block length */ + + 8 /* data object length */ + + 8; /* data header length */ + + Map metaBytesMap = new HashMap<>(); + for (Map.Entry> e : this.indexEntries.entrySet()) { + Long ledgerId = e.getKey(); + TreeMap ledgerIndexEntries = e.getValue(); + int indexEntryCount = ledgerIndexEntries.size(); + byte[] ledgerMetadataByte = this.segmentMetadata.get(ledgerId).toByteArray(); + int segmentMetadataLength = ledgerMetadataByte.length; + indexBlockLength += 8 /* ledger id length */ + + 4 /* index entry count */ + + 4 /* segment metadata length */ + + segmentMetadataLength + + indexEntryCount * (8 + 4 + 8); + metaBytesMap.put(ledgerId, ledgerMetadataByte); + } + + ByteBuf out = PulsarByteBufAllocator.DEFAULT.buffer(indexBlockLength, indexBlockLength); + + out.writeInt(INDEX_MAGIC_WORD) + .writeInt(indexBlockLength) + .writeLong(dataObjectLength) + .writeLong(dataHeaderLength); + + for (Map.Entry> e : this.indexEntries.entrySet()) { + Long ledgerId = e.getKey(); + TreeMap ledgerIndexEntries = e.getValue(); + int indexEntryCount = ledgerIndexEntries.size(); + byte[] ledgerMetadataByte = metaBytesMap.get(ledgerId); + out.writeLong(ledgerId) + .writeInt(indexEntryCount) + .writeInt(ledgerMetadataByte.length) + .writeBytes(ledgerMetadataByte); + ledgerIndexEntries.values().forEach(idxEntry -> { + out.writeLong(idxEntry.getEntryId()) + .writeInt(idxEntry.getPartId()) + .writeLong(idxEntry.getOffset()); + }); + } + + return new IndexInputStream(new ByteBufInputStream(out, true), indexBlockLength); + } + + private static LedgerInfo parseLedgerInfo(byte[] bytes) throws IOException { + return LedgerInfo.newBuilder().mergeFrom(bytes).build(); + } + + 
private OffloadIndexBlockV2 fromStream(DataInputStream dis) throws IOException {
+        // Fills this block from the serialized form written by toStream(); the magic word has
+        // already been consumed and checked by the caller.
+
+        dis.readInt(); // no used index block length
+        this.dataObjectLength = dis.readLong();
+        this.dataHeaderLength = dis.readLong();
+        while (dis.available() > 0) {
+            long ledgerId = dis.readLong();
+            int indexEntryCount = dis.readInt();
+            int segmentMetadataLength = dis.readInt();
+            if (segmentMetadataLength < 0) {
+                // A negative length can only come from corrupted data; fail as an I/O error
+                // instead of a NegativeArraySizeException.
+                throw new IOException("Invalid ledgerMetadata length " + segmentMetadataLength
+                        + " for ledger " + ledgerId);
+            }
+
+            byte[] metadataBytes = new byte[segmentMetadataLength];
+            try {
+                // read(byte[]) may legally return fewer bytes than requested; readFully keeps reading
+                // until the array is full and only fails when the stream really ends early.
+                dis.readFully(metadataBytes);
+            } catch (java.io.EOFException e) {
+                log.error("Read ledgerMetadata from bytes failed");
+                throw new IOException("Read ledgerMetadata from bytes failed", e);
+            }
+            final LedgerInfo ledgerInfo = parseLedgerInfo(metadataBytes);
+            this.segmentMetadata.put(ledgerId, ledgerInfo);
+            final TreeMap<Long, OffloadIndexEntryImpl> indexEntries = new TreeMap<>();
+
+            for (int i = 0; i < indexEntryCount; i++) {
+                long entryId = dis.readLong();
+                // Argument order matters: part id (int) is read before the offset (long).
+                indexEntries.putIfAbsent(entryId, OffloadIndexEntryImpl.of(entryId, dis.readInt(),
+                        dis.readLong(), dataHeaderLength));
+            }
+            this.indexEntries.put(ledgerId, indexEntries);
+        }
+
+        return this;
+    }
+
+    public static int getIndexMagicWord() {
+        return INDEX_MAGIC_WORD;
+    }
+
+    /** Closing the block recycles it. */
+    @Override
+    public void close() {
+        recycle();
+    }
+
+    /**
+     * Presents a {@link LedgerInfo} through BookKeeper's {@link LedgerMetadata} interface.
+     *
+     * <p>Only the ledger id, last entry id and length are taken from the wrapped {@code LedgerInfo};
+     * the ledger is always reported as closed and every other getter returns a fixed placeholder
+     * ({@code 0}, {@code false}, {@code null} or an empty array).
+     */
+    @VisibleForTesting
+    static class CompatibleMetadata implements LedgerMetadata {
+        LedgerInfo ledgerInfo;
+
+        public CompatibleMetadata(LedgerInfo ledgerInfo) {
+            this.ledgerInfo = ledgerInfo;
+        }
+
+        @Override
+        public long getLedgerId() {
+            return ledgerInfo.getLedgerId();
+        }
+
+        @Override
+        public int getEnsembleSize() {
+            return 0;
+        }
+
+        @Override
+        public int getWriteQuorumSize() {
+            return 0;
+        }
+
+        @Override
+        public int getAckQuorumSize() {
+            return 0;
+        }
+
+        @Override
+        public long getLastEntryId() {
+            // entry ids are zero-based, so the last id is the entry count minus one
+            return ledgerInfo.getEntries() - 1;
+        }
+
+        @Override
+        public long getLength() {
+            return ledgerInfo.getSize();
+        }
+
+        @Override
+        public boolean hasPassword() {
+            return false;
+        }
+
+        @Override
+        public byte[] getPassword() {
+            return new byte[0];
+        }
+
+        @Override
+        public DigestType getDigestType() {
+            return null;
+        }
+
+        @Override
+        public long getCtime() {
+            return 0;
+        }
+
+        @Override
+        public boolean isClosed() {
+            return true;
+        }
+
+        @Override
+        public Map<String, byte[]> getCustomMetadata() {
+            return null;
+        }
+
+        @Override
+        public List<BookieId> getEnsembleAt(long entryId) {
+            return null;
+        }
+
+        @Override
+        public NavigableMap<Long, ? extends List<BookieId>> getAllEnsembles() {
+            return null;
+        }
+
+        @Override
+        public State getState() {
+            return null;
+        }
+
+        @Override
+        public String toSafeString() {
+            return null;
+        }
+
+        @Override
+        public int getMetadataFormatVersion() {
+            return 0;
+        }
+
+        @Override
+        public long getCToken() {
+            return 0;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) {
+                return true;
+            }
+            if (o == null || getClass() != o.getClass()) {
+                return false;
+            }
+            CompatibleMetadata that = (CompatibleMetadata) o;
+            return ledgerInfo.equals(that.ledgerInfo);
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(ledgerInfo);
+        }
+    }
+}
+
diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/OffloadIndexEntryImpl.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/OffloadIndexEntryImpl.java
new file mode 100644
index 0000000000000..2faffa7e25c12
--- /dev/null
+++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/OffloadIndexEntryImpl.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.bookkeeper.mledger.offload.jcloud.impl;
+
+import java.util.Objects;
+import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexEntry;
+
+/**
+ * Immutable index entry of an OffloadIndexBlock: an entry id together with the part and the
+ * offset at which the data block holding that entry starts.
+ */
+public class OffloadIndexEntryImpl implements OffloadIndexEntry {
+    private final long entryId;
+    private final int partId;
+    private final long offset;
+    private final long blockHeaderSize;
+
+    private OffloadIndexEntryImpl(long entryId, int partId, long offset, long blockHeaderSize) {
+        this.entryId = entryId;
+        this.partId = partId;
+        this.offset = offset;
+        this.blockHeaderSize = blockHeaderSize;
+    }
+
+    /** Static factory; the only way to create an entry. */
+    public static OffloadIndexEntryImpl of(long entryId, int partId, long offset, long blockHeaderSize) {
+        return new OffloadIndexEntryImpl(entryId, partId, offset, blockHeaderSize);
+    }
+
+    @Override
+    public long getEntryId() {
+        return entryId;
+    }
+
+    @Override
+    public int getPartId() {
+        return partId;
+    }
+
+    @Override
+    public long getOffset() {
+        return offset;
+    }
+
+    /** Offset of the payload, i.e. the block offset advanced past the block header. */
+    @Override
+    public long getDataOffset() {
+        return offset + blockHeaderSize;
+    }
+
+    @Override
+    public String toString() {
+        return String.format("[eid:%d, part:%d, offset:%d, doffset:%d]",
+                entryId, partId, offset, getDataOffset());
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        return o instanceof OffloadIndexEntryImpl other
+                && entryId == other.entryId
+                && partId == other.partId
+                && offset == other.offset
+                && blockHeaderSize == other.blockHeaderSize;
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(entryId, partId, offset, blockHeaderSize);
+    }
+}
+
diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/OffsetsCache.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/OffsetsCache.java
new file mode 100644
index 0000000000000..6651b199e4e60
--- /dev/null
+++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/OffsetsCache.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.bookkeeper.mledger.offload.jcloud.impl;
+
+import com.google.common.cache.Cache;
+import com.google.common.cache.CacheBuilder;
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Cache of entry positions keyed by (ledger id, entry id).
+ *
+ * <p>Entries expire after a period without access and the cache is size-bounded; a single
+ * scheduled thread periodically triggers the cache clean-up. A non-positive maximum size
+ * disables the cache entirely, in which case every operation is a no-op.
+ */
+public class OffsetsCache implements AutoCloseable {
+    // limit the cache size to avoid OOM
+    // 1 million entries consumes about 60MB of heap space
+    private static final int CACHE_MAX_SIZE =
+            Integer.getInteger("pulsar.jclouds.readhandleimpl.offsetsscache.max.size", 1_000_000);
+    private static final int CACHE_TTL_SECONDS =
+            Integer.getInteger("pulsar.jclouds.readhandleimpl.offsetsscache.ttl.seconds", 5 * 60);
+
+    record Key(long ledgerId, long entryId) {
+
+    }
+
+    private final Cache<Key, Long> entryOffsetsCache;
+    private final ScheduledExecutorService cacheEvictionExecutor;
+
+    public OffsetsCache() {
+        if (CACHE_MAX_SIZE <= 0) {
+            // caching disabled: no cache and no eviction thread
+            entryOffsetsCache = null;
+            cacheEvictionExecutor = null;
+            return;
+        }
+        final Cache<Key, Long> cache = CacheBuilder
+                .newBuilder()
+                .expireAfterAccess(CACHE_TTL_SECONDS, TimeUnit.SECONDS)
+                .maximumSize(CACHE_MAX_SIZE)
+                .build();
+        final ScheduledExecutorService evictor = Executors.newSingleThreadScheduledExecutor(
+                new ThreadFactoryBuilder().setNameFormat("jcloud-offsets-cache-eviction").build());
+        // run the clean-up twice per TTL, but never more often than once a second
+        final int period = Math.max(CACHE_TTL_SECONDS / 2, 1);
+        evictor.scheduleAtFixedRate(cache::cleanUp, period, period, TimeUnit.SECONDS);
+        entryOffsetsCache = cache;
+        cacheEvictionExecutor = evictor;
+    }
+
+    /** Remembers the position of an entry; does nothing when the cache is disabled. */
+    public void put(long ledgerId, long entryId, long currentPosition) {
+        if (entryOffsetsCache == null) {
+            return;
+        }
+        entryOffsetsCache.put(new Key(ledgerId, entryId), currentPosition);
+    }
+
+    /** Returns the cached position of an entry, or {@code null} when absent or when the cache is disabled. */
+    public Long getIfPresent(long ledgerId, long entryId) {
+        if (entryOffsetsCache == null) {
+            return null;
+        }
+        return entryOffsetsCache.getIfPresent(new Key(ledgerId, entryId));
+    }
+
+    /** Drops every cached position. */
+    public void clear() {
+        if (entryOffsetsCache == null) {
+            return;
+        }
+        entryOffsetsCache.invalidateAll();
+    }
+
+    /** Stops the eviction thread, if one was started. */
+    @Override
+    public void close() {
+        if (cacheEvictionExecutor == null) {
+            return;
+        }
+        cacheEvictionExecutor.shutdownNow();
+    }
+}
diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/StreamingDataBlockHeaderImpl.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/StreamingDataBlockHeaderImpl.java
new file mode 100644
index 0000000000000..3ce1e1fecf025
--- /dev/null
+++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/StreamingDataBlockHeaderImpl.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.bookkeeper.mledger.offload.jcloud.impl;
+
+import com.google.common.io.CountingInputStream;
+import io.netty.buffer.ByteBuf;
+import io.netty.buffer.ByteBufInputStream;
+import java.io.DataInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import org.apache.bookkeeper.mledger.offload.jcloud.DataBlockHeader;
+import org.apache.pulsar.common.allocator.PulsarByteBufAllocator;
+
+/**
+ * The data block header in tiered storage for each data block.
+ */
+public class StreamingDataBlockHeaderImpl implements DataBlockHeader {
+    // Magic Word for streaming data block.
+    // It is a sequence of bytes used to identify the start of a block.
+    static final int MAGIC_WORD = 0x26A66D32;
+    // This is bigger than header size. Leaving some place for alignment and future enhancement.
+    // Payload use this as the start offset.
+    public static final int HEADER_MAX_SIZE = 128;
+    private static final int HEADER_BYTES_USED = 4 /* magic */
+            + 8 /* header len */
+            + 8 /* block len */
+            + 8 /* first entry id */
+            + 8 /* ledger id */;
+    private static final byte[] PADDING = new byte[HEADER_MAX_SIZE - HEADER_BYTES_USED];
+
+    public long getLedgerId() {
+        return ledgerId;
+    }
+
+    private final long ledgerId;
+
+    /** Creates a header of {@link #HEADER_MAX_SIZE} bytes for a block of the given length. */
+    public static StreamingDataBlockHeaderImpl of(int blockLength, long ledgerId, long firstEntryId) {
+        return new StreamingDataBlockHeaderImpl(HEADER_MAX_SIZE, blockLength, ledgerId, firstEntryId);
+    }
+
+    private final long headerLength;
+    private final long blockLength;
+    private final long firstEntryId;
+
+    public static int getBlockMagicWord() {
+        return MAGIC_WORD;
+    }
+
+    public static int getDataStartOffset() {
+        return HEADER_MAX_SIZE;
+    }
+
+    @Override
+    public long getBlockLength() {
+        return this.blockLength;
+    }
+
+    @Override
+    public long getHeaderLength() {
+        return this.headerLength;
+    }
+
+    @Override
+    public long getFirstEntryId() {
+        return this.firstEntryId;
+    }
+
+    public StreamingDataBlockHeaderImpl(long headerLength, long blockLength, long ledgerId, long firstEntryId) {
+        this.headerLength = headerLength;
+        this.blockLength = blockLength;
+        this.firstEntryId = firstEntryId;
+        this.ledgerId = ledgerId;
+    }
+
+    /**
+     * Construct DataBlockHeader from InputStream, which contains `HEADER_MAX_SIZE` bytes readable.
+     *
+     * @param stream stream positioned at the first byte of the header
+     * @return the parsed header; the stream is left positioned right after the header
+     * @throws IOException if the magic word does not match
+     * @throws EOFException if the stream ends before the declared header length, or the declared
+     *         header length is smaller than the fixed fields already read
+     */
+    public static StreamingDataBlockHeaderImpl fromStream(InputStream stream) throws IOException {
+        CountingInputStream countingStream = new CountingInputStream(stream);
+        DataInputStream dis = new DataInputStream(countingStream);
+        int magic = dis.readInt();
+        if (magic != MAGIC_WORD) {
+            throw new IOException("Data block header magic word not match. read: " + magic
+                    + " expected: " + MAGIC_WORD);
+        }
+
+        long headerLen = dis.readLong();
+        long blockLen = dis.readLong();
+        long firstEntryId = dis.readLong();
+        long ledgerId = dis.readLong();
+        long toSkip = headerLen - countingStream.getCount();
+        if (toSkip < 0) {
+            throw new EOFException("Header was too small");
+        }
+        try {
+            // skip() may skip fewer bytes than asked without being at end of stream; skipNBytes
+            // keeps going and only fails when the stream really ends before the padding does.
+            dis.skipNBytes(toSkip);
+        } catch (EOFException e) {
+            EOFException tooSmall = new EOFException("Header was too small");
+            tooSmall.initCause(e);
+            throw tooSmall;
+        }
+
+        return new StreamingDataBlockHeaderImpl(headerLen, blockLen, ledgerId, firstEntryId);
+    }
+
+    /**
+     * Get the content of the data block header as InputStream.
+     * Read out in format:
+     *   [ magic_word -- int ][ header_len -- long ][ block_len -- long ]
+     *   [ first_entry_id -- long ][ ledger_id -- long ][ padding zeros ]
+     * The result is always {@link #HEADER_MAX_SIZE} bytes long.
+     */
+    @Override
+    public InputStream toStream() {
+        ByteBuf out = PulsarByteBufAllocator.DEFAULT.buffer(HEADER_MAX_SIZE, HEADER_MAX_SIZE);
+        out.writeInt(MAGIC_WORD)
+                .writeLong(headerLength)
+                .writeLong(blockLength)
+                .writeLong(firstEntryId)
+                .writeLong(ledgerId)
+                .writeBytes(PADDING);
+
+        // true means the input stream will release the ByteBuf on close
+        return new ByteBufInputStream(out, true);
+    }
+
+    @Override
+    public String toString() {
+        return String.format("StreamingDataBlockHeader(len:%d,hlen:%d,firstEntry:%d,ledger:%d)",
+                blockLength, headerLength, firstEntryId, ledgerId);
+    }
+}
+
diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/package-info.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/package-info.java
new file mode 100644
index 0000000000000..346fe3262b163
--- /dev/null
+++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/impl/package-info.java
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +package org.apache.bookkeeper.mledger.offload.jcloud.impl; diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/package-info.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/package-info.java new file mode 100644 index 0000000000000..4daf3fff26f78 --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/jcloud/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bookkeeper.mledger.offload.jcloud; diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/OpenDALLedgerOffloaderFactory.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/OpenDALLedgerOffloaderFactory.java new file mode 100644 index 0000000000000..a89352d754a7b --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/OpenDALLedgerOffloaderFactory.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.bookkeeper.mledger.offload.opendal;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Set;
+import org.apache.bookkeeper.common.util.OrderedScheduler;
+import org.apache.bookkeeper.mledger.LedgerOffloaderFactory;
+import org.apache.bookkeeper.mledger.LedgerOffloaderStats;
+import org.apache.bookkeeper.mledger.LedgerOffloaderStatsDisable;
+import org.apache.bookkeeper.mledger.offload.jcloud.impl.OffsetsCache;
+import org.apache.bookkeeper.mledger.offload.opendal.impl.OpenDALManagedLedgerOffloader;
+import org.apache.bookkeeper.mledger.offload.opendal.provider.OpenDALOperatorProvider;
+import org.apache.bookkeeper.mledger.offload.opendal.provider.OpenDALTieredStorageConfiguration;
+import org.apache.bookkeeper.mledger.offload.opendal.provider.OperatorCache;
+import org.apache.pulsar.common.policies.data.OffloadPoliciesImpl;
+
+/**
+ * OpenDAL based offloader factory.
+ *
+ * <p>Creates {@link OpenDALManagedLedgerOffloader} instances for the supported driver names.
+ * Every offloader created by one factory shares that factory's offsets cache and operator cache;
+ * both are released when the factory is closed.
+ */
+public class OpenDALLedgerOffloaderFactory implements LedgerOffloaderFactory<OpenDALManagedLedgerOffloader> {
+
+    // Keep aligned with the existing driver names so that broker configs don't need to change.
+    private static final Set<String> SUPPORTED_DRIVERS = Set.of(
+            "aws-s3",
+            "S3",
+            "aliyun-oss",
+            "google-cloud-storage",
+            "azureblob",
+            "transient"
+    );
+
+    private final OffsetsCache entryOffsetsCache = new OffsetsCache();
+    private final OperatorCache operatorCache = new OperatorCache();
+
+    /**
+     * Tells whether this factory handles the given driver.
+     *
+     * @param driverName configured driver name; matched case-insensitively
+     * @return {@code true} for a supported name, {@code false} otherwise (including {@code null})
+     */
+    @Override
+    public boolean isDriverSupported(String driverName) {
+        return driverName != null && SUPPORTED_DRIVERS.stream().anyMatch(d -> d.equalsIgnoreCase(driverName));
+    }
+
+    /** Creates an offloader with statistics disabled. */
+    @Override
+    public OpenDALManagedLedgerOffloader create(OffloadPoliciesImpl offloadPolicies,
+                                                Map<String, String> userMetadata,
+                                                OrderedScheduler scheduler) throws IOException {
+        return create(offloadPolicies, userMetadata, scheduler, LedgerOffloaderStatsDisable.INSTANCE);
+    }
+
+    /** Creates an offloader that uses {@code scheduler} both for offloading and for reads. */
+    @Override
+    public OpenDALManagedLedgerOffloader create(OffloadPoliciesImpl offloadPolicies,
+                                                Map<String, String> userMetadata,
+                                                OrderedScheduler scheduler,
+                                                LedgerOffloaderStats offloaderStats) throws IOException {
+        return create(offloadPolicies, userMetadata, scheduler, scheduler, offloaderStats);
+    }
+
+    /**
+     * Creates an offloader from the offload policies.
+     *
+     * @param offloadPolicies policies whose properties form the tiered-storage configuration
+     * @param userMetadata metadata handed to the offloader
+     * @param scheduler scheduler handed to the offloader
+     * @param readExecutor executor handed to the offloader for reads
+     * @param offloaderStats statistics sink
+     * @throws IOException if the configuration or the offloader cannot be created
+     */
+    @Override
+    public OpenDALManagedLedgerOffloader create(OffloadPoliciesImpl offloadPolicies,
+                                                Map<String, String> userMetadata,
+                                                OrderedScheduler scheduler,
+                                                OrderedScheduler readExecutor,
+                                                LedgerOffloaderStats offloaderStats) throws IOException {
+        OpenDALTieredStorageConfiguration config =
+                OpenDALTieredStorageConfiguration.create(offloadPolicies.toProperties());
+        OpenDALOperatorProvider operatorProvider = new OpenDALOperatorProvider(config, operatorCache);
+        return OpenDALManagedLedgerOffloader.create(config, userMetadata, scheduler, readExecutor, offloaderStats,
+                entryOffsetsCache, operatorProvider);
+    }
+
+    /**
+     * Releases the operator cache and the offsets cache. The offsets cache is closed even when
+     * closing the operator cache fails, so its eviction thread is never left running.
+     */
+    @Override
+    public void close() throws Exception {
+        try {
+            operatorCache.close();
+        } finally {
+            entryOffsetsCache.close();
+        }
+    }
+}
diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/DataBlockUtils.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/DataBlockUtils.java
new file mode 100644
index 0000000000000..8e19f36c6d119
--- /dev/null
+++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/DataBlockUtils.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.bookkeeper.mledger.offload.opendal.impl;
+
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.UUID;
+
+/**
+ * Format and naming utilities that must remain compatible with the existing tiered-storage-jcloud implementation.
+ */
+public final class DataBlockUtils {
+
+    /**
+     * Keep the same metadata key used by tiered-storage-jcloud.
+     *
+     * <p>Note: Some backends normalize user-metadata keys; callers should treat it as case-insensitive.
+     */
+    public static final String METADATA_FORMAT_VERSION_KEY = "S3ManagedLedgerOffloaderFormatVersion";
+
+    /**
+     * Keep the same format version written by tiered-storage-jcloud.
+     */
+    static final String CURRENT_VERSION = String.valueOf(1);
+
+    // Separator between the uuid and the ledger id in an object key.
+    private static final String LEDGER_MARKER = "-ledger-";
+    // Suffix that distinguishes an index object from its data object.
+    private static final String INDEX_SUFFIX = "-index";
+
+    private DataBlockUtils() {
+    }
+
+    /** Key of the data object of a ledger: {@code <uuid>-ledger-<ledgerId>}. */
+    public static String dataBlockOffloadKey(long ledgerId, UUID uuid) {
+        return String.format("%s-ledger-%d", uuid.toString(), ledgerId);
+    }
+
+    /** Key of the index object of a ledger: {@code <uuid>-ledger-<ledgerId>-index}. */
+    public static String indexBlockOffloadKey(long ledgerId, UUID uuid) {
+        return String.format("%s-ledger-%d-index", uuid.toString(), ledgerId);
+    }
+
+    /** Key of an index object that is not tied to a single ledger: {@code <uuid>-index}. */
+    public static String indexBlockOffloadKey(UUID uuid) {
+        return String.format("%s-index", uuid.toString());
+    }
+
+    /**
+     * Returns a new mutable map holding the user metadata plus the format-version entry.
+     *
+     * @param userMetadata caller metadata; may be {@code null} or empty and is never modified
+     */
+    public static Map<String, String> withVersionInfo(Map<String, String> userMetadata) {
+        Map<String, String> metadata = new HashMap<>();
+        if (userMetadata != null && !userMetadata.isEmpty()) {
+            metadata.putAll(userMetadata);
+        }
+        // Follow tiered-storage-jcloud behavior: write the version key using lower-case.
+        // Locale.ROOT keeps the key identical regardless of the JVM's default locale.
+        metadata.put(METADATA_FORMAT_VERSION_KEY.toLowerCase(Locale.ROOT), CURRENT_VERSION);
+        return metadata;
+    }
+
+    /**
+     * Extracts the ledger id from a data or index object key.
+     *
+     * @return the ledger id, or {@code null} when the name is {@code null}, empty, has no
+     *         {@code -ledger-} part, or the part after it is not a number
+     */
+    public static Long parseLedgerId(String name) {
+        if (name == null || name.isEmpty()) {
+            return null;
+        }
+        if (name.endsWith(INDEX_SUFFIX)) {
+            name = name.substring(0, name.length() - INDEX_SUFFIX.length());
+        }
+        int pos = name.indexOf(LEDGER_MARKER);
+        if (pos < 0) {
+            return null;
+        }
+        try {
+            return Long.parseLong(name.substring(pos + LEDGER_MARKER.length()));
+        } catch (NumberFormatException err) {
+            return null;
+        }
+    }
+
+    /**
+     * Extracts the part of an object key that precedes {@code -ledger-<ledgerId>}.
+     *
+     * @return that prefix, or {@code null} when an argument is {@code null}, the marker is missing,
+     *         or the prefix would be empty
+     */
+    public static String parseContextUuid(String name, Long ledgerId) {
+        if (ledgerId == null || name == null) {
+            return null;
+        }
+        int pos = name.indexOf(LEDGER_MARKER + ledgerId);
+        if (pos <= 0) {
+            return null;
+        }
+        return name.substring(0, pos);
+    }
+}
+
diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALBackedInputStream.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALBackedInputStream.java
new file mode 100644
index 0000000000000..d4d2abd833f40
--- /dev/null
+++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALBackedInputStream.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.
See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.bookkeeper.mledger.offload.opendal.impl;
+
+import io.netty.buffer.ByteBuf;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.concurrent.TimeUnit;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.bookkeeper.mledger.LedgerOffloaderStats;
+import org.apache.bookkeeper.mledger.offload.jcloud.BackedInputStream;
+import org.apache.bookkeeper.mledger.offload.opendal.storage.OpenDALStorage;
+import org.apache.pulsar.common.allocator.PulsarByteBufAllocator;
+import org.apache.pulsar.common.naming.TopicName;
+
+/**
+ * Seekable input stream over one stored object, read through {@link OpenDALStorage} in ranges of
+ * at most {@code bufferSize} bytes.
+ *
+ * <p>{@code cursor} is the object offset of the next byte to fetch from storage;
+ * {@code bufferOffsetStart}/{@code bufferOffsetEnd} are the inclusive object offsets of the bytes
+ * currently held in {@code buffer}, or {@code -1} when the buffer holds nothing addressable.
+ */
+@Slf4j
+class OpenDALBackedInputStream extends BackedInputStream {
+
+    private final OpenDALStorage storage;
+    private final String key;
+    private final ByteBuf buffer;
+    private final long objectLen;
+    private final int bufferSize;
+    private final LedgerOffloaderStats offloaderStats;
+    private final String topicName;
+
+    private long cursor;
+    private long bufferOffsetStart;
+    private long bufferOffsetEnd;
+
+    OpenDALBackedInputStream(OpenDALStorage storage,
+                             String key,
+                             long objectLen,
+                             int bufferSize,
+                             LedgerOffloaderStats offloaderStats,
+                             String managedLedgerName) {
+        this.storage = storage;
+        this.key = key;
+        this.buffer = PulsarByteBufAllocator.DEFAULT.buffer(bufferSize, bufferSize);
+        this.objectLen = objectLen;
+        this.bufferSize = bufferSize;
+        this.offloaderStats = offloaderStats;
+        this.topicName = managedLedgerName != null
+                ? TopicName.fromPersistenceNamingEncoding(managedLedgerName) : null;
+        this.cursor = 0;
+        this.bufferOffsetStart = this.bufferOffsetEnd = -1;
+    }
+
+    /**
+     * Makes sure the buffer has at least one readable byte, fetching the next range if needed.
+     *
+     * @return {@code true} when data is readable, {@code false} at the end of the object
+     * @throws IOException if the range cannot be read or the storage returns no data for it
+     */
+    private boolean refillBufferIfNeeded() throws IOException {
+        if (buffer.readableBytes() != 0) {
+            return true;
+        }
+        if (cursor >= objectLen) {
+            return false;
+        }
+        long startRange = cursor;
+        long endRange = Math.min(cursor + bufferSize - 1, objectLen - 1);
+        long startReadTime = System.nanoTime();
+        try (InputStream stream = storage.readRange(key, startRange, endRange)) {
+            buffer.clear();
+            bufferOffsetStart = bufferOffsetEnd = -1;
+            fillBuffer(stream, (int) (endRange - startRange + 1));
+            int loaded = buffer.readableBytes();
+            if (loaded == 0) {
+                // Returning true with an empty buffer would make read() fail with an index error
+                // and make read(byte[]) return 0 forever.
+                throw new IOException(String.format("No data returned for range [%d, %d] of %s",
+                        startRange, endRange, key));
+            }
+            // Track what was really loaded: the stream may end before the requested range does.
+            bufferOffsetStart = startRange;
+            bufferOffsetEnd = startRange + loaded - 1;
+            cursor += loaded;
+            if (offloaderStats != null && topicName != null) {
+                offloaderStats.recordReadOffloadBytes(topicName, loaded);
+            }
+        } catch (Throwable t) {
+            if (cursor == startRange) {
+                // The refill did not complete: drop partial bytes so that a retry starts from a
+                // clean state instead of serving them and then fetching the same range again.
+                buffer.clear();
+                bufferOffsetStart = bufferOffsetEnd = -1;
+            }
+            if (offloaderStats != null && topicName != null) {
+                offloaderStats.recordReadOffloadError(topicName);
+            }
+            if (t instanceof IOException) {
+                throw (IOException) t;
+            }
+            throw new IOException("Error reading from OpenDAL", t);
+        } finally {
+            if (offloaderStats != null && topicName != null) {
+                offloaderStats.recordReadOffloadDataLatency(topicName,
+                        System.nanoTime() - startReadTime, TimeUnit.NANOSECONDS);
+            }
+        }
+        return true;
+    }
+
+    /** Copies up to {@code bytesToCopy} bytes from the stream into the buffer, stopping early at end of stream. */
+    private void fillBuffer(InputStream is, int bytesToCopy) throws IOException {
+        while (bytesToCopy > 0) {
+            int writeBytes = buffer.writeBytes(is, bytesToCopy);
+            if (writeBytes < 0) {
+                break;
+            }
+            bytesToCopy -= writeBytes;
+        }
+    }
+
+    @Override
+    public int read() throws IOException {
+        if (refillBufferIfNeeded()) {
+            return buffer.readUnsignedByte();
+        }
+        return -1;
+    }
+
+    @Override
+    public int read(byte[] b, int off, int len) throws IOException {
+        if (len == 0) {
+            // InputStream contract: a zero-length read returns 0 and needs no storage access.
+            return 0;
+        }
+        if (refillBufferIfNeeded()) {
+            int bytesToRead = Math.min(len, buffer.readableBytes());
+            buffer.readBytes(b, off, bytesToRead);
+            return bytesToRead;
+        }
+        return -1;
+    }
+
+    /**
+     * Moves to an absolute object offset. A position inside the buffered range only moves the
+     * buffer's reader index; any other position discards the buffer so the next read fetches from there.
+     */
+    @Override
+    public void seek(long position) {
+        if (log.isDebugEnabled()) {
+            log.debug("Seeking to {} on {}, current position {} (bufStart:{}, bufEnd:{})",
+                    position, key, cursor, bufferOffsetStart, bufferOffsetEnd);
+        }
+        if (position >= bufferOffsetStart && position <= bufferOffsetEnd) {
+            long newIndex = position - bufferOffsetStart;
+            buffer.readerIndex((int) newIndex);
+        } else {
+            bufferOffsetStart = bufferOffsetEnd = -1;
+            cursor = position;
+            buffer.clear();
+        }
+    }
+
+    @Override
+    public void seekForward(long position) throws IOException {
+        if (position >= cursor) {
+            seek(position);
+        } else {
+            throw new IOException(String.format("Error seeking, new position %d < current position %d",
+                    position, cursor));
+        }
+    }
+
+    @Override
+    public long getCurrentPosition() {
+        if (bufferOffsetStart != -1) {
+            return bufferOffsetStart + buffer.readerIndex();
+        }
+        return cursor + buffer.readerIndex();
+    }
+
+    @Override
+    public void close() {
+        buffer.release();
+    }
+
+    @Override
+    public int available() {
+        // Never negative, even after a seek beyond the end of the object.
+        long available = Math.max(0, objectLen - cursor + buffer.readableBytes());
+        return available > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) available;
+    }
+}
+
diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALBackedReadHandleImpl.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALBackedReadHandleImpl.java
new file mode 100644
index 0000000000000..c2ba6114e03e6
--- /dev/null
+++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALBackedReadHandleImpl.java
@@ -0,0 +1,424 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bookkeeper.mledger.offload.opendal.impl; + +import com.google.common.annotations.VisibleForTesting; +import io.netty.buffer.ByteBuf; +import java.io.DataInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicIntegerFieldUpdater; +import java.util.concurrent.atomic.AtomicReference; +import lombok.extern.slf4j.Slf4j; +import org.apache.bookkeeper.client.BKException; +import org.apache.bookkeeper.client.api.LastConfirmedAndEntry; +import org.apache.bookkeeper.client.api.LedgerEntries; +import org.apache.bookkeeper.client.api.LedgerEntry; +import org.apache.bookkeeper.client.api.LedgerMetadata; +import org.apache.bookkeeper.client.api.ReadHandle; +import org.apache.bookkeeper.client.impl.LedgerEntriesImpl; +import org.apache.bookkeeper.client.impl.LedgerEntryImpl; +import org.apache.bookkeeper.mledger.LedgerOffloaderStats; +import org.apache.bookkeeper.mledger.ManagedLedgerException; +import org.apache.bookkeeper.mledger.OffloadedLedgerHandle; +import org.apache.bookkeeper.mledger.offload.jcloud.BackedInputStream; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexBlock; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexBlockBuilder; +import 
org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexEntry; +import org.apache.bookkeeper.mledger.offload.jcloud.impl.OffsetsCache; +import org.apache.bookkeeper.mledger.offload.opendal.storage.OpenDALStorage; +import org.apache.pulsar.common.allocator.PulsarByteBufAllocator; +import org.apache.pulsar.common.naming.TopicName; + +@Slf4j +public class OpenDALBackedReadHandleImpl implements ReadHandle, OffloadedLedgerHandle { + + protected static final AtomicIntegerFieldUpdater PENDING_READ_UPDATER = + AtomicIntegerFieldUpdater.newUpdater(OpenDALBackedReadHandleImpl.class, "pendingRead"); + + private final long ledgerId; + private final OffloadIndexBlock index; + private final BackedInputStream inputStream; + private final DataInputStream dataStream; + private final ExecutorService executor; + private final OffsetsCache entryOffsetsCache; + private final AtomicReference> closeFuture = new AtomicReference<>(); + + enum State { + Opened, + Closed + } + + private volatile State state; + private volatile int pendingRead; + private volatile long lastAccessTimestamp = System.currentTimeMillis(); + + @VisibleForTesting + OpenDALBackedReadHandleImpl(long ledgerId, + OffloadIndexBlock index, + BackedInputStream inputStream, + ExecutorService executor, + OffsetsCache entryOffsetsCache) { + this.ledgerId = ledgerId; + this.index = index; + this.inputStream = inputStream; + this.dataStream = new DataInputStream(inputStream); + this.executor = executor; + this.entryOffsetsCache = entryOffsetsCache; + state = State.Opened; + } + + @Override + public long getId() { + return ledgerId; + } + + @Override + public LedgerMetadata getLedgerMetadata() { + return index.getLedgerMetadata(); + } + + @Override + public CompletableFuture closeAsync() { + if (closeFuture.get() != null || !closeFuture.compareAndSet(null, new CompletableFuture<>())) { + return closeFuture.get(); + } + + CompletableFuture promise = closeFuture.get(); + executor.execute(() -> { + try { + index.close(); + 
inputStream.close(); + state = State.Closed; + promise.complete(null); + } catch (IOException t) { + promise.completeExceptionally(t); + } + }); + return promise; + } + + private class ReadTask implements Runnable { + private final long firstEntry; + private final long lastEntry; + private final CompletableFuture promise; + private int seekedAndTryTimes = 0; + + ReadTask(long firstEntry, long lastEntry, CompletableFuture promise) { + this.firstEntry = firstEntry; + this.lastEntry = lastEntry; + this.promise = promise; + } + + @Override + public void run() { + if (state == State.Closed) { + log.warn("Reading a closed read handler. Ledger ID: {}, Read range: {}-{}", + ledgerId, firstEntry, lastEntry); + promise.completeExceptionally(new ManagedLedgerException.OffloadReadHandleClosedException()); + return; + } + + List entryCollector = new ArrayList<>(); + try { + if (firstEntry > lastEntry || firstEntry < 0 || lastEntry > getLastAddConfirmed()) { + promise.completeExceptionally(new BKException.BKIncorrectParameterException()); + return; + } + long entriesToRead = (lastEntry - firstEntry) + 1; + long expectedEntryId = firstEntry; + seekToEntryOffset(firstEntry); + seekedAndTryTimes++; + + while (entriesToRead > 0) { + long currentPosition = inputStream.getCurrentPosition(); + int length = dataStream.readInt(); + if (length < 0) { // hit padding or new block + seekToEntryOffset(expectedEntryId); + continue; + } + long entryId = dataStream.readLong(); + if (entryId == expectedEntryId) { + entryOffsetsCache.put(ledgerId, entryId, currentPosition); + ByteBuf buf = PulsarByteBufAllocator.DEFAULT.buffer(length, length); + entryCollector.add(LedgerEntryImpl.create(ledgerId, entryId, length, buf)); + int toWrite = length; + while (toWrite > 0) { + toWrite -= buf.writeBytes(dataStream, toWrite); + } + entriesToRead--; + expectedEntryId++; + } else { + handleUnexpectedEntryId(expectedEntryId, entryId); + } + } + promise.complete(LedgerEntriesImpl.create(entryCollector)); + } 
catch (Throwable t) { + log.error("Failed to read entries {} - {} from the offloader in ledger {}, current position of input" + + " stream is {}", firstEntry, lastEntry, ledgerId, inputStream.getCurrentPosition(), t); + if (t instanceof FileNotFoundException) { + promise.completeExceptionally(new BKException.BKNoSuchLedgerExistsException()); + } else { + promise.completeExceptionally(t); + } + entryCollector.forEach(LedgerEntry::close); + } + } + + private void handleUnexpectedEntryId(long expectedId, long actEntryId) throws Exception { + LedgerMetadata ledgerMetadata = getLedgerMetadata(); + OffloadIndexEntry offsetOfExpectedId = index.getIndexEntryForEntry(expectedId); + OffloadIndexEntry offsetOfActId = actEntryId <= getLedgerMetadata().getLastEntryId() && actEntryId >= 0 + ? index.getIndexEntryForEntry(actEntryId) : null; + String logLine = String.format("Failed to read [ %s ~ %s ] of the ledger %s." + + " Because got a incorrect entry id %s, the offset is %s." + + " The expected entry id is %s, the offset is %s." + + " Have seeked and retry read times: %s. LAC is %s.", + firstEntry, lastEntry, ledgerId, + actEntryId, offsetOfActId == null ? "null because it does not exist" + : String.valueOf(offsetOfActId), + expectedId, String.valueOf(offsetOfExpectedId), + seekedAndTryTimes, ledgerMetadata != null ? 
ledgerMetadata.getLastEntryId() : "unknown"); + long maxTryTimes = Math.max(3, (lastEntry - firstEntry + 1) >> 2); + if (seekedAndTryTimes > maxTryTimes) { + log.error(logLine); + throw new BKException.BKUnexpectedConditionException(); + } else { + log.warn(logLine); + } + seekToEntryOffset(expectedId); + seekedAndTryTimes++; + } + + private void skipPreviousEntry(long startEntryId, long expectedEntryId) throws IOException, BKException { + long nextExpectedEntryId = startEntryId; + while (nextExpectedEntryId < expectedEntryId) { + long offset = inputStream.getCurrentPosition(); + int len = dataStream.readInt(); + if (len < 0) { + LedgerMetadata ledgerMetadata = getLedgerMetadata(); + OffloadIndexEntry offsetOfExpectedId = index.getIndexEntryForEntry(expectedEntryId); + log.error("Failed to read [ {} ~ {} ] of the ledger {}." + + " Because failed to skip a previous entry {}, len: {}, got a negative len." + + " The expected entry id is {}, the offset is {}." + + " Have seeked and retry read times: {}. LAC is {}.", + firstEntry, lastEntry, ledgerId, + nextExpectedEntryId, len, + expectedEntryId, String.valueOf(offsetOfExpectedId), + seekedAndTryTimes, ledgerMetadata != null ? ledgerMetadata.getLastEntryId() : "unknown"); + throw new BKException.BKUnexpectedConditionException(); + } + long entryId = dataStream.readLong(); + if (entryId == nextExpectedEntryId) { + long skipped = inputStream.skip(len); + if (skipped != len) { + LedgerMetadata ledgerMetadata = getLedgerMetadata(); + OffloadIndexEntry offsetOfExpectedId = index.getIndexEntryForEntry(expectedEntryId); + log.error("Failed to read [ {} ~ {} ] of the ledger {}." + + " Because failed to skip a previous entry {}, offset: {}, len: {}," + + " there is no more data." + + " The expected entry id is {}, the offset is {}." + + " Have seeked and retry read times: {}. 
LAC is {}.", + firstEntry, lastEntry, ledgerId, + entryId, offset, len, + expectedEntryId, String.valueOf(offsetOfExpectedId), + seekedAndTryTimes, ledgerMetadata != null ? ledgerMetadata.getLastEntryId() + : "unknown"); + throw new BKException.BKUnexpectedConditionException(); + } + nextExpectedEntryId++; + } else { + LedgerMetadata ledgerMetadata = getLedgerMetadata(); + OffloadIndexEntry offsetOfExpectedId = index.getIndexEntryForEntry(expectedEntryId); + log.error("Failed to read [ {} ~ {} ] of the ledger {}." + + " Because got a incorrect entry id {},." + + " The expected entry id is {}, the offset is {}." + + " Have seeked and retry read times: {}. LAC is {}.", + firstEntry, lastEntry, ledgerId, + entryId, expectedEntryId, String.valueOf(offsetOfExpectedId), + seekedAndTryTimes, ledgerMetadata != null ? ledgerMetadata.getLastEntryId() : "unknown"); + throw new BKException.BKUnexpectedConditionException(); + } + } + } + + private void seekToEntryOffset(long expectedEntryId) throws IOException, BKException { + Long cachedPreciseIndex = entryOffsetsCache.getIfPresent(ledgerId, expectedEntryId); + if (cachedPreciseIndex != null) { + inputStream.seek(cachedPreciseIndex); + return; + } + + OffloadIndexEntry indexOfNearestEntry = index.getIndexEntryForEntry(expectedEntryId); + if (indexOfNearestEntry.getEntryId() == expectedEntryId) { + inputStream.seek(indexOfNearestEntry.getDataOffset()); + return; + } + + Long cachedPreviousKnownOffset = entryOffsetsCache.getIfPresent(ledgerId, expectedEntryId - 1); + if (cachedPreviousKnownOffset != null) { + inputStream.seek(cachedPreviousKnownOffset); + skipPreviousEntry(expectedEntryId - 1, expectedEntryId); + return; + } + + if (indexOfNearestEntry.getEntryId() < expectedEntryId) { + inputStream.seek(indexOfNearestEntry.getDataOffset()); + skipPreviousEntry(indexOfNearestEntry.getEntryId(), expectedEntryId); + } else { + LedgerMetadata ledgerMetadata = getLedgerMetadata(); + log.error("Failed to read [ {} ~ {} ] of the 
ledger {}." + + " Because got a incorrect index {} of the entry {}, which is greater than expected." + + " Have seeked and retry read times: {}. LAC is {}.", + firstEntry, lastEntry, ledgerId, + String.valueOf(indexOfNearestEntry), expectedEntryId, + seekedAndTryTimes, ledgerMetadata != null ? ledgerMetadata.getLastEntryId() : "unknown"); + throw new BKException.BKUnexpectedConditionException(); + } + } + } + + @Override + public CompletableFuture readAsync(long firstEntry, long lastEntry) { + if (log.isDebugEnabled()) { + log.debug("Ledger {}: reading {} - {} ({} entries}", + getId(), firstEntry, lastEntry, (1 + lastEntry - firstEntry)); + } + CompletableFuture promise = new CompletableFuture<>(); + + PENDING_READ_UPDATER.incrementAndGet(this); + promise.whenComplete((__, ex) -> { + lastAccessTimestamp = System.currentTimeMillis(); + PENDING_READ_UPDATER.decrementAndGet(OpenDALBackedReadHandleImpl.this); + }); + executor.execute(new ReadTask(firstEntry, lastEntry, promise)); + return promise; + } + + @Override + public CompletableFuture readUnconfirmedAsync(long firstEntry, long lastEntry) { + return readAsync(firstEntry, lastEntry); + } + + @Override + public CompletableFuture readLastAddConfirmedAsync() { + return CompletableFuture.completedFuture(getLastAddConfirmed()); + } + + @Override + public CompletableFuture tryReadLastAddConfirmedAsync() { + return CompletableFuture.completedFuture(getLastAddConfirmed()); + } + + @Override + public long getLastAddConfirmed() { + return getLedgerMetadata().getLastEntryId(); + } + + @Override + public long getLength() { + return getLedgerMetadata().getLength(); + } + + @Override + public boolean isClosed() { + return getLedgerMetadata().isClosed(); + } + + @Override + public CompletableFuture readLastAddConfirmedAndEntryAsync(long entryId, + long timeOutInMillis, + boolean parallel) { + CompletableFuture promise = new CompletableFuture<>(); + promise.completeExceptionally(new UnsupportedOperationException()); + return 
promise; + } + + public static ReadHandle open(ScheduledExecutorService executor, + OpenDALStorage storage, + String dataKey, + String indexKey, + long ledgerId, + int readBufferSize, + LedgerOffloaderStats offloaderStats, + String managedLedgerName, + OffsetsCache entryOffsetsCache) + throws IOException, BKException.BKNoSuchLedgerExistsException { + int retryCount = 3; + OffloadIndexBlock index = null; + IOException lastException = null; + String topicName = TopicName.fromPersistenceNamingEncoding(managedLedgerName); + while (retryCount-- > 0) { + long readIndexStartTime = System.nanoTime(); + try { + OpenDALStorage.ObjectMetadata meta = storage.stat(indexKey); + if (meta.getSize() <= 0) { + throw new IOException("Index object is empty: " + indexKey); + } + try (InputStream in = storage.readRange(indexKey, 0, meta.getSize() - 1)) { + OffloadIndexBlockBuilder indexBuilder = OffloadIndexBlockBuilder.create(); + index = (OffloadIndexBlock) indexBuilder.fromStream(in); + } + offloaderStats.recordReadOffloadIndexLatency(topicName, + System.nanoTime() - readIndexStartTime, TimeUnit.NANOSECONDS); + lastException = null; + break; + } catch (FileNotFoundException notFound) { + log.error("{} not found for ledger {}", indexKey, ledgerId); + throw new BKException.BKNoSuchLedgerExistsException(); + } catch (IOException e) { + log.warn("Failed to get index block from the offloaded index file {}, still have {} times to retry", + indexKey, retryCount, e); + lastException = e; + } + } + if (lastException != null) { + throw lastException; + } + if (index == null) { + throw new IOException("Failed to open offloaded index " + indexKey); + } + + BackedInputStream inputStream = new OpenDALBackedInputStream(storage, dataKey, + index.getDataObjectLength(), readBufferSize, offloaderStats, managedLedgerName); + return new OpenDALBackedReadHandleImpl(ledgerId, index, inputStream, executor, entryOffsetsCache); + } + + @VisibleForTesting + State getState() { + return this.state; + } + + 
@Override + public long lastAccessTimestamp() { + return lastAccessTimestamp; + } + + @Override + public int getPendingRead() { + return PENDING_READ_UPDATER.get(this); + } +} diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALBackedReadHandleImplV2.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALBackedReadHandleImplV2.java new file mode 100644 index 0000000000000..c6d9bf663fec5 --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALBackedReadHandleImplV2.java @@ -0,0 +1,387 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.bookkeeper.mledger.offload.opendal.impl; + +import static com.google.common.base.Preconditions.checkArgument; +import io.netty.buffer.ByteBuf; +import java.io.DataInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicIntegerFieldUpdater; +import java.util.concurrent.atomic.AtomicReference; +import lombok.extern.slf4j.Slf4j; +import org.apache.bookkeeper.client.BKException; +import org.apache.bookkeeper.client.api.LastConfirmedAndEntry; +import org.apache.bookkeeper.client.api.LedgerEntries; +import org.apache.bookkeeper.client.api.LedgerEntry; +import org.apache.bookkeeper.client.api.LedgerMetadata; +import org.apache.bookkeeper.client.api.ReadHandle; +import org.apache.bookkeeper.client.impl.LedgerEntriesImpl; +import org.apache.bookkeeper.client.impl.LedgerEntryImpl; +import org.apache.bookkeeper.mledger.LedgerOffloaderStats; +import org.apache.bookkeeper.mledger.ManagedLedgerException; +import org.apache.bookkeeper.mledger.OffloadedLedgerHandle; +import org.apache.bookkeeper.mledger.offload.jcloud.BackedInputStream; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexBlockV2; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexBlockV2Builder; +import org.apache.bookkeeper.mledger.offload.opendal.storage.OpenDALStorage; +import org.apache.pulsar.common.allocator.PulsarByteBufAllocator; +import org.apache.pulsar.common.naming.TopicName; + +@Slf4j +public class OpenDALBackedReadHandleImplV2 implements ReadHandle, OffloadedLedgerHandle { + + private static final AtomicIntegerFieldUpdater PENDING_READ_UPDATER = + 
AtomicIntegerFieldUpdater.newUpdater(OpenDALBackedReadHandleImplV2.class, "pendingRead"); + + private final long ledgerId; + private final List indices; + private final List inputStreams; + private final List dataStreams; + private final ExecutorService executor; + private final AtomicReference> closeFuture = new AtomicReference<>(); + + private volatile State state; + private volatile int pendingRead; + private volatile long lastAccessTimestamp = System.currentTimeMillis(); + + enum State { + Opened, + Closed + } + + static class GroupedReader { + public final long ledgerId; + public final long firstEntry; + public final long lastEntry; + OffloadIndexBlockV2 index; + BackedInputStream inputStream; + DataInputStream dataStream; + + GroupedReader(long ledgerId, + long firstEntry, + long lastEntry, + OffloadIndexBlockV2 index, + BackedInputStream inputStream, + DataInputStream dataStream) { + this.ledgerId = ledgerId; + this.firstEntry = firstEntry; + this.lastEntry = lastEntry; + this.index = index; + this.inputStream = inputStream; + this.dataStream = dataStream; + } + + @Override + public String toString() { + return "GroupedReader{ledgerId=" + ledgerId + ", firstEntry=" + firstEntry + ", lastEntry=" + lastEntry + + '}'; + } + } + + private OpenDALBackedReadHandleImplV2(long ledgerId, + List indices, + List inputStreams, + ExecutorService executor) { + this.ledgerId = ledgerId; + this.indices = indices; + this.inputStreams = inputStreams; + this.dataStreams = new LinkedList<>(); + for (BackedInputStream inputStream : inputStreams) { + dataStreams.add(new DataInputStream(inputStream)); + } + this.executor = executor; + this.state = State.Opened; + } + + @Override + public long getId() { + return ledgerId; + } + + @Override + public LedgerMetadata getLedgerMetadata() { + // Return the most complete one. 
+ return indices.get(indices.size() - 1).getLedgerMetadata(ledgerId); + } + + @Override + public CompletableFuture closeAsync() { + if (closeFuture.get() != null || !closeFuture.compareAndSet(null, new CompletableFuture<>())) { + return closeFuture.get(); + } + + CompletableFuture promise = closeFuture.get(); + executor.execute(() -> { + try { + for (OffloadIndexBlockV2 indexBlock : indices) { + indexBlock.close(); + } + for (DataInputStream dataStream : dataStreams) { + dataStream.close(); + } + state = State.Closed; + promise.complete(null); + } catch (IOException t) { + promise.completeExceptionally(t); + } + }); + return promise; + } + + @Override + public CompletableFuture readAsync(long firstEntry, long lastEntry) { + if (log.isDebugEnabled()) { + log.debug("Ledger {}: reading {} - {}", getId(), firstEntry, lastEntry); + } + lastAccessTimestamp = System.currentTimeMillis(); + + CompletableFuture promise = new CompletableFuture<>(); + executor.execute(() -> { + if (state == State.Closed) { + log.warn("Reading a closed read handler. 
Ledger ID: {}, Read range: {}-{}", + ledgerId, firstEntry, lastEntry); + promise.completeExceptionally(new ManagedLedgerException.OffloadReadHandleClosedException()); + return; + } + + if (firstEntry > lastEntry + || firstEntry < 0 + || lastEntry > getLastAddConfirmed()) { + promise.completeExceptionally(new BKException.BKIncorrectParameterException()); + return; + } + + List entries = new ArrayList<>(); + List groupedReaders; + try { + groupedReaders = getGroupedReader(firstEntry, lastEntry); + } catch (Exception e) { + promise.completeExceptionally(e); + return; + } + + PENDING_READ_UPDATER.incrementAndGet(this); + try { + for (GroupedReader groupedReader : groupedReaders) { + long entriesToRead = (groupedReader.lastEntry - groupedReader.firstEntry) + 1; + long nextExpectedId = groupedReader.firstEntry; + while (entriesToRead > 0) { + int length = groupedReader.dataStream.readInt(); + if (length < 0) { // hit padding or new block + groupedReader.inputStream.seek(groupedReader.index + .getIndexEntryForEntry(groupedReader.ledgerId, nextExpectedId) + .getDataOffset()); + continue; + } + long entryId = groupedReader.dataStream.readLong(); + + if (entryId == nextExpectedId) { + ByteBuf buf = PulsarByteBufAllocator.DEFAULT.buffer(length, length); + entries.add(LedgerEntryImpl.create(ledgerId, entryId, length, buf)); + int toWrite = length; + while (toWrite > 0) { + toWrite -= buf.writeBytes(groupedReader.dataStream, toWrite); + } + entriesToRead--; + nextExpectedId++; + } else if (entryId > nextExpectedId) { + groupedReader.inputStream.seek(groupedReader.index + .getIndexEntryForEntry(groupedReader.ledgerId, nextExpectedId) + .getDataOffset()); + } else if (entryId < nextExpectedId + && !groupedReader.index.getIndexEntryForEntry(groupedReader.ledgerId, nextExpectedId) + .equals(groupedReader.index.getIndexEntryForEntry(groupedReader.ledgerId, entryId))) { + groupedReader.inputStream.seek(groupedReader.index + .getIndexEntryForEntry(groupedReader.ledgerId, 
nextExpectedId) + .getDataOffset()); + } else if (entryId > groupedReader.lastEntry) { + log.info("Expected to read {}, but read {}, which is greater than last entry {}", + nextExpectedId, entryId, groupedReader.lastEntry); + throw new BKException.BKUnexpectedConditionException(); + } else { + skipFully(groupedReader.inputStream, length); + } + } + } + promise.complete(LedgerEntriesImpl.create(entries)); + } catch (Throwable t) { + if (t instanceof FileNotFoundException) { + promise.completeExceptionally(new BKException.BKNoSuchLedgerExistsException()); + } else { + promise.completeExceptionally(t); + } + entries.forEach(LedgerEntry::close); + } finally { + PENDING_READ_UPDATER.decrementAndGet(this); + } + }); + return promise; + } + + private List getGroupedReader(long firstEntry, long lastEntry) throws Exception { + List groupedReaders = new LinkedList<>(); + for (int i = indices.size() - 1; i >= 0 && firstEntry <= lastEntry; i--) { + OffloadIndexBlockV2 index = indices.get(i); + long startEntryId = index.getStartEntryId(ledgerId); + if (startEntryId > lastEntry) { + log.debug("Entries are in earlier indices, skip this segment. 
ledgerId={}, beginEntryId={}", + ledgerId, startEntryId); + } else { + groupedReaders.add(new GroupedReader(ledgerId, startEntryId, lastEntry, + index, inputStreams.get(i), dataStreams.get(i))); + lastEntry = startEntryId - 1; + } + } + + checkArgument(firstEntry > lastEntry); + for (int i = 0; i < groupedReaders.size() - 1; i++) { + GroupedReader readerI = groupedReaders.get(i); + GroupedReader readerII = groupedReaders.get(i + 1); + checkArgument(readerI.ledgerId == readerII.ledgerId); + checkArgument(readerI.firstEntry >= readerII.lastEntry); + } + return groupedReaders; + } + + @Override + public CompletableFuture readUnconfirmedAsync(long firstEntry, long lastEntry) { + return readAsync(firstEntry, lastEntry); + } + + @Override + public CompletableFuture readLastAddConfirmedAsync() { + return CompletableFuture.completedFuture(getLastAddConfirmed()); + } + + @Override + public CompletableFuture tryReadLastAddConfirmedAsync() { + return CompletableFuture.completedFuture(getLastAddConfirmed()); + } + + @Override + public long getLastAddConfirmed() { + return getLedgerMetadata().getLastEntryId(); + } + + @Override + public long getLength() { + return getLedgerMetadata().getLength(); + } + + @Override + public boolean isClosed() { + return getLedgerMetadata().isClosed(); + } + + @Override + public CompletableFuture readLastAddConfirmedAndEntryAsync(long entryId, + long timeOutInMillis, + boolean parallel) { + CompletableFuture promise = new CompletableFuture<>(); + promise.completeExceptionally(new UnsupportedOperationException()); + return promise; + } + + public static ReadHandle open(ScheduledExecutorService executor, + OpenDALStorage storage, + List keys, + List indexKeys, + long ledgerId, + int readBufferSize, + LedgerOffloaderStats offloaderStats, + String managedLedgerName) + throws IOException, BKException.BKNoSuchLedgerExistsException { + List inputStreams = new LinkedList<>(); + List indices = new LinkedList<>(); + String topicName = managedLedgerName != 
null + ? TopicName.fromPersistenceNamingEncoding(managedLedgerName) + : null; + + for (int i = 0; i < indexKeys.size(); i++) { + String indexKey = indexKeys.get(i); + String key = keys.get(i); + + long startTime = System.nanoTime(); + OpenDALStorage.ObjectMetadata meta; + try { + meta = storage.stat(indexKey); + } catch (FileNotFoundException notFound) { + log.error("{} not found while opening V2 offloaded ledger {}", indexKey, ledgerId); + throw new BKException.BKNoSuchLedgerExistsException(); + } + if (offloaderStats != null && topicName != null) { + offloaderStats.recordReadOffloadIndexLatency(topicName, + System.nanoTime() - startTime, TimeUnit.NANOSECONDS); + } + if (meta.getSize() <= 0) { + throw new IOException("Index object is empty: " + indexKey); + } + + OffloadIndexBlockV2Builder indexBuilder = OffloadIndexBlockV2Builder.create(); + OffloadIndexBlockV2 index; + try (InputStream payloadStream = storage.readRange(indexKey, 0, meta.getSize() - 1)) { + index = indexBuilder.fromStream(payloadStream); + } catch (FileNotFoundException notFound) { + log.error("{} not found while opening V2 offloaded ledger {}", indexKey, ledgerId); + throw new BKException.BKNoSuchLedgerExistsException(); + } + + BackedInputStream inputStream = new OpenDALBackedInputStream(storage, key, + index.getDataObjectLength(), readBufferSize, offloaderStats, managedLedgerName); + inputStreams.add(inputStream); + indices.add(index); + } + + return new OpenDALBackedReadHandleImplV2(ledgerId, indices, inputStreams, executor); + } + + @Override + public long lastAccessTimestamp() { + return lastAccessTimestamp; + } + + @Override + public int getPendingRead() { + return PENDING_READ_UPDATER.get(this); + } + + private static void skipFully(InputStream in, long bytes) throws IOException { + long remaining = bytes; + while (remaining > 0) { + long skipped = in.skip(remaining); + if (skipped > 0) { + remaining -= skipped; + continue; + } + if (in.read() < 0) { + throw new IOException("Unexpected 
EOF while skipping " + bytes + " bytes"); + } + remaining--; + } + } +} diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALManagedLedgerOffloader.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALManagedLedgerOffloader.java new file mode 100644 index 0000000000000..503926b85858d --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALManagedLedgerOffloader.java @@ -0,0 +1,679 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.bookkeeper.mledger.offload.opendal.impl; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.time.Duration; +import java.time.Instant; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.UUID; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import lombok.extern.slf4j.Slf4j; +import org.apache.bookkeeper.client.BKException; +import org.apache.bookkeeper.client.api.ReadHandle; +import org.apache.bookkeeper.common.util.OrderedScheduler; +import org.apache.bookkeeper.mledger.Entry; +import org.apache.bookkeeper.mledger.LedgerOffloader; +import org.apache.bookkeeper.mledger.LedgerOffloader.OffloadHandle; +import org.apache.bookkeeper.mledger.LedgerOffloader.OffloadHandle.OfferEntryResult; +import org.apache.bookkeeper.mledger.LedgerOffloaderStats; +import org.apache.bookkeeper.mledger.ManagedLedger; +import org.apache.bookkeeper.mledger.ManagedLedgerException; +import org.apache.bookkeeper.mledger.OffloadedLedgerMetadata; +import org.apache.bookkeeper.mledger.OffloadedLedgerMetadataConsumer; +import org.apache.bookkeeper.mledger.Position; +import org.apache.bookkeeper.mledger.PositionFactory; +import org.apache.bookkeeper.mledger.impl.EntryImpl; +import org.apache.bookkeeper.mledger.impl.OffloadSegmentInfoImpl; +import org.apache.bookkeeper.mledger.offload.jcloud.BlockAwareSegmentInputStream; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexBlock; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexBlock.IndexInputStream; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexBlockBuilder; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexBlockV2; +import 
org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexBlockV2Builder; +import org.apache.bookkeeper.mledger.offload.jcloud.impl.BlockAwareSegmentInputStreamImpl; +import org.apache.bookkeeper.mledger.offload.jcloud.impl.BufferedOffloadStream; +import org.apache.bookkeeper.mledger.offload.jcloud.impl.OffsetsCache; +import org.apache.bookkeeper.mledger.offload.jcloud.impl.StreamingDataBlockHeaderImpl; +import org.apache.bookkeeper.mledger.offload.opendal.provider.OpenDALOperatorProvider; +import org.apache.bookkeeper.mledger.offload.opendal.provider.OpenDALTieredStorageConfiguration; +import org.apache.bookkeeper.mledger.offload.opendal.storage.OpenDALStorage; +import org.apache.bookkeeper.mledger.proto.MLDataFormats; +import org.apache.pulsar.common.naming.TopicName; +import org.apache.pulsar.common.policies.data.OffloadPolicies; +import org.apache.pulsar.common.policies.data.OffloadPoliciesImpl; + +/** + * Tiered Storage Offloader backed by OpenDAL. + * + *

This implementation keeps the same object key naming and on-disk formats as the existing jcloud implementation.
+ */
+@Slf4j
+public class OpenDALManagedLedgerOffloader implements LedgerOffloader {
+
+    private static final String MANAGED_LEDGER_NAME = "ManagedLedgerName";
+
+    private final OrderedScheduler scheduler;
+    private final OrderedScheduler readExecutor;
+    private final OpenDALTieredStorageConfiguration config;
+    private final OffloadPolicies policies;
+    private final Map<String, String> userMetadata;
+    private final OpenDALOperatorProvider operatorProvider;
+    private final OffsetsCache entryOffsetsCache;
+    private final LedgerOffloaderStats offloaderStats;
+
+    // Streaming offload state: bytes currently buffered / bytes offered to the current segment.
+    private final AtomicLong bufferLength = new AtomicLong(0);
+    private final AtomicLong segmentLength = new AtomicLong(0);
+    private final long maxBufferLength;
+    private final ConcurrentLinkedQueue<Entry> offloadBuffer = new ConcurrentLinkedQueue<>();
+    private final Duration maxSegmentCloseTime;
+    private final long minSegmentCloseTimeMillis;
+    private final long segmentBeginTimeMillis;
+    private final long maxSegmentLength;
+    private final int streamingBlockSize;
+
+    private CompletableFuture<OffloadResult> offloadResult;
+    private volatile Position lastOfferedPosition = PositionFactory.LATEST;
+    private OffloadIndexBlockV2Builder streamingIndexBuilder;
+    private OffloadSegmentInfoImpl segmentInfo;
+    private OpenDALStorage streamingStorage;
+    private OutputStream streamingDataOut;
+    private String streamingDataBlockKey;
+    private String streamingDataIndexKey;
+
+    /**
+     * Validates {@code config} and creates an offloader from it.
+     *
+     * @throws IOException if thrown by {@code config.validate()}
+     */
+    public static OpenDALManagedLedgerOffloader create(OpenDALTieredStorageConfiguration config,
+                                                       Map<String, String> userMetadata,
+                                                       OrderedScheduler scheduler,
+                                                       OrderedScheduler readExecutor,
+                                                       LedgerOffloaderStats offloaderStats,
+                                                       OffsetsCache entryOffsetsCache,
+                                                       OpenDALOperatorProvider operatorProvider)
+            throws IOException {
+        config.validate();
+        return new OpenDALManagedLedgerOffloader(config, userMetadata, scheduler, readExecutor, offloaderStats,
+                entryOffsetsCache, operatorProvider);
+    }
+
+    OpenDALManagedLedgerOffloader(OpenDALTieredStorageConfiguration config,
+                                  Map<String, String> userMetadata,
+                                  OrderedScheduler scheduler,
+                                  OrderedScheduler readExecutor,
+                                  LedgerOffloaderStats offloaderStats,
+                                  OffsetsCache entryOffsetsCache,
+                                  OpenDALOperatorProvider operatorProvider) {
+        this.scheduler = scheduler;
+        this.readExecutor = readExecutor;
+        this.userMetadata = userMetadata != null ? userMetadata : Collections.emptyMap();
+        this.config = config;
+        Properties properties = new Properties();
+        properties.putAll(config.getConfigProperties());
+        this.policies = OffloadPoliciesImpl.create(properties);
+        this.entryOffsetsCache = entryOffsetsCache;
+        this.operatorProvider = operatorProvider;
+        this.offloaderStats = offloaderStats;
+        this.streamingBlockSize = config.getMinBlockSizeInBytes();
+        this.maxSegmentCloseTime = Duration.ofSeconds(config.getMaxSegmentTimeInSecond());
+        this.maxSegmentLength = config.getMaxSegmentSizeInBytes();
+        this.minSegmentCloseTimeMillis = Duration.ofSeconds(config.getMinSegmentTimeInSecond()).toMillis();
+        // The write buffer must be able to hold at least one streaming block.
+        this.maxBufferLength = Math.max(config.getWriteBufferSizeInBytes(), config.getMinBlockSizeInBytes());
+        this.segmentBeginTimeMillis = System.currentTimeMillis();
+        log.info("OpenDAL offloader created. driver={}, scheme={}, bucket={}, endpoint={}, region={}",
+                config.getDriver(), config.getScheme(), config.getBucket(), config.getServiceEndpoint(),
+                config.getRegion());
+    }
+
+    @Override
+    public String getOffloadDriverName() {
+        return config.getDriver();
+    }
+
+    @Override
+    public Map<String, String> getOffloadDriverMetadata() {
+        return config.getOffloadDriverMetadata();
+    }
+
+    /**
+     * Copies a closed, non-empty ledger to storage as one data object followed by one index object.
+     *
+     * <p>The work runs on the scheduler thread chosen by ledger id. On any failure both objects are
+     * deleted on a best-effort basis, the error is recorded in the stats and the returned future fails.
+     */
+    @Override
+    public CompletableFuture<Void> offload(ReadHandle readHandle, UUID uuid, Map<String, String> extraMetadata) {
+        final String managedLedgerName = extraMetadata != null ? extraMetadata.get(MANAGED_LEDGER_NAME) : null;
+        final String topicName = topicNameOf(managedLedgerName);
+
+        CompletableFuture<Void> promise = new CompletableFuture<>();
+        scheduler.chooseThread(readHandle.getId()).execute(() -> {
+            OpenDALStorage storage = new OpenDALStorage(operatorProvider, config.getOffloadDriverMetadata());
+            String dataKey = DataBlockUtils.dataBlockOffloadKey(readHandle.getId(), uuid);
+            String indexKey = DataBlockUtils.indexBlockOffloadKey(readHandle.getId(), uuid);
+
+            if (!readHandle.isClosed() || readHandle.getLastAddConfirmed() < 0) {
+                promise.completeExceptionally(
+                        new IllegalArgumentException("An empty or open ledger should never be offloaded"));
+                return;
+            }
+
+            long dataObjectLength = 0;
+            try {
+                OffloadIndexBlockBuilder indexBuilder = OffloadIndexBlockBuilder.create()
+                        .withLedgerMetadata(readHandle.getLedgerMetadata())
+                        .withDataBlockHeaderLength(BlockAwareSegmentInputStreamImpl.getHeaderSize());
+
+                long startEntry = 0;
+                int partId = 1;
+                long entryBytesWritten = 0;
+                try (OutputStream dataOut = storage.openOutputStream(dataKey)) {
+                    while (startEntry <= readHandle.getLastAddConfirmed()) {
+                        int blockSize = BlockAwareSegmentInputStreamImpl.calculateBlockSize(
+                                config.getMaxBlockSizeInBytes(), readHandle, startEntry, entryBytesWritten);
+
+                        try (BlockAwareSegmentInputStream blockStream = new BlockAwareSegmentInputStreamImpl(
+                                readHandle, startEntry, blockSize, this.offloaderStats, managedLedgerName)) {
+                            copyStream(blockStream, dataOut);
+                            indexBuilder.addBlock(startEntry, partId, blockSize);
+
+                            if (blockStream.getEndEntryId() != -1) {
+                                startEntry = blockStream.getEndEntryId() + 1;
+                            } else {
+                                // No entry could be read for this block; stop copying.
+                                break;
+                            }
+                            entryBytesWritten += blockStream.getBlockEntryBytesCount();
+                            partId++;
+                            dataObjectLength += blockSize;
+                            this.offloaderStats.recordOffloadBytes(topicName,
+                                    blockStream.getBlockEntryBytesCount());
+                        }
+                    }
+                }
+
+                try (OffloadIndexBlock index = indexBuilder.withDataObjectLength(dataObjectLength).build();
+                     IndexInputStream indexStream = index.toStream()) {
+                    byte[] indexBytes = readAllBytes(indexStream, indexStream.getStreamSize());
+                    Map<String, String> objectMetadata = new HashMap<>(userMetadata);
+                    objectMetadata.put("role", "index");
+                    if (extraMetadata != null) {
+                        objectMetadata.putAll(extraMetadata);
+                    }
+                    storage.writeBytes(indexKey, indexBytes, DataBlockUtils.withVersionInfo(objectMetadata));
+                }
+
+                promise.complete(null);
+            } catch (Throwable t) {
+                // Best-effort cleanup: a failed delete must not mask the original error.
+                try {
+                    storage.delete(dataKey);
+                } catch (Throwable t2) {
+                    log.warn("Failed to cleanup data object {}", dataKey, t2);
+                }
+                try {
+                    storage.delete(indexKey);
+                } catch (Throwable t2) {
+                    log.warn("Failed to cleanup index object {}", indexKey, t2);
+                }
+                this.offloaderStats.recordWriteToStorageError(topicName);
+                this.offloaderStats.recordOffloadError(topicName);
+                promise.completeExceptionally(t);
+            }
+        });
+        return promise;
+    }
+
+    /** Opens a read handle over the data/index objects written by {@link #offload}. */
+    @Override
+    public CompletableFuture<ReadHandle> readOffloaded(long ledgerId, UUID uid,
+                                                       Map<String, String> offloadDriverMetadata) {
+        CompletableFuture<ReadHandle> promise = new CompletableFuture<>();
+        String dataKey = DataBlockUtils.dataBlockOffloadKey(ledgerId, uid);
+        String indexKey = DataBlockUtils.indexBlockOffloadKey(ledgerId, uid);
+        String managedLedgerName = offloadDriverMetadata != null
+                ? offloadDriverMetadata.get(MANAGED_LEDGER_NAME)
+                : null;
+
+        readExecutor.chooseThread(ledgerId).execute(() -> {
+            try {
+                OpenDALStorage storage = new OpenDALStorage(operatorProvider,
+                        metadataOrEmpty(offloadDriverMetadata));
+                promise.complete(OpenDALBackedReadHandleImpl.open(readExecutor.chooseThread(ledgerId), storage,
+                        dataKey, indexKey, ledgerId, config.getReadBufferSizeInBytes(), this.offloaderStats,
+                        managedLedgerName, this.entryOffsetsCache));
+            } catch (BKException.BKNoSuchLedgerExistsException e) {
+                // Propagated without an error log, unlike other failures.
+                promise.completeExceptionally(e);
+            } catch (Throwable t) {
+                log.error("Failed readOffloaded ledger {}", ledgerId, t);
+                promise.completeExceptionally(t);
+            }
+        });
+        return promise;
+    }
+
+    /** Opens a read handle over all streaming-offload segments listed in {@code ledgerContext}. */
+    @Override
+    public CompletableFuture<ReadHandle> readOffloaded(long ledgerId,
+                                                       MLDataFormats.OffloadContext ledgerContext,
+                                                       Map<String, String> offloadDriverMetadata) {
+        CompletableFuture<ReadHandle> promise = new CompletableFuture<>();
+
+        List<String> keys = new LinkedList<>();
+        List<String> indexKeys = new LinkedList<>();
+        for (MLDataFormats.OffloadSegment seg : ledgerContext.getOffloadSegmentList()) {
+            UUID uuid = new UUID(seg.getUidMsb(), seg.getUidLsb());
+            keys.add(uuid.toString());
+            indexKeys.add(DataBlockUtils.indexBlockOffloadKey(uuid));
+        }
+
+        String managedLedgerName = offloadDriverMetadata != null
+                ? offloadDriverMetadata.get(MANAGED_LEDGER_NAME)
+                : null;
+        readExecutor.chooseThread(ledgerId).execute(() -> {
+            try {
+                OpenDALStorage storage = new OpenDALStorage(operatorProvider,
+                        metadataOrEmpty(offloadDriverMetadata));
+                promise.complete(OpenDALBackedReadHandleImplV2.open(readExecutor.chooseThread(ledgerId), storage,
+                        keys, indexKeys, ledgerId, config.getReadBufferSizeInBytes(), this.offloaderStats,
+                        managedLedgerName));
+            } catch (BKException.BKNoSuchLedgerExistsException e) {
+                promise.completeExceptionally(e);
+            } catch (Throwable t) {
+                log.error("Failed readOffloaded (V2) ledger {}", ledgerId, t);
+                promise.completeExceptionally(t);
+            }
+        });
+        return promise;
+    }
+
+    /**
+     * Starts the single streaming-offload segment of this offloader instance.
+     *
+     * <p>Opens the data object, starts the write loop on the scheduler and schedules a forced segment
+     * close after the configured maximum segment time. A second call fails with
+     * {@link IllegalStateException}.
+     */
+    @Override
+    public CompletableFuture<OffloadHandle> streamingOffload(ManagedLedger ml,
+                                                             UUID uuid,
+                                                             long beginLedger,
+                                                             long beginEntry,
+                                                             Map<String, String> driverMetadata) {
+        if (this.segmentInfo != null) {
+            CompletableFuture<OffloadHandle> result = new CompletableFuture<>();
+            result.completeExceptionally(new IllegalStateException("streamingOffload should only be called once"));
+            return result;
+        }
+
+        this.segmentInfo = new OffloadSegmentInfoImpl(uuid, beginLedger, beginEntry, config.getDriver(),
+                metadataOrEmpty(driverMetadata));
+        this.offloadResult = new CompletableFuture<>();
+
+        this.streamingIndexBuilder = OffloadIndexBlockV2Builder.create()
+                .withDataBlockHeaderLength(StreamingDataBlockHeaderImpl.getDataStartOffset());
+        this.streamingDataBlockKey = segmentInfo.uuid.toString();
+        this.streamingDataIndexKey = DataBlockUtils.indexBlockOffloadKey(segmentInfo.uuid);
+        this.streamingStorage = new OpenDALStorage(operatorProvider, config.getOffloadDriverMetadata());
+
+        try {
+            this.streamingDataOut = streamingStorage.openOutputStream(streamingDataBlockKey);
+        } catch (IOException e) {
+            CompletableFuture<OffloadHandle> result = new CompletableFuture<>();
+            result.completeExceptionally(e);
+            return result;
+        }
+
+        scheduler.chooseThread(segmentInfo).execute(() -> {
+            log.info("Start streaming offload segment: {}", segmentInfo);
+            streamingOffloadLoop(ml, 1, 0);
+        });
+        scheduler.schedule(this::closeSegment, maxSegmentCloseTime.toMillis(), TimeUnit.MILLISECONDS);
+
+        return CompletableFuture.completedFuture(new OffloadHandle() {
+            @Override
+            public Position lastOffered() {
+                return OpenDALManagedLedgerOffloader.this.lastOffered();
+            }
+
+            @Override
+            public CompletableFuture<Position> lastOfferedAsync() {
+                return CompletableFuture.completedFuture(lastOffered());
+            }
+
+            @Override
+            public OfferEntryResult offerEntry(Entry entry) {
+                return OpenDALManagedLedgerOffloader.this.offerEntry(entry);
+            }
+
+            @Override
+            public CompletableFuture<OfferEntryResult> offerEntryAsync(Entry entry) {
+                return CompletableFuture.completedFuture(offerEntry(entry));
+            }
+
+            @Override
+            public CompletableFuture<OffloadResult> getOffloadResultAsync() {
+                return OpenDALManagedLedgerOffloader.this.getOffloadResultAsync();
+            }
+
+            @Override
+            public boolean close() {
+                return OpenDALManagedLedgerOffloader.this.closeSegment();
+            }
+        });
+    }
+
+    /** Deletes the data and index objects of an offloaded ledger and records the outcome in the stats. */
+    @Override
+    public CompletableFuture<Void> deleteOffloaded(long ledgerId, UUID uid,
+                                                   Map<String, String> offloadDriverMetadata) {
+        CompletableFuture<Void> promise = new CompletableFuture<>();
+        String dataKey = DataBlockUtils.dataBlockOffloadKey(ledgerId, uid);
+        String indexKey = DataBlockUtils.indexBlockOffloadKey(ledgerId, uid);
+        final String managedLedgerName =
+                offloadDriverMetadata != null ? offloadDriverMetadata.get(MANAGED_LEDGER_NAME) : null;
+        final String topicName = topicNameOf(managedLedgerName);
+        scheduler.execute(() -> {
+            try {
+                OpenDALStorage storage = new OpenDALStorage(operatorProvider,
+                        metadataOrEmpty(offloadDriverMetadata));
+                storage.delete(dataKey);
+                storage.delete(indexKey);
+                promise.complete(null);
+            } catch (Throwable t) {
+                log.error("Failed delete offloaded objects for ledger {}", ledgerId, t);
+                promise.completeExceptionally(t);
+            }
+        });
+        return promise.whenComplete((__, t) -> this.offloaderStats.recordDeleteOffloadOps(topicName, t == null));
+    }
+
+    /** Deletes the data and index objects of a streaming-offload segment identified by {@code uid}. */
+    @Override
+    public CompletableFuture<Void> deleteOffloaded(UUID uid, Map<String, String> offloadDriverMetadata) {
+        CompletableFuture<Void> promise = new CompletableFuture<>();
+        String dataKey = uid.toString();
+        String indexKey = DataBlockUtils.indexBlockOffloadKey(uid);
+        scheduler.execute(() -> {
+            try {
+                OpenDALStorage storage = new OpenDALStorage(operatorProvider,
+                        metadataOrEmpty(offloadDriverMetadata));
+                storage.delete(dataKey);
+                storage.delete(indexKey);
+                promise.complete(null);
+            } catch (Throwable t) {
+                log.error("Failed delete offloaded objects for uuid {}", uid, t);
+                promise.completeExceptionally(t);
+            }
+        });
+        return promise;
+    }
+
+    @Override
+    public OffloadPolicies getOffloadPolicies() {
+        return policies;
+    }
+
+    @Override
+    public void close() {
+        // All shared resources are managed by the factory (e.g. OperatorCache, OffsetsCache).
+    }
+
+    /**
+     * Lists stored objects in batches of 100 and passes every object whose name parses to a ledger id
+     * to {@code consumer}, stopping early when the consumer returns {@code false}.
+     *
+     * @throws ManagedLedgerException if listing fails or the consumer throws
+     */
+    @Override
+    public void scanLedgers(OffloadedLedgerMetadataConsumer consumer,
+                            Map<String, String> offloadDriverMetadata) throws ManagedLedgerException {
+        int batchSize = 100;
+        String marker = null;
+        OpenDALStorage storage = new OpenDALStorage(operatorProvider, metadataOrEmpty(offloadDriverMetadata));
+        do {
+            OpenDALStorage.ListResult list;
+            try {
+                list = storage.list("", marker, batchSize);
+            } catch (IOException e) {
+                throw ManagedLedgerException.getManagedLedgerException(e);
+            }
+            for (OpenDALStorage.Item item : list.getItems()) {
+                String name = item.getPath();
+                Long ledgerId = DataBlockUtils.parseLedgerId(name);
+                if (ledgerId == null) {
+                    continue;
+                }
+                String contextUuid = DataBlockUtils.parseContextUuid(name, ledgerId);
+                Instant lastModified = item.getMetadata().getLastModified();
+                long lastModifiedMillis = lastModified != null ? lastModified.toEpochMilli() : 0;
+                OffloadedLedgerMetadata offloadedLedgerMetadata = OffloadedLedgerMetadata.builder()
+                        .name(name)
+                        .bucketName(config.getBucket())
+                        .uuid(contextUuid)
+                        .ledgerId(ledgerId)
+                        .lastModified(lastModifiedMillis)
+                        .size(item.getMetadata().getSize())
+                        .uri(null)
+                        .userMetadata(Collections.emptyMap())
+                        .build();
+                try {
+                    boolean canContinue = consumer.accept(offloadedLedgerMetadata);
+                    if (!canContinue) {
+                        log.info("Iteration stopped by the OffloadedLedgerMetadataConsumer");
+                        return;
+                    }
+                } catch (Exception err) {
+                    if (err instanceof InterruptedException) {
+                        Thread.currentThread().interrupt();
+                    }
+                    throw ManagedLedgerException.getManagedLedgerException(err);
+                }
+            }
+            marker = list.getNextMarker();
+        } while (marker != null);
+    }
+
+    /**
+     * One step of the streaming write loop: writes a block when enough data is buffered (or the segment
+     * is closed), finishes the segment when it is closed and drained, and otherwise retries in 100 ms.
+     */
+    private void streamingOffloadLoop(ManagedLedger ml, int partId, long dataObjectLength) {
+        if (offloadResult != null && offloadResult.isDone()) {
+            return;
+        }
+        if (segmentInfo == null) {
+            return;
+        }
+        if (segmentInfo.isClosed() && offloadBuffer.isEmpty()) {
+            buildIndexAndCompleteResult(dataObjectLength);
+            return;
+        }
+
+        if ((segmentInfo.isClosed() && !offloadBuffer.isEmpty()) || bufferLength.get() >= streamingBlockSize) {
+            List<Entry> entries = new LinkedList<>();
+            int blockEntrySize = 0;
+
+            Entry firstEntry = offloadBuffer.poll();
+            if (firstEntry == null) {
+                scheduler.chooseThread(segmentInfo).schedule(() -> streamingOffloadLoop(ml, partId, dataObjectLength),
+                        100, TimeUnit.MILLISECONDS);
+                return;
+            }
+            int firstEntrySize = firstEntry.getLength();
+            bufferLength.addAndGet(-firstEntrySize);
+            blockEntrySize += firstEntrySize;
+            entries.add(firstEntry);
+            long blockLedgerId = firstEntry.getLedgerId();
+            long blockEntryId = firstEntry.getEntryId();
+
+            // A block only holds entries of a single ledger.
+            while (!offloadBuffer.isEmpty()
+                    && offloadBuffer.peek().getLedgerId() == blockLedgerId
+                    && blockEntrySize <= streamingBlockSize) {
+                Entry entryInBlock = offloadBuffer.poll();
+                if (entryInBlock == null) {
+                    break;
+                }
+                int entrySize = entryInBlock.getLength();
+                bufferLength.addAndGet(-entrySize);
+                blockEntrySize += entrySize;
+                entries.add(entryInBlock);
+            }
+
+            int blockSize = BufferedOffloadStream.calculateBlockSize(
+                    streamingBlockSize, entries.size(), blockEntrySize);
+            buildBlockAndWrite(ml, blockSize, entries, blockLedgerId, blockEntryId, partId);
+            streamingOffloadLoop(ml, partId + 1, dataObjectLength + blockSize);
+        } else {
+            scheduler.chooseThread(segmentInfo).schedule(() -> streamingOffloadLoop(ml, partId, dataObjectLength),
+                    100, TimeUnit.MILLISECONDS);
+        }
+    }
+
+    /**
+     * Writes one block of entries to the streaming data object and records the block and its ledger
+     * metadata in the index builder. Any failure fails the whole streaming offload.
+     */
+    private void buildBlockAndWrite(ManagedLedger ml,
+                                    int blockSize,
+                                    List<Entry> entries,
+                                    long blockLedgerId,
+                                    long beginEntryId,
+                                    int partId) {
+        try (BufferedOffloadStream payloadStream = new BufferedOffloadStream(blockSize, entries,
+                blockLedgerId, beginEntryId)) {
+            copyStream(payloadStream, streamingDataOut);
+            streamingIndexBuilder.withDataBlockHeaderLength(StreamingDataBlockHeaderImpl.getDataStartOffset());
+            streamingIndexBuilder.addBlock(blockLedgerId, beginEntryId, partId, blockSize);
+
+            MLDataFormats.ManagedLedgerInfo.LedgerInfo ledgerInfo = ml.getLedgerInfo(blockLedgerId).get();
+            MLDataFormats.ManagedLedgerInfo.LedgerInfo.Builder ledgerInfoBuilder =
+                    MLDataFormats.ManagedLedgerInfo.LedgerInfo.newBuilder();
+            if (ledgerInfo != null) {
+                ledgerInfoBuilder.mergeFrom(ledgerInfo);
+            }
+            if (ledgerInfoBuilder.getEntries() == 0) {
+                ledgerInfoBuilder.setEntries(payloadStream.getEndEntryId() + 1);
+            }
+            streamingIndexBuilder.addLedgerMeta(blockLedgerId, ledgerInfoBuilder.build());
+        } catch (Throwable e) {
+            if (e instanceof InterruptedException) {
+                // Keep the interrupt visible to the executor thread instead of swallowing it.
+                Thread.currentThread().interrupt();
+            }
+            failStreamingOffload(e);
+        }
+    }
+
+    /** Closes the data object, writes the index object and completes the streaming offload result. */
+    private void buildIndexAndCompleteResult(long dataObjectLength) {
+        try {
+            if (streamingDataOut != null) {
+                streamingDataOut.close();
+                streamingDataOut = null;
+            }
+
+            streamingIndexBuilder.withDataObjectLength(dataObjectLength);
+            OffloadIndexBlockV2 index = streamingIndexBuilder.buildV2();
+            try (IndexInputStream indexStream = index.toStream()) {
+                byte[] indexBytes = readAllBytes(indexStream, indexStream.getStreamSize());
+                streamingStorage.writeBytes(streamingDataIndexKey, indexBytes,
+                        DataBlockUtils.withVersionInfo(userMetadata));
+            }
+
+            offloadResult.complete(segmentInfo.result());
+            log.info("Streaming offload segment completed {}", segmentInfo.result());
+        } catch (Throwable t) {
+            failStreamingOffload(t);
+        }
+    }
+
+    /**
+     * Fails the streaming offload once: closes the data stream, deletes the data and index objects on a
+     * best-effort basis and completes the result exceptionally. Does nothing if the result is already done.
+     */
+    private void failStreamingOffload(Throwable error) {
+        CompletableFuture<OffloadResult> result = offloadResult;
+        if (result != null && result.isDone()) {
+            return;
+        }
+        try {
+            if (streamingDataOut != null) {
+                streamingDataOut.close();
+                streamingDataOut = null;
+            }
+        } catch (Throwable t) {
+            log.warn("Failed to close streaming data output stream", t);
+        }
+        try {
+            if (streamingStorage != null && streamingDataBlockKey != null) {
+                streamingStorage.delete(streamingDataBlockKey);
+            }
+        } catch (Throwable t) {
+            log.warn("Failed to cleanup streaming data object {}", streamingDataBlockKey, t);
+        }
+        try {
+            if (streamingStorage != null && streamingDataIndexKey != null) {
+                streamingStorage.delete(streamingDataIndexKey);
+            }
+        } catch (Throwable t) {
+            log.warn("Failed to cleanup streaming index object {}", streamingDataIndexKey, t);
+        }
+
+        // The guard above tolerates a missing result future, so the completion must as well.
+        if (result != null) {
+            result.completeExceptionally(error);
+        }
+    }
+
+    private CompletableFuture<OffloadResult> getOffloadResultAsync() {
+        return this.offloadResult;
+    }
+
+    /**
+     * Buffers a copy of {@code entry} for the streaming write loop and closes the segment once it has
+     * reached the maximum size and the minimum segment time has elapsed.
+     */
+    private synchronized OfferEntryResult offerEntry(Entry entry) {
+        if (segmentInfo == null || segmentInfo.isClosed()) {
+            return OfferEntryResult.FAIL_SEGMENT_CLOSED;
+        }
+        if (maxBufferLength <= bufferLength.get()) {
+            return OfferEntryResult.FAIL_BUFFER_FULL;
+        }
+
+        EntryImpl entryImpl = EntryImpl.create(entry.getLedgerId(), entry.getEntryId(), entry.getDataBuffer());
+        offloadBuffer.add(entryImpl);
+        bufferLength.getAndAdd(entryImpl.getLength());
+        segmentLength.getAndAdd(entryImpl.getLength());
+        lastOfferedPosition = entryImpl.getPosition();
+        if (segmentLength.get() >= maxSegmentLength
+                && System.currentTimeMillis() - segmentBeginTimeMillis >= minSegmentCloseTimeMillis) {
+            closeSegment();
+        }
+        return OfferEntryResult.SUCCESS;
+    }
+
+    /**
+     * Closes the segment at the last offered position.
+     *
+     * @return {@code true} if this call closed the segment, {@code false} if there is no segment or it
+     *         was already closed
+     */
+    private synchronized boolean closeSegment() {
+        if (segmentInfo == null) {
+            return false;
+        }
+        boolean result = !segmentInfo.isClosed();
+        if (result) {
+            segmentInfo.closeSegment(lastOfferedPosition.getLedgerId(), lastOfferedPosition.getEntryId());
+        }
+        return result;
+    }
+
+    private Position lastOffered() {
+        return lastOfferedPosition;
+    }
+
+    /** Returns the given driver metadata, or an empty map when it is {@code null}. */
+    private static Map<String, String> metadataOrEmpty(Map<String, String> offloadDriverMetadata) {
+        return offloadDriverMetadata != null ? offloadDriverMetadata : Collections.emptyMap();
+    }
+
+    /** Returns the topic name used for stats, or {@code "unknown"} when no managed ledger name is given. */
+    private static String topicNameOf(String managedLedgerName) {
+        return managedLedgerName != null
+                ? TopicName.fromPersistenceNamingEncoding(managedLedgerName)
+                : "unknown";
+    }
+
+    /** Copies {@code in} to {@code out} until end of stream using a 64 KiB buffer; closes neither stream. */
+    private static void copyStream(InputStream in, OutputStream out) throws IOException {
+        byte[] buffer = new byte[1024 * 64];
+        int read;
+        while ((read = in.read(buffer)) >= 0) {
+            if (read == 0) {
+                continue;
+            }
+            out.write(buffer, 0, read);
+        }
+    }
+
+    /**
+     * Reads exactly {@code size} bytes from {@code in}.
+     *
+     * @throws IOException if {@code size} is negative or exceeds {@link Integer#MAX_VALUE}, or if the
+     *                     stream ends before {@code size} bytes were read
+     */
+    private static byte[] readAllBytes(InputStream in, long size) throws IOException {
+        if (size < 0) {
+            throw new IOException("Invalid stream size: " + size);
+        }
+        if (size > Integer.MAX_VALUE) {
+            throw new IOException("Stream too large: " + size);
+        }
+        byte[] out = new byte[(int) size];
+        int offset = 0;
+        while (offset < out.length) {
+            int read = in.read(out, offset, out.length - offset);
+            if (read < 0) {
+                break;
+            }
+            offset += read;
+        }
+        if (offset != out.length) {
+            throw new IOException("Unexpected EOF: expected " + out.length + " bytes, got " + offset);
+        }
+        return out;
+    }
+}
diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/package-info.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/package-info.java
new file mode 100644
index 0000000000000..eb26e27aed303
--- /dev/null
+++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/impl/package-info.java
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.bookkeeper.mledger.offload.opendal.impl;
diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/package-info.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/package-info.java
new file mode 100644
index 0000000000000..82b800e57e2fc
--- /dev/null
+++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/package-info.java
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.bookkeeper.mledger.offload.opendal;
diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/provider/OpenDALOperatorProvider.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/provider/OpenDALOperatorProvider.java
new file mode 100644
index 0000000000000..33cd3a54e606d
--- /dev/null
+++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/provider/OpenDALOperatorProvider.java
@@ -0,0 +1,252 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.bookkeeper.mledger.offload.opendal.provider;
+
+import java.util.HashMap;
+import java.util.Map;
+import lombok.RequiredArgsConstructor;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.opendal.Operator;
+import org.apache.opendal.ServiceConfig;
+
+/**
+ * Builds OpenDAL {@link Operator} instances for the configured offload scheme.
+ *
+ * <p>Bucket, region and service endpoint are taken from the per-ledger offload driver metadata when
+ * present and fall back to the offloader configuration otherwise. Operators are obtained through the
+ * {@link OperatorCache}, keyed by scheme, location and the resolved operator config.
+ */
+@RequiredArgsConstructor
+public class OpenDALOperatorProvider {
+
+    private static final String GCS_SERVICE_ACCOUNT_KEY_FILE = "gcsManagedLedgerOffloadServiceAccountKeyFile";
+    private static final String ENV_AZURE_STORAGE_ACCOUNT = "AZURE_STORAGE_ACCOUNT";
+    private static final String ENV_AZURE_STORAGE_ACCESS_KEY = "AZURE_STORAGE_ACCESS_KEY";
+
+    private final OpenDALTieredStorageConfiguration config;
+    private final OperatorCache operatorCache;
+
+    /**
+     * Returns the operator for the location described by the given offload driver metadata.
+     *
+     * @param offloadDriverMetadata per-ledger location metadata; a {@code null} map or blank fields fall
+     *                              back to the configured bucket, region and service endpoint
+     * @return the operator supplied by the cache for the resolved scheme, location and config
+     * @throws IllegalArgumentException if the configured scheme is not s3, gcs, azblob or memory
+     */
+    public Operator getOperator(Map<String, String> offloadDriverMetadata) {
+        Map<String, String> effective = effectiveLocation(offloadDriverMetadata);
+        String scheme = config.getScheme();
+
+        Map<String, String> operatorConfig = new HashMap<>();
+        if ("s3".equalsIgnoreCase(scheme)) {
+            operatorConfig.putAll(buildS3Config(effective));
+        } else if ("gcs".equalsIgnoreCase(scheme)) {
+            operatorConfig.putAll(buildGcsConfig(effective));
+        } else if ("azblob".equalsIgnoreCase(scheme)) {
+            operatorConfig.putAll(buildAzblobConfig(effective));
+        } else if ("memory".equalsIgnoreCase(scheme)) {
+            operatorConfig.putAll(ServiceConfig.Memory.builder().build().configMap());
+        } else {
+            throw new IllegalArgumentException("Unsupported OpenDAL scheme: " + scheme);
+        }
+
+        // Apply universal extra config passthrough last so that it can override defaults.
+        operatorConfig.putAll(config.getExtraConfig());
+
+        OperatorCacheKey cacheKey = OperatorCacheKey.of(
+                scheme,
+                effective.get(OpenDALTieredStorageConfiguration.METADATA_FIELD_BUCKET),
+                effective.get(OpenDALTieredStorageConfiguration.METADATA_FIELD_REGION),
+                effective.get(OpenDALTieredStorageConfiguration.METADATA_FIELD_ENDPOINT),
+                operatorConfig);
+
+        return operatorCache.getOrCreate(cacheKey, () -> Operator.of(scheme, operatorConfig));
+    }
+
+    /**
+     * Builds the S3 service config from the effective location plus the {@code s3ManagedLedgerOffload*}
+     * credential and role settings.
+     */
+    private Map<String, String> buildS3Config(Map<String, String> effectiveLocation) {
+        String bucket = effectiveLocation.get(OpenDALTieredStorageConfiguration.METADATA_FIELD_BUCKET);
+        String region = effectiveLocation.get(OpenDALTieredStorageConfiguration.METADATA_FIELD_REGION);
+        String endpoint = effectiveLocation.get(OpenDALTieredStorageConfiguration.METADATA_FIELD_ENDPOINT);
+        boolean hasEndpoint = StringUtils.isNotBlank(endpoint);
+        Map<String, String> extra = config.getExtraConfig();
+        Map<String, String> properties = config.getConfigProperties();
+
+        ServiceConfig.S3.S3Builder builder = ServiceConfig.S3.builder().bucket(bucket);
+
+        if (StringUtils.isNotBlank(region)) {
+            builder.region(region);
+        }
+        if (hasEndpoint) {
+            builder.endpoint(endpoint);
+        }
+
+        // Align with tiered-storage-jcloud behavior: if a custom endpoint is configured,
+        // disable virtual host style by default to avoid DNS issues.
+        if (hasEndpoint && !extra.containsKey("enable_virtual_host_style")) {
+            builder.enableVirtualHostStyle(false);
+        }
+
+        String accessKeyId = properties.get("s3ManagedLedgerOffloadCredentialId");
+        String secretAccessKey = properties.get("s3ManagedLedgerOffloadCredentialSecret");
+        boolean hasExplicitCredentials =
+                StringUtils.isNotBlank(accessKeyId) && StringUtils.isNotBlank(secretAccessKey);
+        if (hasExplicitCredentials) {
+            builder.accessKeyId(accessKeyId).secretAccessKey(secretAccessKey);
+        }
+
+        String roleArn = properties.get("s3ManagedLedgerOffloadRole");
+        boolean hasRole = StringUtils.isNotBlank(roleArn);
+        if (hasRole) {
+            builder.roleArn(roleArn);
+        }
+        String roleSessionName = properties.get("s3ManagedLedgerOffloadRoleSessionName");
+        if (StringUtils.isNotBlank(roleSessionName)) {
+            builder.roleSessionName(roleSessionName);
+        }
+
+        // Keep behavior compatible with the existing S3 integration tests that use a mock endpoint
+        // without credentials.
+        // IMPORTANT: do NOT default to anonymous for real AWS S3 (endpoint not explicitly set).
+        String driver = config.getDriver();
+        boolean hasEnvAwsCredentials = StringUtils.isNotBlank(System.getenv("AWS_ACCESS_KEY_ID"))
+                && StringUtils.isNotBlank(System.getenv("AWS_SECRET_ACCESS_KEY"));
+        boolean shouldDefaultAllowAnonymous = "aws-s3".equalsIgnoreCase(driver)
+                && hasEndpoint
+                && !hasExplicitCredentials
+                && !hasRole
+                && !hasEnvAwsCredentials
+                && !extra.containsKey("allow_anonymous");
+        if (shouldDefaultAllowAnonymous) {
+            builder.allowAnonymous(true);
+        }
+
+        return builder.build().configMap();
+    }
+
+    /**
+     * Builds the GCS service config. Unless the extra config carries explicit credential keys, the service
+     * account key file is used, or anonymous emulator defaults for plain {@code http://} endpoints.
+     */
+    private Map<String, String> buildGcsConfig(Map<String, String> effectiveLocation) {
+        String bucket = effectiveLocation.get(OpenDALTieredStorageConfiguration.METADATA_FIELD_BUCKET);
+        String endpoint = effectiveLocation.get(OpenDALTieredStorageConfiguration.METADATA_FIELD_ENDPOINT);
+
+        ServiceConfig.Gcs.GcsBuilder builder = ServiceConfig.Gcs.builder().bucket(bucket);
+        if (StringUtils.isNotBlank(endpoint)) {
+            builder.endpoint(endpoint);
+        }
+
+        // Keep compatibility with tiered-storage-jcloud: allow providing the service account key via file path.
+        // If users explicitly pass credential settings via managedLedgerOffloadExtraConfig*, prefer those.
+        Map<String, String> extra = config.getExtraConfig();
+        boolean hasExplicitCredential = extra.containsKey("credential")
+                || extra.containsKey("credential_path")
+                || extra.containsKey("service_account")
+                || extra.containsKey("token");
+        boolean isHttpEndpoint = endpoint != null && endpoint.regionMatches(true, 0, "http://", 0, "http://".length());
+        String keyFilePath = StringUtils.trimToNull(config.getConfigProperties().get(GCS_SERVICE_ACCOUNT_KEY_FILE));
+        if (!hasExplicitCredential) {
+            if (keyFilePath != null) {
+                builder.credentialPath(keyFilePath);
+            } else if (isHttpEndpoint) {
+                // Emulator-friendly defaults:
+                // - Fake GCS server typically does not require auth.
+                // - Disable local config load / VM metadata to avoid slow fallbacks when running in containers.
+                if (!extra.containsKey("allow_anonymous")) {
+                    builder.allowAnonymous(true);
+                }
+                if (!extra.containsKey("disable_config_load")) {
+                    builder.disableConfigLoad(true);
+                }
+                if (!extra.containsKey("disable_vm_metadata")) {
+                    builder.disableVmMetadata(true);
+                }
+            }
+        }
+        return builder.build().configMap();
+    }
+
+    /**
+     * Builds the Azure Blob service config; account name and key default to the
+     * {@code AZURE_STORAGE_ACCOUNT} / {@code AZURE_STORAGE_ACCESS_KEY} environment variables.
+     */
+    private Map<String, String> buildAzblobConfig(Map<String, String> effectiveLocation) {
+        // In OpenDAL, azblob uses "container" instead of "bucket".
+        String container = effectiveLocation.get(OpenDALTieredStorageConfiguration.METADATA_FIELD_BUCKET);
+        String endpoint = effectiveLocation.get(OpenDALTieredStorageConfiguration.METADATA_FIELD_ENDPOINT);
+
+        ServiceConfig.Azblob.AzblobBuilder builder = ServiceConfig.Azblob.builder().container(container);
+        if (StringUtils.isNotBlank(endpoint)) {
+            builder.endpoint(endpoint);
+        }
+
+        // Keep compatibility with tiered-storage-jcloud: by default Azure credentials are sourced from env vars.
+        // Users can override by passing managedLedgerOffloadExtraConfigaccountName/accountKey/sasToken, etc.
+        Map<String, String> extra = config.getExtraConfig();
+        boolean hasAccountName = extra.containsKey("account_name");
+        boolean hasAccountKey = extra.containsKey("account_key");
+        boolean hasSasToken = extra.containsKey("sas_token");
+
+        String accountName = StringUtils.trimToNull(System.getenv(ENV_AZURE_STORAGE_ACCOUNT));
+        String accountKey = StringUtils.trimToNull(System.getenv(ENV_AZURE_STORAGE_ACCESS_KEY));
+        if (!hasAccountName && accountName != null) {
+            builder.accountName(accountName);
+        }
+        // Don't apply account key when SAS auth is explicitly configured.
+        if (!hasSasToken && !hasAccountKey && accountKey != null) {
+            builder.accountKey(accountKey);
+        }
+        return builder.build().configMap();
+    }
+
+    /**
+     * Resolves bucket, region and endpoint, preferring non-blank metadata values over the configuration.
+     * A {@code null} metadata map is treated as empty; missing values resolve to the empty string.
+     */
+    private Map<String, String> effectiveLocation(Map<String, String> offloadDriverMetadata) {
+        Map<String, String> effective = new HashMap<>();
+        putEffective(effective, offloadDriverMetadata,
+                OpenDALTieredStorageConfiguration.METADATA_FIELD_BUCKET, config.getBucket());
+        putEffective(effective, offloadDriverMetadata,
+                OpenDALTieredStorageConfiguration.METADATA_FIELD_REGION, config.getRegion());
+        putEffective(effective, offloadDriverMetadata,
+                OpenDALTieredStorageConfiguration.METADATA_FIELD_ENDPOINT, config.getServiceEndpoint());
+        return effective;
+    }
+
+    /** Stores the metadata value for {@code field} when it is non-blank, otherwise the configured fallback. */
+    private static void putEffective(Map<String, String> target, Map<String, String> metadata,
+                                     String field, String fallback) {
+        String fromMetadata = metadata != null ? metadata.get(field) : null;
+        target.put(field, firstNonBlank(fromMetadata, fallback));
+    }
+
+    /** Returns {@code first} when it is non-blank, otherwise {@code second} with {@code null} mapped to "". */
+    private static String firstNonBlank(String first, String second) {
+        if (StringUtils.isNotBlank(first)) {
+            return first;
+        }
+        return StringUtils.defaultString(second);
+    }
+}
diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/provider/OpenDALTieredStorageConfiguration.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/provider/OpenDALTieredStorageConfiguration.java
new file mode 100644
index 0000000000000..d0321559a0f4b
--- /dev/null
+++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/provider/OpenDALTieredStorageConfiguration.java
@@ -0,0 +1,304 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bookkeeper.mledger.offload.opendal.provider; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Objects; +import java.util.Properties; +import java.util.TreeMap; +import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; +import org.apache.pulsar.common.policies.data.OffloadPoliciesImpl; + +/** + * A configuration wrapper for OpenDAL based tiered storage offloaders. + * + *

The goal is to keep the same semantics as the current jcloud implementation: + * resolve both "new" (managedLedgerOffload*) and legacy (s3ManagedLedgerOffload* and gcsManagedLedgerOffload*) keys. + */ +public class OpenDALTieredStorageConfiguration { + + public static final String BLOB_STORE_PROVIDER_KEY = "managedLedgerOffloadDriver"; + public static final String METADATA_FIELD_BUCKET = "bucket"; + public static final String METADATA_FIELD_REGION = "region"; + public static final String METADATA_FIELD_ENDPOINT = "serviceEndpoint"; + public static final String METADATA_FIELD_MAX_BLOCK_SIZE = "maxBlockSizeInBytes"; + public static final String METADATA_FIELD_MIN_BLOCK_SIZE = "minBlockSizeInBytes"; + public static final String METADATA_FIELD_READ_BUFFER_SIZE = "readBufferSizeInBytes"; + public static final String METADATA_FIELD_WRITE_BUFFER_SIZE = "writeBufferSizeInBytes"; + + public static final String OFFLOADER_PROPERTY_PREFIX = "managedLedgerOffload"; + public static final String MAX_OFFLOAD_SEGMENT_ROLLOVER_TIME_SEC = "maxOffloadSegmentRolloverTimeInSeconds"; + public static final String MIN_OFFLOAD_SEGMENT_ROLLOVER_TIME_SEC = "minOffloadSegmentRolloverTimeInSeconds"; + public static final long DEFAULT_MAX_SEGMENT_TIME_IN_SECOND = 600; + public static final long DEFAULT_MIN_SEGMENT_TIME_IN_SECOND = 0; + public static final String MAX_OFFLOAD_SEGMENT_SIZE_IN_BYTES = "maxOffloadSegmentSizeInBytes"; + public static final long DEFAULT_MAX_SEGMENT_SIZE_IN_BYTES = 1024L * 1024 * 1024; + public static final String EXTRA_CONFIG_PREFIX = OffloadPoliciesImpl.EXTRA_CONFIG_PREFIX; + + private static final int MB = 1024 * 1024; + + private final Map configProperties; + + public static OpenDALTieredStorageConfiguration create(Properties props) { + Map map = props.entrySet().stream() + .collect(Collectors.toMap(e -> e.getKey().toString(), e -> e.getValue().toString())); + return new OpenDALTieredStorageConfiguration(map); + } + + public static OpenDALTieredStorageConfiguration 
create(Map props) { + return new OpenDALTieredStorageConfiguration(props); + } + + public OpenDALTieredStorageConfiguration(Map configProperties) { + this.configProperties = new HashMap<>(Objects.requireNonNull(configProperties, "configProperties")); + } + + public Map getConfigProperties() { + return new HashMap<>(configProperties); + } + + public String getDriver() { + // Keep the same behavior as tiered-storage-jcloud: default to aws-s3 if not explicitly set. + return configProperties.getOrDefault(BLOB_STORE_PROVIDER_KEY, "aws-s3"); + } + + public String getScheme() { + String driver = getDriver(); + if ("aws-s3".equalsIgnoreCase(driver) + || "s3".equalsIgnoreCase(driver) + || "aliyun-oss".equalsIgnoreCase(driver)) { + return "s3"; + } + if ("google-cloud-storage".equalsIgnoreCase(driver)) { + return "gcs"; + } + if ("azureblob".equalsIgnoreCase(driver)) { + return "azblob"; + } + if ("transient".equalsIgnoreCase(driver)) { + return "memory"; + } + throw new IllegalArgumentException("Unsupported OpenDAL offload driver: " + driver); + } + + public String getBucket() { + for (String key : getKeys(METADATA_FIELD_BUCKET)) { + if (configProperties.containsKey(key)) { + return configProperties.get(key); + } + } + return null; + } + + public String getRegion() { + for (String key : getKeys(METADATA_FIELD_REGION)) { + if (configProperties.containsKey(key)) { + return configProperties.get(key); + } + } + return null; + } + + public String getServiceEndpoint() { + for (String key : getKeys(METADATA_FIELD_ENDPOINT)) { + if (configProperties.containsKey(key)) { + return configProperties.get(key); + } + } + return null; + } + + public long getMaxSegmentTimeInSecond() { + Long value = getLongFromKeys( + MAX_OFFLOAD_SEGMENT_ROLLOVER_TIME_SEC, + getKeyName(MAX_OFFLOAD_SEGMENT_ROLLOVER_TIME_SEC), + EXTRA_CONFIG_PREFIX + MAX_OFFLOAD_SEGMENT_ROLLOVER_TIME_SEC); + return value != null ? 
value : DEFAULT_MAX_SEGMENT_TIME_IN_SECOND; + } + + public long getMinSegmentTimeInSecond() { + Long value = getLongFromKeys( + MIN_OFFLOAD_SEGMENT_ROLLOVER_TIME_SEC, + getKeyName(MIN_OFFLOAD_SEGMENT_ROLLOVER_TIME_SEC), + EXTRA_CONFIG_PREFIX + MIN_OFFLOAD_SEGMENT_ROLLOVER_TIME_SEC); + return value != null ? value : DEFAULT_MIN_SEGMENT_TIME_IN_SECOND; + } + + public long getMaxSegmentSizeInBytes() { + Long value = getLongFromKeys( + MAX_OFFLOAD_SEGMENT_SIZE_IN_BYTES, + getKeyName(MAX_OFFLOAD_SEGMENT_SIZE_IN_BYTES), + EXTRA_CONFIG_PREFIX + MAX_OFFLOAD_SEGMENT_SIZE_IN_BYTES); + return value != null ? value : DEFAULT_MAX_SEGMENT_SIZE_IN_BYTES; + } + + public int getMaxBlockSizeInBytes() { + for (String key : getKeys(METADATA_FIELD_MAX_BLOCK_SIZE)) { + if (configProperties.containsKey(key)) { + return Integer.parseInt(configProperties.get(key)); + } + } + return 64 * MB; + } + + public int getMinBlockSizeInBytes() { + for (String key : getKeys(METADATA_FIELD_MIN_BLOCK_SIZE)) { + if (configProperties.containsKey(key)) { + return Integer.parseInt(configProperties.get(key)); + } + } + return 5 * MB; + } + + public int getReadBufferSizeInBytes() { + for (String key : getKeys(METADATA_FIELD_READ_BUFFER_SIZE)) { + if (configProperties.containsKey(key)) { + return Integer.parseInt(configProperties.get(key)); + } + } + return MB; + } + + public int getWriteBufferSizeInBytes() { + for (String key : getKeys(METADATA_FIELD_WRITE_BUFFER_SIZE)) { + if (configProperties.containsKey(key)) { + return Integer.parseInt(configProperties.get(key)); + } + } + return 10 * MB; + } + + public Map getOffloadDriverMetadata() { + Map metadata = new HashMap<>(); + metadata.put(BLOB_STORE_PROVIDER_KEY, getDriver()); + metadata.put(METADATA_FIELD_BUCKET, StringUtils.defaultString(getBucket())); + metadata.put(METADATA_FIELD_REGION, StringUtils.defaultString(getRegion())); + metadata.put(METADATA_FIELD_ENDPOINT, StringUtils.defaultString(getServiceEndpoint())); + return metadata; + } + + public Map 
getExtraConfig() { + // Keep stable ordering for hashing/logging. + Map extra = new TreeMap<>(); + configProperties.forEach((key, value) -> { + if (!key.startsWith(EXTRA_CONFIG_PREFIX)) { + return; + } + String raw = key.substring(EXTRA_CONFIG_PREFIX.length()); + if (raw.isEmpty()) { + return; + } + extra.put(normalizeExtraConfigKey(raw), value); + }); + return Collections.unmodifiableMap(extra); + } + + public void validate() throws IOException { + String driver = getDriver(); + String bucket = getBucket(); + String region = getRegion(); + String endpoint = getServiceEndpoint(); + + if (StringUtils.isBlank(bucket) + && !"filesystem".equalsIgnoreCase(driver) + && !"transient".equalsIgnoreCase(driver)) { + throw new IOException("Bucket cannot be empty for driver " + driver + " offload"); + } + + if ("aws-s3".equalsIgnoreCase(driver)) { + if (StringUtils.isBlank(region) && StringUtils.isBlank(endpoint)) { + throw new IOException("Either Region or ServiceEndpoint must be specified for aws-s3 offload"); + } + } else if ("s3".equalsIgnoreCase(driver) || "aliyun-oss".equalsIgnoreCase(driver)) { + if (StringUtils.isBlank(endpoint)) { + throw new IOException("ServiceEndpoint must be specified for driver " + driver + " offload"); + } + } + + if (getMaxBlockSizeInBytes() < 5 * MB) { + throw new IOException("managedLedgerOffloadMaxBlockSizeInBytes cannot be less than 5MB"); + } + } + + List getKeys(String property) { + String backwardCompatible = getBackwardCompatibleKey(property); + String modern = getKeyName(property); + if (StringUtils.isBlank(backwardCompatible)) { + return List.of(modern); + } + return List.of(backwardCompatible, modern); + } + + private String getKeyName(String property) { + return OFFLOADER_PROPERTY_PREFIX + StringUtils.capitalize(property); + } + + private String getBackwardCompatibleKey(String property) { + String driver = getDriver(); + if ("aws-s3".equalsIgnoreCase(driver) + || "s3".equalsIgnoreCase(driver) + || 
"aliyun-oss".equalsIgnoreCase(driver)) { + return "s3ManagedLedgerOffload" + StringUtils.capitalize(property); + } + if ("google-cloud-storage".equalsIgnoreCase(driver)) { + return "gcsManagedLedgerOffload" + StringUtils.capitalize(property); + } + return null; + } + + private Long getLongFromKeys(String... keys) { + for (String key : keys) { + String value = configProperties.get(key); + if (value == null) { + continue; + } + return Long.parseLong(value); + } + return null; + } + + static String normalizeExtraConfigKey(String rawKeySuffix) { + // Some users prefer camelCase keys in broker.conf, while OpenDAL configs are snake_case. + // We accept both: + // - if key already contains '_' assume it's snake_case and keep as-is. + // - otherwise, convert camelCase/PascalCase to snake_case. + if (rawKeySuffix.indexOf('_') >= 0) { + return rawKeySuffix; + } + StringBuilder out = new StringBuilder(rawKeySuffix.length() + 8); + for (int i = 0; i < rawKeySuffix.length(); i++) { + char c = rawKeySuffix.charAt(i); + if (Character.isUpperCase(c)) { + if (i > 0) { + out.append('_'); + } + out.append(Character.toLowerCase(c)); + } else { + out.append(Character.toLowerCase(c)); + } + } + return out.toString().toLowerCase(Locale.ROOT); + } +} diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/provider/OperatorCache.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/provider/OperatorCache.java new file mode 100644 index 0000000000000..2af06cbb78d5d --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/provider/OperatorCache.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bookkeeper.mledger.offload.opendal.provider; + +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.function.Supplier; +import lombok.extern.slf4j.Slf4j; +import org.apache.opendal.Operator; + +@Slf4j +public class OperatorCache implements AutoCloseable { + + private final ConcurrentMap operators = new ConcurrentHashMap<>(); + + public Operator getOrCreate(OperatorCacheKey key, Supplier creator) { + Operator base = operators.computeIfAbsent(key, __ -> creator.get()); + return base.duplicate(); + } + + @Override + public void close() { + operators.forEach((key, operator) -> { + try { + operator.close(); + } catch (Throwable t) { + log.warn("Failed to close OpenDAL operator for {}", key, t); + } + }); + operators.clear(); + } +} + diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/provider/OperatorCacheKey.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/provider/OperatorCacheKey.java new file mode 100644 index 0000000000000..7c78ef976fded --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/provider/OperatorCacheKey.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
/**
 * Immutable cache key identifying one OpenDAL operator: scheme, bucket, region, endpoint and
 * a SHA-256 digest of the full operator configuration.
 *
 * <p>Only the digest of the configuration is kept, so the configuration values themselves
 * never appear in {@link #toString()}.
 */
final class OperatorCacheKey {
    private static final HexFormat HEX = HexFormat.of();

    private final String scheme;
    private final String bucket;
    private final String region;
    private final String endpoint;
    private final String configHash;

    /**
     * Creates a key for the given location and operator configuration.
     *
     * @param scheme         the OpenDAL scheme
     * @param bucket         the bucket (or container) name
     * @param region         the region
     * @param endpoint       the service endpoint
     * @param operatorConfig the complete operator configuration; only its digest is retained
     * @return the cache key
     * @throws NullPointerException if {@code operatorConfig} is {@code null} or has a null key
     */
    static OperatorCacheKey of(String scheme,
                               String bucket,
                               String region,
                               String endpoint,
                               Map<String, String> operatorConfig) {
        return new OperatorCacheKey(scheme, bucket, region, endpoint, sha256Hex(operatorConfig));
    }

    private OperatorCacheKey(String scheme, String bucket, String region, String endpoint, String configHash) {
        this.scheme = scheme;
        this.bucket = bucket;
        this.region = region;
        this.endpoint = endpoint;
        this.configHash = configHash;
    }

    /**
     * Hashes the configuration in key order. Every key and value is written as a presence
     * marker, a 4-byte length and the UTF-8 bytes, so that no two different maps produce the
     * same byte stream: a delimiter-based form such as {@code key=value\n} cannot tell
     * {@code {a: "b\nc=d"}} from {@code {a: "b", c: "d"}}, nor a null value from an empty one.
     */
    private static String sha256Hex(Map<String, String> operatorConfig) {
        Objects.requireNonNull(operatorConfig, "operatorConfig");
        Map<String, String> sorted = new TreeMap<>(operatorConfig);
        MessageDigest digest;
        try {
            digest = MessageDigest.getInstance("SHA-256");
        } catch (NoSuchAlgorithmException e) {
            // SHA-256 is mandatory in every Java runtime.
            throw new IllegalStateException("SHA-256 is not available", e);
        }
        for (Map.Entry<String, String> entry : sorted.entrySet()) {
            updateField(digest, entry.getKey());
            updateField(digest, entry.getValue());
        }
        return HEX.formatHex(digest.digest());
    }

    /** Feeds one length-prefixed field into the digest; {@code null} gets its own marker. */
    private static void updateField(MessageDigest digest, String field) {
        if (field == null) {
            digest.update((byte) 0);
            return;
        }
        byte[] bytes = field.getBytes(StandardCharsets.UTF_8);
        int length = bytes.length;
        digest.update((byte) 1);
        digest.update((byte) (length >>> 24));
        digest.update((byte) (length >>> 16));
        digest.update((byte) (length >>> 8));
        digest.update((byte) length);
        digest.update(bytes);
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof OperatorCacheKey)) {
            return false;
        }
        OperatorCacheKey that = (OperatorCacheKey) o;
        return Objects.equals(scheme, that.scheme)
                && Objects.equals(bucket, that.bucket)
                && Objects.equals(region, that.region)
                && Objects.equals(endpoint, that.endpoint)
                && Objects.equals(configHash, that.configHash);
    }

    @Override
    public int hashCode() {
        return Objects.hash(scheme, bucket, region, endpoint, configHash);
    }

    @Override
    public String toString() {
        return "OperatorCacheKey{"
                + "scheme='" + scheme + '\''
                + ", bucket='" + bucket + '\''
                + ", region='" + region + '\''
                + ", endpoint='" + endpoint + '\''
                + ", configHash='" + configHash + '\''
                + '}';
    }
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bookkeeper.mledger.offload.opendal.provider; diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/storage/OpenDALStorage.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/storage/OpenDALStorage.java new file mode 100644 index 0000000000000..150a66353797e --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/storage/OpenDALStorage.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.bookkeeper.mledger.offload.opendal.storage; + +import java.io.ByteArrayInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.time.Instant; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.stream.Collectors; +import lombok.Value; +import lombok.extern.slf4j.Slf4j; +import org.apache.bookkeeper.mledger.offload.opendal.provider.OpenDALOperatorProvider; +import org.apache.opendal.Entry; +import org.apache.opendal.ListOptions; +import org.apache.opendal.Metadata; +import org.apache.opendal.OpenDALException; +import org.apache.opendal.Operator; +import org.apache.opendal.WriteOptions; + +/** + * A thin wrapper around the OpenDAL Java binding. + * + *

All OpenDAL calls should be kept inside this class so that the offloader logic layer can stay clean + * (and provider backends can be changed without touching core offload algorithms). + */ +@Slf4j +public class OpenDALStorage { + + @Value + public static class ObjectMetadata { + long size; + Instant lastModified; + } + + @Value + public static class ListResult { + List items; + String nextMarker; + } + + @Value + public static class Item { + String path; + ObjectMetadata metadata; + } + + private final OpenDALOperatorProvider operatorProvider; + private final Map offloadDriverMetadata; + + public OpenDALStorage(OpenDALOperatorProvider operatorProvider, Map offloadDriverMetadata) { + this.operatorProvider = Objects.requireNonNull(operatorProvider, "operatorProvider"); + this.offloadDriverMetadata = Objects.requireNonNull(offloadDriverMetadata, "offloadDriverMetadata"); + } + + public OutputStream openOutputStream(String key) throws IOException { + Operator operator = operatorProvider.getOperator(offloadDriverMetadata); + try { + OutputStream out = operator.createOutputStream(key); + return new OutputStream() { + @Override + public void write(int b) throws IOException { + out.write(b); + } + + @Override + public void write(byte[] b, int off, int len) throws IOException { + out.write(b, off, len); + } + + @Override + public void flush() throws IOException { + out.flush(); + } + + @Override + public void close() throws IOException { + try { + out.close(); + } finally { + operator.close(); + } + } + }; + } catch (Throwable t) { + operator.close(); + throw toIOException("openOutputStream", key, t); + } + } + + public void writeBytes(String key, byte[] data, Map userMetadata) throws IOException { + Map safeUserMeta = (userMetadata != null) ? 
userMetadata : Collections.emptyMap(); + WriteOptions options = WriteOptions.builder() + .contentType("application/octet-stream") + .userMetadata(safeUserMeta) + .build(); + + try (Operator operator = operatorProvider.getOperator(offloadDriverMetadata)) { + operator.write(key, data, options); + } catch (Throwable t) { + throw toIOException("writeBytes", key, t); + } + } + + public InputStream readRange(String key, long startInclusive, long endInclusive) throws IOException { + if (startInclusive < 0 || endInclusive < startInclusive) { + throw new IllegalArgumentException("Invalid range: " + startInclusive + "-" + endInclusive); + } + long length = endInclusive - startInclusive + 1; + if (length > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Range too large: " + length); + } + try (Operator operator = operatorProvider.getOperator(offloadDriverMetadata)) { + byte[] data = operator.read(key, startInclusive, length); + return new ByteArrayInputStream(data); + } catch (Throwable t) { + throw toIOException("readRange", key, t); + } + } + + public ObjectMetadata stat(String key) throws IOException { + try (Operator operator = operatorProvider.getOperator(offloadDriverMetadata)) { + Metadata md = operator.stat(key); + return new ObjectMetadata(md.getContentLength(), md.getLastModified()); + } catch (Throwable t) { + throw toIOException("stat", key, t); + } + } + + public void delete(String key) throws IOException { + try (Operator operator = operatorProvider.getOperator(offloadDriverMetadata)) { + operator.delete(key); + } catch (Throwable t) { + throw toIOException("delete", key, t); + } + } + + public void delete(List keys) throws IOException { + for (String key : keys) { + delete(key); + } + } + + public ListResult list(String prefix, String marker, long limit) throws IOException { + if (limit <= 0) { + throw new IllegalArgumentException("limit must be > 0"); + } + try (Operator operator = operatorProvider.getOperator(offloadDriverMetadata)) { + 
ListOptions.ListOptionsBuilder options = ListOptions.builder().recursive(true).limit(limit); + if (marker != null && !marker.isEmpty()) { + options.startAfter(marker); + } + List entries = operator.list(prefix == null ? "" : prefix, options.build()); + List items = entries.stream() + .filter(e -> e.getMetadata() != null && e.getMetadata().isFile()) + .map(e -> new Item(e.getPath(), new ObjectMetadata( + e.getMetadata().getContentLength(), e.getMetadata().getLastModified()))) + .collect(Collectors.toList()); + + String nextMarker = null; + if (items.size() == limit) { + nextMarker = Optional.ofNullable(items.get(items.size() - 1).getPath()).orElse(null); + } + return new ListResult(items, nextMarker); + } catch (Throwable t) { + throw toIOException("list", prefix, t); + } + } + + private static IOException toIOException(String op, String key, Throwable t) { + if (t instanceof OpenDALException) { + OpenDALException e = (OpenDALException) t; + if (e.getCode() == OpenDALException.Code.NotFound) { + return new FileNotFoundException(op + " not found: " + key); + } + } + if (t instanceof IOException) { + return (IOException) t; + } + return new IOException("OpenDAL " + op + " failed for key " + key, t); + } +} diff --git a/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/storage/package-info.java b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/storage/package-info.java new file mode 100644 index 0000000000000..c675781e8d562 --- /dev/null +++ b/tiered-storage/opendal/src/main/java/org/apache/bookkeeper/mledger/offload/opendal/storage/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bookkeeper.mledger.offload.opendal.storage; diff --git a/tiered-storage/opendal/src/main/resources/META-INF/services/pulsar-offloader.yaml b/tiered-storage/opendal/src/main/resources/META-INF/services/pulsar-offloader.yaml new file mode 100644 index 0000000000000..0315080e3062d --- /dev/null +++ b/tiered-storage/opendal/src/main/resources/META-INF/services/pulsar-offloader.yaml @@ -0,0 +1,23 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +name: opendal +description: OpenDAL based offloader implementation +offloaderFactoryClass: org.apache.bookkeeper.mledger.offload.opendal.OpenDALLedgerOffloaderFactory + diff --git a/tiered-storage/opendal/src/main/resources/findbugsExclude.xml b/tiered-storage/opendal/src/main/resources/findbugsExclude.xml new file mode 100644 index 0000000000000..55ecc896f01a4 --- /dev/null +++ b/tiered-storage/opendal/src/main/resources/findbugsExclude.xml @@ -0,0 +1,23 @@ + + + + diff --git a/tiered-storage/opendal/src/test/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALBackedReadHandleImplTest.java b/tiered-storage/opendal/src/test/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALBackedReadHandleImplTest.java new file mode 100644 index 0000000000000..9d45fffa5f177 --- /dev/null +++ b/tiered-storage/opendal/src/test/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALBackedReadHandleImplTest.java @@ -0,0 +1,237 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.bookkeeper.mledger.offload.opendal.impl; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.testng.Assert.assertEquals; +import io.netty.buffer.ByteBuf; +import io.netty.buffer.ByteBufAllocator; +import java.io.EOFException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import org.apache.bookkeeper.client.LedgerMetadataBuilder; +import org.apache.bookkeeper.client.api.DigestType; +import org.apache.bookkeeper.client.api.LedgerEntries; +import org.apache.bookkeeper.client.api.LedgerEntry; +import org.apache.bookkeeper.client.api.LedgerMetadata; +import org.apache.bookkeeper.mledger.offload.jcloud.BackedInputStream; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexBlock; +import org.apache.bookkeeper.mledger.offload.jcloud.OffloadIndexEntry; +import org.apache.bookkeeper.mledger.offload.jcloud.impl.OffloadIndexEntryImpl; +import org.apache.bookkeeper.mledger.offload.jcloud.impl.OffsetsCache; +import org.apache.bookkeeper.net.BookieId; +import org.apache.commons.lang3.tuple.Pair; +import org.testng.annotations.AfterClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class OpenDALBackedReadHandleImplTest { + + private final OffsetsCache offsetsCache = new OffsetsCache(); + private final ScheduledExecutorService executor = Executors.newScheduledThreadPool(2); + + @AfterClass(alwaysRun = true) + public void tearDown() throws Exception { + executor.shutdown(); + executor.awaitTermination(5, TimeUnit.SECONDS); + offsetsCache.close(); + } + + private String getExpectedEntryContent(int entryId) { + return "Entry " + entryId; + } + + private Pair createReadHandle( + long ledgerId, int entries, boolean hasDirtyData) throws Exception { + List> 
offsets = new ArrayList<>(); + int totalLen = 0; + ByteBuf data = ByteBufAllocator.DEFAULT.heapBuffer(1024); + data.writeInt(0); + data.writerIndex(128); + for (int i = 0; i < entries; i++) { + if (hasDirtyData && i == 1) { + data.writeBytes("dirty data".getBytes(UTF_8)); + } + offsets.add(Pair.of(i, data.writerIndex())); + offsetsCache.put(ledgerId, i, data.writerIndex()); + byte[] entryContent = getExpectedEntryContent(i).getBytes(UTF_8); + totalLen += entryContent.length; + data.writeInt(entryContent.length); + data.writeLong(i); + data.writeBytes(entryContent); + } + + LedgerMetadata metadata = LedgerMetadataBuilder.create() + .withId(ledgerId) + .withEnsembleSize(1) + .withWriteQuorumSize(1) + .withAckQuorumSize(1) + .withDigestType(DigestType.CRC32C) + .withPassword("pwd".getBytes(UTF_8)) + .withClosedState() + .withLastEntryId(entries) + .withLength(totalLen) + .newEnsembleEntry(0L, Arrays.asList(BookieId.parse("127.0.0.1:3181"))) + .build(); + + BackedInputStreamImpl inputStream = new BackedInputStreamImpl(data); + TestOffloadIndexBlock index = new TestOffloadIndexBlock(metadata); + for (Pair pair : offsets) { + index.put(pair.getLeft(), OffloadIndexEntryImpl.of(pair.getLeft(), 0, pair.getRight(), 0)); + } + return Pair.of(new OpenDALBackedReadHandleImpl(ledgerId, index, inputStream, executor, offsetsCache), data); + } + + private static class TestOffloadIndexBlock implements OffloadIndexBlock { + private final LedgerMetadata ledgerMetadata; + private final java.util.NavigableMap entries = new java.util.TreeMap<>(); + + private TestOffloadIndexBlock(LedgerMetadata ledgerMetadata) { + this.ledgerMetadata = ledgerMetadata; + } + + private void put(long entryId, OffloadIndexEntry entry) { + entries.put(entryId, entry); + } + + @Override + public OffloadIndexEntry getIndexEntryForEntry(long messageEntryId) throws IOException { + return entries.floorEntry(messageEntryId).getValue(); + } + + @Override + public int getEntryCount() { + return entries.size(); + } + 
+ @Override + public LedgerMetadata getLedgerMetadata() { + return ledgerMetadata; + } + + @Override + public long getDataObjectLength() { + return 0; + } + + @Override + public long getDataBlockHeaderLength() { + return 0; + } + + @Override + public IndexInputStream toStream() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void close() { + // no-op + } + } + + private static class BackedInputStreamImpl extends BackedInputStream { + + private final ByteBuf data; + + private BackedInputStreamImpl(ByteBuf data) { + this.data = data; + } + + @Override + public void seek(long position) { + data.readerIndex((int) position); + } + + @Override + public void seekForward(long position) throws IOException { + data.readerIndex((int) position); + } + + @Override + public long getCurrentPosition() { + return data.readerIndex(); + } + + @Override + public int read() throws IOException { + if (data.readableBytes() == 0) { + throw new EOFException("The input-stream has no bytes to read"); + } + return data.readUnsignedByte(); // InputStream.read() must yield 0-255; a signed byte would turn 0xFF into -1 + } + + @Override + public int available() throws IOException { + return data.readableBytes(); + } + } + + @DataProvider + public Object[][] streamStartAt() { + return new Object[][]{ + {0, false}, + {1, false}, + {128, false}, + {0, true}, + {1, true}, + {128, true} + }; + } + + @Test(dataProvider = "streamStartAt") + public void testRead(int streamStartAt, boolean hasDirtyData) throws Exception { + int entryCount = 5; + Pair ledgerDataPair = createReadHandle(1, entryCount, hasDirtyData); + OpenDALBackedReadHandleImpl ledger = ledgerDataPair.getLeft(); + ByteBuf data = ledgerDataPair.getRight(); + data.readerIndex(streamStartAt); + + for (int i = 0; i < entryCount; i++) { + LedgerEntries entries = ledger.read(i, i); + assertEquals(new String(entries.iterator().next().getEntryBytes()), getExpectedEntryContent(i)); + } + + LedgerEntries entries1 = ledger.read(0, entryCount - 1); + Iterator iterator1 = entries1.iterator(); 
+ for (int i = 0; i < entryCount; i++) { + assertEquals(new String(iterator1.next().getEntryBytes()), getExpectedEntryContent(i)); + } + + LedgerEntries entries2 = ledger.read(0, entryCount - 2); + Iterator iterator2 = entries2.iterator(); + for (int i = 0; i < entryCount - 1; i++) { + assertEquals(new String(iterator2.next().getEntryBytes()), getExpectedEntryContent(i)); + } + + LedgerEntries entries3 = ledger.read(0, entryCount - 1); + Iterator iterator3 = entries3.iterator(); + for (int i = 0; i < entryCount; i++) { + assertEquals(new String(iterator3.next().getEntryBytes()), getExpectedEntryContent(i)); + } + + ledger.close(); + } +} diff --git a/tiered-storage/opendal/src/test/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALManagedLedgerOffloaderStreamingTest.java b/tiered-storage/opendal/src/test/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALManagedLedgerOffloaderStreamingTest.java new file mode 100644 index 0000000000000..5e6860d90eb07 --- /dev/null +++ b/tiered-storage/opendal/src/test/java/org/apache/bookkeeper/mledger/offload/opendal/impl/OpenDALManagedLedgerOffloaderStreamingTest.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.bookkeeper.mledger.offload.opendal.impl; + +import static org.testng.Assert.assertEquals; +import java.lang.reflect.Proxy; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Random; +import java.util.UUID; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeUnit; +import org.apache.bookkeeper.client.api.LedgerEntries; +import org.apache.bookkeeper.client.api.LedgerEntry; +import org.apache.bookkeeper.client.api.ReadHandle; +import org.apache.bookkeeper.common.util.OrderedScheduler; +import org.apache.bookkeeper.mledger.LedgerOffloader; +import org.apache.bookkeeper.mledger.LedgerOffloader.OffloadHandle; +import org.apache.bookkeeper.mledger.LedgerOffloaderStatsDisable; +import org.apache.bookkeeper.mledger.ManagedLedger; +import org.apache.bookkeeper.mledger.impl.EntryImpl; +import org.apache.bookkeeper.mledger.offload.jcloud.impl.OffsetsCache; +import org.apache.bookkeeper.mledger.offload.opendal.provider.OpenDALOperatorProvider; +import org.apache.bookkeeper.mledger.offload.opendal.provider.OpenDALTieredStorageConfiguration; +import org.apache.bookkeeper.mledger.offload.opendal.provider.OperatorCache; +import org.apache.bookkeeper.mledger.proto.MLDataFormats.ManagedLedgerInfo.LedgerInfo; +import org.apache.bookkeeper.mledger.proto.MLDataFormats.OffloadContext; +import org.apache.bookkeeper.mledger.proto.MLDataFormats.OffloadSegment; +import org.testng.annotations.Test; + +public class OpenDALManagedLedgerOffloaderStreamingTest { + + private static ManagedLedger createMockManagedLedger() { + return (ManagedLedger) Proxy.newProxyInstance( + ManagedLedger.class.getClassLoader(), + new Class[]{ManagedLedger.class}, + (proxy, method, args) -> { + if ("getLedgerInfo".equals(method.getName()) && args != null && args.length == 1) { + long ledgerId = (long) args[0]; + LedgerInfo ledgerInfo = LedgerInfo.newBuilder() + .setLedgerId(ledgerId) + .setSize(100) + 
.setEntries(100) + .build(); + return CompletableFuture.completedFuture(ledgerInfo); + } + if ("toString".equals(method.getName())) { + return "MockManagedLedger"; + } + if ("getName".equals(method.getName())) { + return "MockManagedLedger"; + } + throw new UnsupportedOperationException(method.toString()); + }); + } + + @Test + public void testStreamingOffloadWriteThenRead() throws Exception { + Random random = new Random(0); + List expected = new ArrayList<>(); + + OrderedScheduler scheduler = OrderedScheduler.newSchedulerBuilder() + .numThreads(2) + .name("opendal-offloader-test") + .build(); + OffsetsCache offsetsCache = new OffsetsCache(); + OperatorCache operatorCache = new OperatorCache(); + try { + HashMap props = new HashMap<>(); + props.put(OpenDALTieredStorageConfiguration.BLOB_STORE_PROVIDER_KEY, "transient"); + props.put(OpenDALTieredStorageConfiguration.MAX_OFFLOAD_SEGMENT_SIZE_IN_BYTES, "1000"); + props.put("managedLedgerOffloadMinBlockSizeInBytes", "1024"); + + OpenDALTieredStorageConfiguration config = OpenDALTieredStorageConfiguration.create(props); + OpenDALOperatorProvider operatorProvider = new OpenDALOperatorProvider(config, operatorCache); + + OpenDALManagedLedgerOffloader offloader = OpenDALManagedLedgerOffloader.create( + config, + new HashMap<>(), + scheduler, + scheduler, + LedgerOffloaderStatsDisable.INSTANCE, + offsetsCache, + operatorProvider); + + ManagedLedger managedLedger = createMockManagedLedger(); + UUID uuid = UUID.randomUUID(); + long ledgerId = 0; + + OffloadHandle offloadHandle = offloader.streamingOffload( + managedLedger, uuid, ledgerId, 0, config.getOffloadDriverMetadata()) + .get(5, TimeUnit.SECONDS); + + for (int i = 0; i < 10; i++) { + byte[] data = new byte[100]; + random.nextBytes(data); + expected.add(data.clone()); + EntryImpl entry = EntryImpl.create(ledgerId, i, data); + try { + assertEquals(offloadHandle.offerEntry(entry), OffloadHandle.OfferEntryResult.SUCCESS); + } finally { + entry.release(); + } + } + + 
offloadHandle.close(); + LedgerOffloader.OffloadResult offloadResult = offloadHandle.getOffloadResultAsync() + .get(30, TimeUnit.SECONDS); + assertEquals(offloadResult.endLedger, ledgerId); + assertEquals(offloadResult.endEntry, 9); + + OffloadContext offloadContext = OffloadContext.newBuilder() + .addOffloadSegment(OffloadSegment.newBuilder() + .setUidMsb(uuid.getMostSignificantBits()) + .setUidLsb(uuid.getLeastSignificantBits()) + .setComplete(true) + .setEndEntryId(9) + .build()) + .build(); + + ReadHandle readHandle = offloader.readOffloaded(ledgerId, offloadContext, config.getOffloadDriverMetadata()) + .get(5, TimeUnit.SECONDS); + try { + LedgerEntries ledgerEntries = readHandle.readAsync(0, 9).get(10, TimeUnit.SECONDS); + try { + for (LedgerEntry ledgerEntry : ledgerEntries) { + assertEquals(ledgerEntry.getEntryBytes(), expected.get((int) ledgerEntry.getEntryId())); + } + } finally { + ledgerEntries.close(); + } + } finally { + readHandle.close(); + } + } finally { + operatorCache.close(); + offsetsCache.close(); + scheduler.shutdownNow(); + } + } +} diff --git a/tiered-storage/opendal/src/test/java/org/apache/bookkeeper/mledger/offload/opendal/storage/OpenDALStorageTest.java b/tiered-storage/opendal/src/test/java/org/apache/bookkeeper/mledger/offload/opendal/storage/OpenDALStorageTest.java new file mode 100644 index 0000000000000..60db1f136c5ab --- /dev/null +++ b/tiered-storage/opendal/src/test/java/org/apache/bookkeeper/mledger/offload/opendal/storage/OpenDALStorageTest.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.bookkeeper.mledger.offload.opendal.storage; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; +import java.io.FileNotFoundException; +import java.io.InputStream; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import org.apache.bookkeeper.mledger.offload.opendal.provider.OpenDALOperatorProvider; +import org.apache.bookkeeper.mledger.offload.opendal.provider.OpenDALTieredStorageConfiguration; +import org.apache.bookkeeper.mledger.offload.opendal.provider.OperatorCache; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.Test; + +public class OpenDALStorageTest { + + private OperatorCache operatorCache; + + @AfterMethod(alwaysRun = true) + public void teardown() { + if (operatorCache != null) { + operatorCache.close(); + } + } + + @Test + public void testWriteReadStatListDeleteOnMemoryBackend() throws Exception { + Map props = new HashMap<>(); + props.put("managedLedgerOffloadDriver", "transient"); + + OpenDALTieredStorageConfiguration config = OpenDALTieredStorageConfiguration.create(props); + operatorCache = new OperatorCache(); + OpenDALOperatorProvider provider = new OpenDALOperatorProvider(config, operatorCache); + + OpenDALStorage storage = new OpenDALStorage(provider, Collections.emptyMap()); + + String key = "k1"; + byte[] payload = "hello-opendal".getBytes(UTF_8); + storage.writeBytes(key, payload, Map.of("role", "test")); + + OpenDALStorage.ObjectMetadata 
metadata = storage.stat(key); + assertEquals(metadata.getSize(), payload.length); + + try (InputStream in = storage.readRange(key, 0, payload.length - 1)) { + assertEquals(new String(in.readAllBytes(), UTF_8), "hello-opendal"); + } + + OpenDALStorage.ListResult list = storage.list("", null, 100); + assertTrue(list.getItems().stream().anyMatch(i -> key.equals(i.getPath()))); + storage.delete(key); + try { + storage.stat(key); + throw new AssertionError("stat() must fail with FileNotFoundException after delete()"); + } catch (FileNotFoundException expected) { + // ok: the object is gone, which is what delete() must guarantee + } + } +} diff --git a/tiered-storage/pom.xml b/tiered-storage/pom.xml index ce845693e0970..439f3d42a8301 100644 --- a/tiered-storage/pom.xml +++ b/tiered-storage/pom.xml @@ -33,11 +33,13 @@ ${project.version} + 0.48.2 jcloud file-system + opendal