Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions docs/api/datapipes/physicsnemo.datapipes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,29 @@ the ``Dataset`` is responsible for the threaded execution of ``Reader``s and
:members:
:show-inheritance:

MultiDataset
^^^^^^^^^^^^

The ``MultiDataset`` composes two or more ``Dataset`` instances behind a single
index space (concatenation). Each sub-dataset can have its own Reader and
transforms. Global indices are mapped to the owning sub-dataset and local index;
metadata is enriched with ``dataset_index`` so batches can identify the source.
Use ``MultiDataset`` when you want to train on multiple datasets with the same
DataLoader, optionally enforcing that all outputs share the same TensorDict keys
for collation. See :const:`physicsnemo.datapipes.multi_dataset.DATASET_INDEX_METADATA_KEY`
for the metadata key added to each sample.

Note that, to collate and stack outputs from different datasets correctly, you
can set ``output_strict=True`` in the constructor of a ``MultiDataset``. Upon
construction, it will load the first batch from every passed dataset and verify
that the tensordicts produced by each ``Reader`` and ``Transform`` pipeline
share the same keys. Because the exact collation details differ by dataset,
``MultiDataset`` performs no check stricter than this output-key consistency.

.. autoclass:: physicsnemo.datapipes.multi_dataset.MultiDataset
:members:
:show-inheritance:


Readers
^^^^^^^
Expand Down
3 changes: 3 additions & 0 deletions examples/cfd/darcy-multidataset/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
runs/
outputs/
output/
5 changes: 5 additions & 0 deletions examples/cfd/darcy-multidataset/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Darcy Flow with multiple datasets

This readme is a work in progress and will be updated.

Don't approve the PR until it's updated!
104 changes: 104 additions & 0 deletions examples/cfd/darcy-multidataset/benchmark_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Load and benchmark each dataset (numpy and hdf5) separately.
# Usage: python benchmark_datasets.py data.numpy_path=/path/to/npz data.pde_bench_darcy_flow_dir=/path/to/hdf5/dir

import time
from collections import defaultdict

import torch
import hydra
from omegaconf import DictConfig, OmegaConf

from physicsnemo import datapipes


def _bench_dataset(name: str, dataset, n_iters: int = 2, n_samples=None) -> None:
"""Run n_iters passes over the dataset (or first n_samples per pass) and report throughput."""
n = len(dataset)
if n == 0:
print(f" {name}: empty, skip")
return
count = n if n_samples is None else min(n_samples, n)

# Warmup
for i in range(min(3, count)):
data_dict, meta = dataset[i]

for key, val in data_dict.items():
print(f" Key {key} has shape {val.shape}")

# Accumulate per-key running stats over the full dataset
sums = defaultdict(lambda: 0.0)
sq_sums = defaultdict(lambda: 0.0)
counts = defaultdict(lambda: 0)

start = time.perf_counter()
for _ in range(n_iters):
for i in range(count):
data_dict, meta = dataset[i]
for key, val in data_dict.items():
val_f = val.float()
sums[key] += val_f.sum().item()
sq_sums[key] += (val_f**2).sum().item()
counts[key] += val_f.numel()
elapsed = time.perf_counter() - start

total = n_iters * count
rate = total / elapsed if elapsed > 0 else 0
print(
f" {name}: {total} loads in {elapsed:.3f}s -> {rate:.1f} samples/s (len={n})"
)

for key in sums:
mean = sums[key] / counts[key]
std = ((sq_sums[key] / counts[key]) - mean**2) ** 0.5
print(
f" {name}/{key}: mean={mean:.6g}, std={std:.6g} (over {counts[key]} elements)"
)


def _path_ok(path) -> bool:
"""True if path looks set (not OmegaConf missing placeholder)."""
if path is None:
return False
s = str(path).strip()
return s != "" and s != "???"


@hydra.main(version_base=None, config_path="./conf", config_name="config")
def main(cfg: DictConfig) -> None:
    """Instantiate each configured dataset and benchmark it in isolation.

    Iterates ``cfg.multi_dataset.datasets`` (the same list that would be
    composed into a ``MultiDataset``) and runs ``_bench_dataset`` on each
    entry separately, closing each dataset afterwards.
    """
    OmegaConf.resolve(cfg)
    print("Config (full):")
    print(OmegaConf.to_yaml(cfg))
    print()
    n_iters = 1
    # Optional cap on samples per pass; override via bench_n_samples=<int>.
    n_samples = getattr(cfg, "bench_n_samples", 300)

    print("Benchmarking individual datasets:\n")

    for i, ds_cfg in enumerate(cfg.multi_dataset.datasets):
        print(f"Benchmark Dataset {i}")
        ds = hydra.utils.instantiate(ds_cfg)
        # Fix: label each report with the dataset index instead of the
        # literal string "name", so the per-dataset output is identifiable.
        _bench_dataset(f"dataset_{i}", ds, n_iters=n_iters, n_samples=n_samples)
        ds.close()

    print("\nDone.")


if __name__ == "__main__":
    main()
75 changes: 75 additions & 0 deletions examples/cfd/darcy-multidataset/conf/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Darcy GeoTransolver multi-dataset example — main config.
# Compose dataloader (numpy + hdf5). Model and training configs added later.
#
# Run: python load_and_visualize_data.py (or train.py when implemented)
# Override paths: data.numpy_path=... data.pde_bench_darcy_flow_dir=...
#
# Dataset configs live in conf/datasets/*.yaml. PDEBench Darcy betas can be
# mixed by commenting in/out lines in multi_dataset.datasets below.

defaults:
- _self_
- dataloader/config
- model/transolver
# - model/geotransolver
- training/default
- datasets/numpy@darcy_fno
- datasets/hdf5_beta0.01@dataset_pdebench_0.01
- datasets/hdf5_beta0.1@dataset_pdebench_0.1
- datasets/hdf5_beta1.0@dataset_pdebench_1.0
- datasets/hdf5_beta10.0@dataset_pdebench_10.0
- datasets/hdf5_beta100.0@dataset_pdebench_100.0

target_size: [128, 128]


# Transolver spatial shape must match data target_size (e.g. [256, 256])
model:
structured_shape: ${target_size}

# Shared data paths (override from CLI)
data:
numpy_path: /lustre/fsw/portfolios/coreai/users/coreya/datasets/darcy_fix/example_data/piececonst_r421_N1024_smooth1.npz # path to .npz file or directory (Darcy: permeability, darcy)
# Directory containing 2D_DarcyFlow_beta*_Train.hdf5 (PDEBench). Used by datasets/hdf5_beta*.yaml.
pde_bench_darcy_flow_dir: /lustre/fsw/portfolios/coreai/projects/coreai_modulus_cae/datasets/pde_bench/2D/DarcyFlow


# MultiDataset: enable/disable PDEBench Darcy betas by commenting in or out lines below.
multi_dataset:
_target_: physicsnemo.datapipes.MultiDataset
datasets:
- ${darcy_fno}
- ${dataset_pdebench_0.01}
- ${dataset_pdebench_0.1}
- ${dataset_pdebench_1.0}
- ${dataset_pdebench_10.0}
- ${dataset_pdebench_100.0}
output_strict: true
29 changes: 29 additions & 0 deletions examples/cfd/darcy-multidataset/conf/dataloader/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# DataLoader: references root-level multi_dataset only.
# Dataset definitions live in conf/datasets/*.yaml; multi_dataset composes them in config.yaml.

_target_: physicsnemo.datapipes.DataLoader
batch_size: 36
shuffle: true
drop_last: false
prefetch_factor: 2
num_streams: 4
use_streams: false
collate_metadata: true

dataset: ${multi_dataset}
51 changes: 51 additions & 0 deletions examples/cfd/darcy-multidataset/conf/datasets/hdf5_beta0.01.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# PDEBench Darcy HDF5, beta=0.01. Path: data.pde_bench_darcy_flow_dir.

_target_: physicsnemo.datapipes.Dataset
reader:
_target_: physicsnemo.datapipes.readers.HDF5Reader
path: ${data.pde_bench_darcy_flow_dir}/2D_DarcyFlow_beta0.01_Train.hdf5
fields:
- "nu"
- "tensor"
file_pattern: "*.h5"
index_key: null
pin_memory: false
include_index_in_metadata: true
transforms:
- _target_: ${dp:Rename}
mapping:
nu: x
tensor: y
- _target_: ${dp:Reshape}
keys:
- y
shape: ${target_size}
- _target_: ${dp:Normalize}
input_keys:
- "x"
- "y"
method: mean_std
means:
x: 0.536271
y: 0.0106918
stds:
x: 0.449791
y: 0.0353361
device: auto
num_workers: 2
51 changes: 51 additions & 0 deletions examples/cfd/darcy-multidataset/conf/datasets/hdf5_beta0.1.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# PDEBench Darcy HDF5, beta=0.1. Path: data.pde_bench_darcy_flow_dir.

_target_: physicsnemo.datapipes.Dataset
reader:
_target_: physicsnemo.datapipes.readers.HDF5Reader
path: ${data.pde_bench_darcy_flow_dir}/2D_DarcyFlow_beta0.1_Train.hdf5
fields:
- "nu"
- "tensor"
file_pattern: "*.h5"
index_key: null
pin_memory: false
include_index_in_metadata: true
transforms:
- _target_: ${dp:Rename}
mapping:
nu: x
tensor: y
- _target_: ${dp:Reshape}
keys:
- y
shape: ${target_size}
- _target_: ${dp:Normalize}
input_keys:
- "x"
- "y"
method: mean_std
means:
x: 0.536271
y: 0.0244916
stds:
x: 0.449791
y: 0.0427034
device: auto
num_workers: 2
Loading
Loading