Merged
23 changes: 23 additions & 0 deletions docs/api/datapipes/physicsnemo.datapipes.rst
@@ -153,6 +153,29 @@ the ``Dataset`` is responsible for the threaded execution of ``Reader``s and
:members:
:show-inheritance:

MultiDataset
^^^^^^^^^^^^

The ``MultiDataset`` composes two or more ``Dataset`` instances behind a single
index space (concatenation). Each sub-dataset can have its own Reader and
transforms. Global indices are mapped to the owning sub-dataset and local index;
metadata is enriched with ``dataset_index`` so batches can identify the source.
Use ``MultiDataset`` when you want to train on multiple datasets with the same
DataLoader, optionally enforcing that all outputs share the same TensorDict keys
for collation. See :const:`physicsnemo.datapipes.multi_dataset.DATASET_INDEX_METADATA_KEY`
for the metadata key added to each sample.
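
The global-to-local index mapping described above can be sketched as a standalone routine (an illustrative version using cumulative lengths and ``bisect``; hypothetical helper names, not the library's actual implementation):

```python
import bisect


def build_offsets(lengths):
    """Cumulative end offsets for each sub-dataset, e.g. [3, 5] for lengths [3, 2]."""
    offsets, total = [], 0
    for n in lengths:
        total += n
        offsets.append(total)
    return offsets


def map_global_index(offsets, global_idx):
    """Map a global index into (dataset_index, local_index)."""
    # bisect_right finds the first sub-dataset whose end offset exceeds global_idx
    dataset_index = bisect.bisect_right(offsets, global_idx)
    start = offsets[dataset_index - 1] if dataset_index > 0 else 0
    return dataset_index, global_idx - start
```

With two sub-datasets of lengths 3 and 2, global index 3 maps to the first element of the second dataset, which is exactly the concatenation semantics described above.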

To ensure that outputs from different datasets can be collated and stacked, set
``output_strict=True`` in the ``MultiDataset`` constructor. At construction
time, the first batch from every passed dataset is loaded, and the TensorDicts
produced by each ``Reader`` and ``Transform`` pipeline are checked for
consistent keys. Because the exact collation details differ by dataset,
``MultiDataset`` does not check anything more aggressive than output key
consistency.
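
The key-consistency check that ``output_strict=True`` performs can be illustrated with a minimal standalone sketch (plain dicts stand in for TensorDicts, and ``check_output_keys`` is a hypothetical name, not the library's internal function):

```python
def check_output_keys(samples):
    """Verify that every sample (one per sub-dataset) exposes the same keys.

    `samples` is a list of dict-like pipeline outputs, one from each
    sub-dataset's Reader/Transform pipeline.
    """
    reference = set(samples[0].keys())
    for i, sample in enumerate(samples[1:], start=1):
        keys = set(sample.keys())
        if keys != reference:
            raise ValueError(
                f"Dataset 0 produced keys {sorted(reference)} but dataset {i} "
                f"produced {sorted(keys)}; outputs cannot be collated."
            )
```

Raising at construction time surfaces a key mismatch immediately, rather than as an opaque collation failure deep inside a training loop.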

.. autoclass:: physicsnemo.datapipes.multi_dataset.MultiDataset
:members:
:show-inheritance:


Readers
^^^^^^^
2 changes: 2 additions & 0 deletions physicsnemo/datapipes/__init__.py
@@ -40,6 +40,7 @@
)
from physicsnemo.datapipes.dataloader import DataLoader
from physicsnemo.datapipes.dataset import Dataset
from physicsnemo.datapipes.multi_dataset import MultiDataset
from physicsnemo.datapipes.readers import (
HDF5Reader,
NumpyReader,
@@ -84,6 +85,7 @@
"TensorDict", # Re-export from tensordict
"Dataset",
"DataLoader",
"MultiDataset",
# Transforms - Base
"Transform",
"Compose",
4 changes: 3 additions & 1 deletion physicsnemo/datapipes/dataloader.py
@@ -168,7 +168,9 @@ def __len__(self) -> int:
int
Number of batches in the dataloader.
"""
n_samples = len(self.dataset)
n_samples = (
len(self.sampler) if hasattr(self.sampler, "__len__") else len(self.dataset)
)
if self.drop_last:
return n_samples // self.batch_size
return (n_samples + self.batch_size - 1) // self.batch_size
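
The length computation above amounts to floor division when ``drop_last`` is set and ceiling division otherwise; a minimal standalone version of the arithmetic:

```python
def num_batches(n_samples: int, batch_size: int, drop_last: bool) -> int:
    """Number of batches a loader yields over n_samples items."""
    if drop_last:
        # an incomplete final batch is discarded
        return n_samples // batch_size
    # ceiling division: the incomplete final batch still counts
    return (n_samples + batch_size - 1) // batch_size
```

Note that the diff also makes the count sampler-aware: when a sampler defines ``__len__``, its length is used as ``n_samples`` instead of the dataset's, so samplers that subsample or repeat the dataset report the correct number of batches.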