Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions docs/api/datapipes/physicsnemo.datapipes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,29 @@ the ``Dataset`` is responsible for the threaded execution of ``Reader``s and
:members:
:show-inheritance:

MultiDataset
^^^^^^^^^^^^

The ``MultiDataset`` composes two or more ``Dataset`` instances behind a single
index space (concatenation). Each sub-dataset can have its own Reader and
transforms. Global indices are mapped to the owning sub-dataset and local index;
metadata is enriched with ``dataset_index`` so batches can identify the source.
Use ``MultiDataset`` when you want to train on multiple datasets with the same
DataLoader, optionally enforcing that all outputs share the same TensorDict keys
for collation. See :const:`physicsnemo.datapipes.multi_dataset.DATASET_INDEX_METADATA_KEY`
for the metadata key added to each sample.

Note that, to collate and stack outputs from different datasets correctly, you
can set ``output_strict=True`` in the constructor of a ``MultiDataset``. Upon
construction, it will load the first batch from every passed dataset and verify
that the tensordicts produced by each ``Reader`` and ``Transform`` pipeline
share the same keys. Because the exact collation details differ by dataset,
``MultiDataset`` performs no check stricter than this output-key consistency.

.. autoclass:: physicsnemo.datapipes.multi_dataset.MultiDataset
:members:
:show-inheritance:


Readers
^^^^^^^
Expand Down
3 changes: 3 additions & 0 deletions examples/cfd/darcy-multidataset/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
runs/
outputs/
output/
5 changes: 5 additions & 0 deletions examples/cfd/darcy-multidataset/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Darcy Flow with multiple datasets

This readme is a work in progress and will be updated.

Don't approve the PR until it's updated!
104 changes: 104 additions & 0 deletions examples/cfd/darcy-multidataset/benchmark_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Load and benchmark each dataset (numpy and hdf5) separately.
# Usage: python benchmark_datasets.py data.numpy_path=/path/to/npz data.pde_bench_darcy_flow_dir=/path/to/hdf5/dir

import time
from collections import defaultdict

import torch
import hydra
from omegaconf import DictConfig, OmegaConf

from physicsnemo import datapipes


def _bench_dataset(name: str, dataset, n_iters: int = 2, n_samples=None) -> None:
"""Run n_iters passes over the dataset (or first n_samples per pass) and report throughput."""
n = len(dataset)
if n == 0:
print(f" {name}: empty, skip")
return
count = n if n_samples is None else min(n_samples, n)

# Warmup
for i in range(min(3, count)):
data_dict, meta = dataset[i]

for key, val in data_dict.items():
print(f" Key {key} has shape {val.shape}")

# Accumulate per-key running stats over the full dataset
sums = defaultdict(lambda: 0.0)
sq_sums = defaultdict(lambda: 0.0)
counts = defaultdict(lambda: 0)

start = time.perf_counter()
for _ in range(n_iters):
for i in range(count):
data_dict, meta = dataset[i]
for key, val in data_dict.items():
val_f = val.float()
sums[key] += val_f.sum().item()
sq_sums[key] += (val_f**2).sum().item()
counts[key] += val_f.numel()
elapsed = time.perf_counter() - start

total = n_iters * count
rate = total / elapsed if elapsed > 0 else 0
print(
f" {name}: {total} loads in {elapsed:.3f}s -> {rate:.1f} samples/s (len={n})"
)

for key in sums:
mean = sums[key] / counts[key]
std = ((sq_sums[key] / counts[key]) - mean**2) ** 0.5
print(
f" {name}/{key}: mean={mean:.6g}, std={std:.6g} (over {counts[key]} elements)"
)


def _path_ok(path) -> bool:
"""True if path looks set (not OmegaConf missing placeholder)."""
if path is None:
return False
s = str(path).strip()
return s != "" and s != "???"


@hydra.main(version_base=None, config_path="./conf", config_name="config")
def main(cfg: DictConfig) -> None:
    """Instantiate each configured dataset and benchmark it in isolation.

    Iterates ``cfg.multi_dataset.datasets`` (the same list that would be
    composed into a ``MultiDataset``) and runs ``_bench_dataset`` on each
    entry separately, closing each dataset afterwards.
    """
    OmegaConf.resolve(cfg)
    print("Config (full):")
    print(OmegaConf.to_yaml(cfg))
    print()
    n_iters = 1
    # Optional cap on samples per pass; override via bench_n_samples=<int>.
    n_samples = getattr(cfg, "bench_n_samples", 300)

    print("Benchmarking individual datasets:\n")

    for i, ds_cfg in enumerate(cfg.multi_dataset.datasets):
        print(f"Benchmark Dataset {i}")
        ds = hydra.utils.instantiate(ds_cfg)
        # Fix: label each report with the dataset index instead of the
        # literal string "name", so the per-dataset output is identifiable.
        _bench_dataset(f"dataset_{i}", ds, n_iters=n_iters, n_samples=n_samples)
        ds.close()

    print("\nDone.")


if __name__ == "__main__":
    main()
75 changes: 75 additions & 0 deletions examples/cfd/darcy-multidataset/conf/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Darcy GeoTransolver multi-dataset example — main config.
# Compose dataloader (numpy + hdf5). Model and training configs added later.
#
# Run: python load_and_visualize_data.py (or train.py when implemented)
# Override paths: data.numpy_path=... data.pde_bench_darcy_flow_dir=...
#
# Dataset configs live in conf/datasets/*.yaml. PDEBench Darcy betas can be
# mixed by commenting in/out lines in multi_dataset.datasets below.

defaults:
- _self_
- dataloader/config
- model/transolver
# - model/geotransolver
- training/default
- datasets/numpy@darcy_fno
- datasets/hdf5_beta0.01@dataset_pdebench_0.01
- datasets/hdf5_beta0.1@dataset_pdebench_0.1
- datasets/hdf5_beta1.0@dataset_pdebench_1.0
- datasets/hdf5_beta10.0@dataset_pdebench_10.0
- datasets/hdf5_beta100.0@dataset_pdebench_100.0

target_size: [128, 128]


# Transolver spatial shape must match data target_size (e.g. [256, 256])
model:
structured_shape: ${target_size}

# Shared data paths (override from CLI)
data:
numpy_path: /lustre/fsw/portfolios/coreai/users/coreya/datasets/darcy_fix/example_data/piececonst_r421_N1024_smooth1.npz # path to .npz file or directory (Darcy: permeability, darcy)
# Directory containing 2D_DarcyFlow_beta*_Train.hdf5 (PDEBench). Used by datasets/hdf5_beta*.yaml.
pde_bench_darcy_flow_dir: /lustre/fsw/portfolios/coreai/projects/coreai_modulus_cae/datasets/pde_bench/2D/DarcyFlow


# MultiDataset: enable/disable PDEBench Darcy betas by commenting in or out lines below.
multi_dataset:
_target_: physicsnemo.datapipes.MultiDataset
datasets:
- ${darcy_fno}
- ${dataset_pdebench_0.01}
- ${dataset_pdebench_0.1}
- ${dataset_pdebench_1.0}
- ${dataset_pdebench_10.0}
- ${dataset_pdebench_100.0}
output_strict: true
29 changes: 29 additions & 0 deletions examples/cfd/darcy-multidataset/conf/dataloader/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# DataLoader: references root-level multi_dataset only.
# Dataset definitions live in conf/datasets/*.yaml; multi_dataset composes them in config.yaml.

_target_: physicsnemo.datapipes.DataLoader
batch_size: 36
shuffle: true
drop_last: false
prefetch_factor: 2
num_streams: 4
use_streams: false
collate_metadata: true

dataset: ${multi_dataset}
51 changes: 51 additions & 0 deletions examples/cfd/darcy-multidataset/conf/datasets/hdf5_beta0.01.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# PDEBench Darcy HDF5, beta=0.01. Path: data.pde_bench_darcy_flow_dir.

_target_: physicsnemo.datapipes.Dataset
reader:
_target_: physicsnemo.datapipes.readers.HDF5Reader
path: ${data.pde_bench_darcy_flow_dir}/2D_DarcyFlow_beta0.01_Train.hdf5
fields:
- "nu"
- "tensor"
file_pattern: "*.h5"
index_key: null
pin_memory: false
include_index_in_metadata: true
transforms:
- _target_: ${dp:Rename}
mapping:
nu: x
tensor: y
- _target_: ${dp:Reshape}
keys:
- y
shape: ${target_size}
- _target_: ${dp:Normalize}
input_keys:
- "x"
- "y"
method: mean_std
means:
x: 0.536271
y: 0.0106918
stds:
x: 0.449791
y: 0.0353361
device: auto
num_workers: 2
51 changes: 51 additions & 0 deletions examples/cfd/darcy-multidataset/conf/datasets/hdf5_beta0.1.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# PDEBench Darcy HDF5, beta=0.1. Path: data.pde_bench_darcy_flow_dir.

_target_: physicsnemo.datapipes.Dataset
reader:
_target_: physicsnemo.datapipes.readers.HDF5Reader
path: ${data.pde_bench_darcy_flow_dir}/2D_DarcyFlow_beta0.1_Train.hdf5
fields:
- "nu"
- "tensor"
file_pattern: "*.h5"
index_key: null
pin_memory: false
include_index_in_metadata: true
transforms:
- _target_: ${dp:Rename}
mapping:
nu: x
tensor: y
- _target_: ${dp:Reshape}
keys:
- y
shape: ${target_size}
- _target_: ${dp:Normalize}
input_keys:
- "x"
- "y"
method: mean_std
means:
x: 0.536271
y: 0.0244916
stds:
x: 0.449791
y: 0.0427034
device: auto
num_workers: 2
Loading
Loading