Skip to content
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Adds GLOBE model (`physicsnemo.experimental.models.globe.model.GLOBE`)
- Adds GLOBE AirFRANS example case (`examples/cfd/external_aerodynamics/globe/airfrans`)
- Adds distillation training recipe for CorrDiff (in `examples/weather/corrdiff/`)

### Changed

Expand Down
78 changes: 78 additions & 0 deletions examples/weather/corrdiff/conf/base/distill/base_all.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Base configuration shared by all CorrDiff distillation experiments.
# Experiment configs (e.g. cwb.yaml, gefs_hrrr.yaml) extend this via Hydra `defaults`.

# Hyperparameters
hp:
  training_duration: 5120000
  # Training duration based on the number of processed samples
  total_batch_size: 512
  # Total batch size
  batch_size_per_gpu: "auto"
  # Batch size per GPU ("auto" = derived from total_batch_size and world size)
  validation_batch_size_per_gpu: 1
  # Validation batch size per GPU
  scheduler_name: LambdaInverseSquareRootScheduler  # alternative: modulus_default
  scheduler:
    modulus_default:
      lr_decay: 1
      # LR decay rate
      lr_rampup: 0
      # Rampup for learning rate, in number of samples
    LambdaInverseSquareRootScheduler:
      warm_up_steps: 0
      decay_steps: 7000
    LambdaLinearScheduler:
      warm_up_steps: [0]
      # Warm-up length in iterations; 0 disables warm-up (use e.g. [1000] to enable)
      cycle_lengths: [10000000000]
      # Effectively infinite cycle length, i.e. no LR decay
      f_start: [1.0e-6]
      f_max: [1.0]
      f_min: [1.0]
  grad_clip_threshold: 1000000
  # Effectively no gradient clipping for default non-patch-based training
  optimizer_name: Adam
  optimizer:
    lr: 0.00002
    # Learning rate
    weight_decay: 0.0  # old: 0.01
    betas: [0.9, 0.99]
    eps: 1.0e-11
    # NOTE: written as 1.0e-11 (not 1e-11) so YAML 1.1 parsers such as plain
    # PyYAML resolve it as a float rather than a string.

# Performance
perf:
  fp_optimizations: amp-bf16
  # Floating point mode, one of ["fp32", "fp16", "amp-fp16", "amp-bf16"]
  # "amp-{fp16,bf16}" activates Automatic Mixed Precision (AMP) with {float16,bfloat16}
  dataloader_workers: 4
  # DataLoader worker processes
  songunet_checkpoint_level: 0  # 0 means no checkpointing
  # Gradient checkpointing level, value is number of layers to checkpoint

# IO
io:
  regression_checkpoint_path: null
  # Where to load the regression checkpoint. Should be overridden.
  print_progress_freq: 1000
  # How often to print progress
  save_checkpoint_freq: 5000
  # How often to save the checkpoints, measured in number of processed samples
  save_n_recent_checkpoints: -1
  # Set to a positive integer to only keep the most recent n checkpoints
  validation_freq: 5000
  # How often to record the validation loss, measured in number of processed samples
  validation_steps: 10
  # How many loss evaluations are used to compute the validation loss per checkpoint
134 changes: 134 additions & 0 deletions examples/weather/corrdiff/conf/base/distill/cwb.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# CorrDiff distillation config for the CWB (Taiwan) dataset; extends base_all.yaml.
# NOTE(review): this rendering has stripped the YAML indentation; the nesting
# implied below (hp -> cm/scm -> model/loss_config/sample_t_cfg/...) must be
# confirmed against the original file.
defaults:
- base_all

# Hyperparameters
hp:
total_batch_size: 512
# Total batch size
batch_size_per_gpu: 8
# Batch size per GPU
validation_batch_size_per_gpu: 8
# validation batch size per GPU
patching: null
# null disables patch-based training (compare gefs_hrrr.yaml, which configures it)
scheduler_name: LambdaInverseSquareRootScheduler
optimizer:
fused: True
# presumably forwarded to the optimizer (e.g. fused Adam kernels) - TODO confirm
mode: cm
# Distillation mode selector; the 'cm' and 'scm' sections below hold per-mode config
# Consistency-model (CM) distillation hyperparameters
cm:
model:
use_ema: False
# multistep generation if larger than 1 (default: single-step generation)
student_sample_steps: 1
# sampling type in multistep generation ('sde', 'ode')
student_sample_type: 'sde'
# precision for model/optimizer states and data - recommended to be float32 if precision_amp is not None
precision: 'float32'
# AMP during training - if None or equal to precision, AMP is disabled during training.
precision_amp: null
# AMP during inference - if None or equal to precision, AMP is disabled during inference.
precision_amp_infer: null
# AMP during en-/decoding (e.g., for VAEs or text encoders) - if None or equal to precision, AMP is disabled during en-/decoding.
precision_amp_enc: null
# FSDP2 precision for parameter storage and gradient reduction.
# If None, defaults to `precision`. Useful for storing params/grads in float32 while computing in bfloat16.
precision_fsdp: null
# whether to add the teacher model to the fsdp_dict
add_teacher_to_fsdp_dict: True
loss_config:
# use consistency distillation (False = consistency training objective)
use_cd: False
# pseudo-Huber loss constant
huber_const: 0.06
use_squared_l2: False
# weighting scheme for the CT loss
weighting_ct_loss: 'c_out_sq'
# noise-level (time) sampling distribution for training
sample_t_cfg:
train_p_mean: -0.2 # 0.0 # from fastgen: -1.1
train_p_std: 1.6 # 2 # from fastgen: 2.0
min_t: 0.002
t_list: null
max_t: 180 # 800 # from fastgen: 80
min_r: 0.
sigma_data: 0.5
quantize: False
time_dist_type: lognormal
block_kwargs:
dropout: 0.2
callbacks:
# consistency-training discretization schedule
ct_schedule:
q: 4.0
ratio_limit: 0.9961
# duration // (4 * 1000)
kimg_per_stage: 1280
# ema:
# type: power
# gamma: 16.97
# beta: 0.999
# sCM distillation hyperparameters (variant with tangent warm-up and JVP options)
# - exact acronym expansion TODO confirm
scm:
model:
use_ema: False
# multistep generation if larger than 1 (default: single-step generation)
student_sample_steps: 1
# sampling type in multistep generation ('sde', 'ode')
student_sample_type: 'sde'
# precision for model/optimizer states and data - recommended to be float32 if precision_amp is not None
precision: 'float32'
# AMP during training - if None or equal to precision, AMP is disabled during training.
precision_amp: null
# AMP during inference - if None or equal to precision, AMP is disabled during inference.
precision_amp_infer: null
# AMP during en-/decoding (e.g., for VAEs or text encoders) - if None or equal to precision, AMP is disabled during en-/decoding.
precision_amp_enc: null
# FSDP2 precision for parameter storage and gradient reduction.
# If None, defaults to `precision`. Useful for storing params/grads in float32 while computing in bfloat16.
precision_fsdp: null
# whether to add the teacher model to the fsdp_dict
add_teacher_to_fsdp_dict: True
loss_config:
# use consistency distillation
use_cd: False
# warm-up steps for tangent
tangent_warmup_steps: 1000 # 1/10 of original
# tangent normalization constant
tangent_warmup_const: 0.1
# enable prior weighting
prior_weighting_enabled: True
# enable g_norm_spatial_invariance
g_norm_spatial_invariance: True
# enable divide_x_0_spatial_dim
divide_x_0_spatial_dim: True
# finite difference approx. for jvp
use_jvp_finite_diff: False
# finite difference step size
# NOTE(review): bare 1e-4 is read as a string by YAML 1.1 parsers (PyYAML);
# OmegaConf resolves it as a float - confirm the loader in use
jvp_finite_diff_eps: 1e-4
# use fp32 jvp
use_fp32_jvp: False
sample_t_cfg:
train_p_mean: -0.2
train_p_std: 1.6
sigma_data: 0.5
min_t: 0.002
t_list: null
max_t: 180
quantize: False
# TODO(jberner): change dropout?
block_kwargs:
dropout: 0.2
# callbacks:
# ema:
# type: power
# gamma: 6.94 # ema_10
# beta: 0.999
142 changes: 142 additions & 0 deletions examples/weather/corrdiff/conf/base/distill/gefs_hrrr.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# CorrDiff distillation config for the GEFS-HRRR dataset; extends base_all.yaml.
# Uses patch-based (super-patch) training, unlike cwb.yaml.
# NOTE(review): this rendering has stripped the YAML indentation; the nesting
# implied below (hp -> cm/scm -> model/loss_config/sample_t_cfg/...) must be
# confirmed against the original file.
defaults:
- base_all

# Hyperparameters
hp:
total_batch_size: 512
# Total batch size
batch_size_per_gpu: 1
# Batch size per GPU
validation_batch_size_per_gpu: 1
# validation batch size per GPU
patching:
# "???" is OmegaConf's mandatory-missing marker: these values MUST be set by
# the experiment/override config before running.
patch_shape_x: ???
patch_shape_y: ???
# Patch size. Patch-based sampling will be utilized if these dimensions
# differ from img_shape_x and img_shape_y. Needs to be overridden.
patch_num: ???
# Number of patches from a single sample. Total number of patches is
# patch_num * batch_size_global. Should be overridden.
subpatch_num: 2
# Number of patches to include in the super-patch for distillation.
overlap_pix: 32
# Number of overlapping pixels between adjacent patches
window_function: KBD
# Window function to use for the window smoothing in superpatch training.
# (KBD = Kaiser-Bessel derived window - TODO confirm)
window_alpha: 1
# Alpha for the window function.
scheduler_name: LambdaInverseSquareRootScheduler
optimizer:
fused: True
# presumably forwarded to the optimizer (e.g. fused Adam kernels) - TODO confirm
mode: cm
# Distillation mode selector; the 'cm' and 'scm' sections below hold per-mode config
# Consistency-model (CM) distillation hyperparameters
cm:
model:
use_ema: False
# multistep generation if larger than 1 (default: single-step generation)
student_sample_steps: 1
# sampling type in multistep generation ('sde', 'ode')
student_sample_type: 'sde'
# precision for model/optimizer states and data - recommended to be float32 if precision_amp is not None
precision: 'float32'
# AMP during training - if None or equal to precision, AMP is disabled during training.
precision_amp: null
# AMP during inference - if None or equal to precision, AMP is disabled during inference.
precision_amp_infer: null
# AMP during en-/decoding (e.g., for VAEs or text encoders) - if None or equal to precision, AMP is disabled during en-/decoding.
precision_amp_enc: null
# FSDP2 precision for parameter storage and gradient reduction.
# If None, defaults to `precision`. Useful for storing params/grads in float32 while computing in bfloat16.
precision_fsdp: null
# whether to add the teacher model to the fsdp_dict
add_teacher_to_fsdp_dict: True
loss_config:
# use consistency distillation (False = consistency training objective)
use_cd: False
# pseudo-Huber loss constant
huber_const: 0.06
use_squared_l2: False
# weighting scheme for the CT loss
weighting_ct_loss: 'c_out_sq'
# noise-level (time) sampling distribution for training
sample_t_cfg:
train_p_mean: -0.2 # 0.0 # from fastgen: -1.1
train_p_std: 1.6 # 2 # from fastgen: 2.0
min_t: 0.002
t_list: null
max_t: 180 # 800 # from fastgen: 80
min_r: 0.
sigma_data: 0.5
quantize: False
time_dist_type: lognormal
block_kwargs:
dropout: 0.2
callbacks:
# consistency-training discretization schedule
ct_schedule:
q: 4.0
ratio_limit: 0.9961
# duration // (4 * 1000)
kimg_per_stage: 1280
# ema:
# type: power
# gamma: 16.97
# beta: 0.999
# sCM distillation hyperparameters - exact acronym expansion TODO confirm.
# NOTE(review): unlike cwb.yaml, this scm.loss_config omits the
# use_jvp_finite_diff / jvp_finite_diff_eps / use_fp32_jvp and quantize keys -
# presumably falling back to code defaults; confirm this is intentional.
scm:
model:
use_ema: False
# multistep generation if larger than 1 (default: single-step generation)
student_sample_steps: 1
# sampling type in multistep generation ('sde', 'ode')
student_sample_type: 'sde'
# precision for model/optimizer states and data - recommended to be float32 if precision_amp is not None
precision: 'float32'
# AMP during training - if None or equal to precision, AMP is disabled during training.
precision_amp: null
# AMP during inference - if None or equal to precision, AMP is disabled during inference.
precision_amp_infer: null
# AMP during en-/decoding (e.g., for VAEs or text encoders) - if None or equal to precision, AMP is disabled during en-/decoding.
precision_amp_enc: null
# FSDP2 precision for parameter storage and gradient reduction.
# If None, defaults to `precision`. Useful for storing params/grads in float32 while computing in bfloat16.
precision_fsdp: null
# whether to add the teacher model to the fsdp_dict
add_teacher_to_fsdp_dict: True
loss_config:
# use consistency distillation
use_cd: False
# warm-up steps for tangent
tangent_warmup_steps: 1000 # 1/10 of original
# tangent normalization constant
tangent_warmup_const: 0.1
# enable prior weighting
prior_weighting_enabled: True
# enable g_norm_spatial_invariance
g_norm_spatial_invariance: True
# enable divide_x_0_spatial_dim
divide_x_0_spatial_dim: True
sample_t_cfg:
train_p_mean: -0.2
train_p_std: 1.6
sigma_data: 0.5
min_t: 0.002
t_list: null
max_t: 180
# TODO(jberner): change dropout?
block_kwargs:
dropout: 0.2
# callbacks:
# ema:
# type: power
# gamma: 6.94 # ema_10
# beta: 0.999
Loading