Skip to content
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Adds GLOBE model (`physicsnemo.experimental.models.globe.model.GLOBE`)
- Adds GLOBE AirFRANS example case (`examples/cfd/external_aerodynamics/globe/airfrans`)
- Adds distillation training recipe for CorrDiff (in `examples/weather/corrdiff/`)

### Changed

Expand Down
78 changes: 78 additions & 0 deletions examples/weather/corrdiff/conf/base/distill/base_all.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Base configuration shared by all CorrDiff distillation experiments.
# Experiment configs (e.g. cwb.yaml, gefs_hrrr.yaml) extend this via Hydra `defaults`.

# Hyperparameters
hp:
  training_duration: 5120000
  # Training duration based on the number of processed samples
  total_batch_size: 512
  # Total batch size
  batch_size_per_gpu: "auto"
  # Batch size per GPU ("auto" = derived from total_batch_size and world size)
  validation_batch_size_per_gpu: 1
  # Validation batch size per GPU
  scheduler_name: LambdaInverseSquareRootScheduler  # alternative: modulus_default
  scheduler:
    modulus_default:
      lr_decay: 1
      # LR decay rate
      lr_rampup: 0
      # Rampup for learning rate, in number of samples
    LambdaInverseSquareRootScheduler:
      warm_up_steps: 0
      decay_steps: 7000
    LambdaLinearScheduler:
      warm_up_steps: [0]
      # Warm-up length in iterations; 0 disables warm-up (use e.g. [1000] to enable)
      cycle_lengths: [10000000000]
      # Effectively infinite cycle length, i.e. no LR decay
      f_start: [1.0e-6]
      f_max: [1.0]
      f_min: [1.0]
  grad_clip_threshold: 1000000
  # Effectively no gradient clipping for default non-patch-based training
  optimizer_name: Adam
  optimizer:
    lr: 0.00002
    # Learning rate
    weight_decay: 0.0  # old: 0.01
    betas: [0.9, 0.99]
    eps: 1.0e-11
    # NOTE: written as 1.0e-11 (not 1e-11) so YAML 1.1 parsers such as plain
    # PyYAML resolve it as a float rather than a string.

# Performance
perf:
  fp_optimizations: amp-bf16
  # Floating point mode, one of ["fp32", "fp16", "amp-fp16", "amp-bf16"]
  # "amp-{fp16,bf16}" activates Automatic Mixed Precision (AMP) with {float16,bfloat16}
  dataloader_workers: 4
  # DataLoader worker processes
  songunet_checkpoint_level: 0  # 0 means no checkpointing
  # Gradient checkpointing level, value is number of layers to checkpoint

# IO
io:
  regression_checkpoint_path: null
  # Where to load the regression checkpoint. Should be overridden.
  print_progress_freq: 1000
  # How often to print progress
  save_checkpoint_freq: 5000
  # How often to save the checkpoints, measured in number of processed samples
  save_n_recent_checkpoints: -1
  # Set to a positive integer to only keep the most recent n checkpoints
  validation_freq: 5000
  # How often to record the validation loss, measured in number of processed samples
  validation_steps: 10
  # How many loss evaluations are used to compute the validation loss per checkpoint
134 changes: 134 additions & 0 deletions examples/weather/corrdiff/conf/base/distill/cwb.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# CorrDiff distillation config for the CWB (Taiwan) dataset; extends base_all.yaml.
# NOTE(review): this rendering has stripped the YAML indentation; the nesting
# implied below (hp -> cm/scm -> model/loss_config/sample_t_cfg/...) must be
# confirmed against the original file.
defaults:
- base_all

# Hyperparameters
hp:
total_batch_size: 512
# Total batch size
batch_size_per_gpu: 8
# Batch size per GPU
validation_batch_size_per_gpu: 8
# validation batch size per GPU
patching: null
# null disables patch-based training (compare gefs_hrrr.yaml, which configures it)
scheduler_name: LambdaInverseSquareRootScheduler
optimizer:
fused: True
# presumably forwarded to the optimizer (e.g. fused Adam kernels) - TODO confirm
mode: cm
# Distillation mode selector; the 'cm' and 'scm' sections below hold per-mode config
# Consistency-model (CM) distillation hyperparameters
cm:
model:
use_ema: False
# multistep generation if larger than 1 (default: single-step generation)
student_sample_steps: 1
# sampling type in multistep generation ('sde', 'ode')
student_sample_type: 'sde'
# precision for model/optimizer states and data - recommended to be float32 if precision_amp is not None
precision: 'float32'
# AMP during training - if None or equal to precision, AMP is disabled during training.
precision_amp: null
# AMP during inference - if None or equal to precision, AMP is disabled during inference.
precision_amp_infer: null
# AMP during en-/decoding (e.g., for VAEs or text encoders) - if None or equal to precision, AMP is disabled during en-/decoding.
precision_amp_enc: null
# FSDP2 precision for parameter storage and gradient reduction.
# If None, defaults to `precision`. Useful for storing params/grads in float32 while computing in bfloat16.
precision_fsdp: null
# whether to add the teacher model to the fsdp_dict
add_teacher_to_fsdp_dict: True
loss_config:
# use consistency distillation (False = consistency training objective)
use_cd: False
# pseudo-Huber loss constant
huber_const: 0.06
use_squared_l2: False
# weighting scheme for the CT loss
weighting_ct_loss: 'c_out_sq'
# noise-level (time) sampling distribution for training
sample_t_cfg:
train_p_mean: -0.2 # 0.0 # from fastgen: -1.1
train_p_std: 1.6 # 2 # from fastgen: 2.0
min_t: 0.002
t_list: null
max_t: 180 # 800 # from fastgen: 80
min_r: 0.
sigma_data: 0.5
quantize: False
time_dist_type: lognormal
block_kwargs:
dropout: 0.2
callbacks:
# consistency-training discretization schedule
ct_schedule:
q: 4.0
ratio_limit: 0.9961
# duration // (4 * 1000)
kimg_per_stage: 1280
# ema:
# type: power
# gamma: 16.97
# beta: 0.999
# sCM distillation hyperparameters (variant with tangent warm-up and JVP options)
# - exact acronym expansion TODO confirm
scm:
model:
use_ema: False
# multistep generation if larger than 1 (default: single-step generation)
student_sample_steps: 1
# sampling type in multistep generation ('sde', 'ode')
student_sample_type: 'sde'
# precision for model/optimizer states and data - recommended to be float32 if precision_amp is not None
precision: 'float32'
# AMP during training - if None or equal to precision, AMP is disabled during training.
precision_amp: null
# AMP during inference - if None or equal to precision, AMP is disabled during inference.
precision_amp_infer: null
# AMP during en-/decoding (e.g., for VAEs or text encoders) - if None or equal to precision, AMP is disabled during en-/decoding.
precision_amp_enc: null
# FSDP2 precision for parameter storage and gradient reduction.
# If None, defaults to `precision`. Useful for storing params/grads in float32 while computing in bfloat16.
precision_fsdp: null
# whether to add the teacher model to the fsdp_dict
add_teacher_to_fsdp_dict: True
loss_config:
# use consistency distillation
use_cd: False
# warm-up steps for tangent
tangent_warmup_steps: 1000 # 1/10 of original
# tangent normalization constant
tangent_warmup_const: 0.1
# enable prior weighting
prior_weighting_enabled: True
# enable g_norm_spatial_invariance
g_norm_spatial_invariance: True
# enable divide_x_0_spatial_dim
divide_x_0_spatial_dim: True
# finite difference approx. for jvp
use_jvp_finite_diff: False
# finite difference step size
# NOTE(review): bare 1e-4 is read as a string by YAML 1.1 parsers (PyYAML);
# OmegaConf resolves it as a float - confirm the loader in use
jvp_finite_diff_eps: 1e-4
# use fp32 jvp
use_fp32_jvp: False
sample_t_cfg:
train_p_mean: -0.2
train_p_std: 1.6
sigma_data: 0.5
min_t: 0.002
t_list: null
max_t: 180
quantize: False
# TODO(jberner): change dropout?
block_kwargs:
dropout: 0.2
# callbacks:
# ema:
# type: power
# gamma: 6.94 # ema_10
# beta: 0.999
142 changes: 142 additions & 0 deletions examples/weather/corrdiff/conf/base/distill/gefs_hrrr.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# CorrDiff distillation config for the GEFS-HRRR dataset; extends base_all.yaml.
# Uses patch-based (super-patch) training, unlike cwb.yaml.
# NOTE(review): this rendering has stripped the YAML indentation; the nesting
# implied below (hp -> cm/scm -> model/loss_config/sample_t_cfg/...) must be
# confirmed against the original file.
defaults:
- base_all

# Hyperparameters
hp:
total_batch_size: 512
# Total batch size
batch_size_per_gpu: 1
# Batch size per GPU
validation_batch_size_per_gpu: 1
# validation batch size per GPU
patching:
# "???" is OmegaConf's mandatory-missing marker: these values MUST be set by
# the experiment/override config before running.
patch_shape_x: ???
patch_shape_y: ???
# Patch size. Patch-based sampling will be utilized if these dimensions
# differ from img_shape_x and img_shape_y. Needs to be overridden.
patch_num: ???
# Number of patches from a single sample. Total number of patches is
# patch_num * batch_size_global. Should be overridden.
subpatch_num: 2
# Number of patches to include in the super-patch for distillation.
overlap_pix: 32
# Number of overlapping pixels between adjacent patches
window_function: KBD
# Window function to use for the window smoothing in superpatch training.
# (KBD = Kaiser-Bessel derived window - TODO confirm)
window_alpha: 1
# Alpha for the window function.
scheduler_name: LambdaInverseSquareRootScheduler
optimizer:
fused: True
# presumably forwarded to the optimizer (e.g. fused Adam kernels) - TODO confirm
mode: cm
# Distillation mode selector; the 'cm' and 'scm' sections below hold per-mode config
# Consistency-model (CM) distillation hyperparameters
cm:
model:
use_ema: False
# multistep generation if larger than 1 (default: single-step generation)
student_sample_steps: 1
# sampling type in multistep generation ('sde', 'ode')
student_sample_type: 'sde'
# precision for model/optimizer states and data - recommended to be float32 if precision_amp is not None
precision: 'float32'
# AMP during training - if None or equal to precision, AMP is disabled during training.
precision_amp: null
# AMP during inference - if None or equal to precision, AMP is disabled during inference.
precision_amp_infer: null
# AMP during en-/decoding (e.g., for VAEs or text encoders) - if None or equal to precision, AMP is disabled during en-/decoding.
precision_amp_enc: null
# FSDP2 precision for parameter storage and gradient reduction.
# If None, defaults to `precision`. Useful for storing params/grads in float32 while computing in bfloat16.
precision_fsdp: null
# whether to add the teacher model to the fsdp_dict
add_teacher_to_fsdp_dict: True
loss_config:
# use consistency distillation (False = consistency training objective)
use_cd: False
# pseudo-Huber loss constant
huber_const: 0.06
use_squared_l2: False
# weighting scheme for the CT loss
weighting_ct_loss: 'c_out_sq'
# noise-level (time) sampling distribution for training
sample_t_cfg:
train_p_mean: -0.2 # 0.0 # from fastgen: -1.1
train_p_std: 1.6 # 2 # from fastgen: 2.0
min_t: 0.002
t_list: null
max_t: 180 # 800 # from fastgen: 80
min_r: 0.
sigma_data: 0.5
quantize: False
time_dist_type: lognormal
block_kwargs:
dropout: 0.2
callbacks:
# consistency-training discretization schedule
ct_schedule:
q: 4.0
ratio_limit: 0.9961
# duration // (4 * 1000)
kimg_per_stage: 1280
# ema:
# type: power
# gamma: 16.97
# beta: 0.999
# sCM distillation hyperparameters - exact acronym expansion TODO confirm.
# NOTE(review): unlike cwb.yaml, this scm.loss_config omits the
# use_jvp_finite_diff / jvp_finite_diff_eps / use_fp32_jvp and quantize keys -
# presumably falling back to code defaults; confirm this is intentional.
scm:
model:
use_ema: False
# multistep generation if larger than 1 (default: single-step generation)
student_sample_steps: 1
# sampling type in multistep generation ('sde', 'ode')
student_sample_type: 'sde'
# precision for model/optimizer states and data - recommended to be float32 if precision_amp is not None
precision: 'float32'
# AMP during training - if None or equal to precision, AMP is disabled during training.
precision_amp: null
# AMP during inference - if None or equal to precision, AMP is disabled during inference.
precision_amp_infer: null
# AMP during en-/decoding (e.g., for VAEs or text encoders) - if None or equal to precision, AMP is disabled during en-/decoding.
precision_amp_enc: null
# FSDP2 precision for parameter storage and gradient reduction.
# If None, defaults to `precision`. Useful for storing params/grads in float32 while computing in bfloat16.
precision_fsdp: null
# whether to add the teacher model to the fsdp_dict
add_teacher_to_fsdp_dict: True
loss_config:
# use consistency distillation
use_cd: False
# warm-up steps for tangent
tangent_warmup_steps: 1000 # 1/10 of original
# tangent normalization constant
tangent_warmup_const: 0.1
# enable prior weighting
prior_weighting_enabled: True
# enable g_norm_spatial_invariance
g_norm_spatial_invariance: True
# enable divide_x_0_spatial_dim
divide_x_0_spatial_dim: True
sample_t_cfg:
train_p_mean: -0.2
train_p_std: 1.6
sigma_data: 0.5
min_t: 0.002
t_list: null
max_t: 180
# TODO(jberner): change dropout?
block_kwargs:
dropout: 0.2
# callbacks:
# ema:
# type: power
# gamma: 6.94 # ema_10
# beta: 0.999
Loading