Commits (36)
- 6344bfc: Adds the PhysicsNeMo-Mesh changes required for GLOBE 3D (peterdsharpe, Mar 10, 2026)
- 2318514: Merge branch 'main' into psharpe/add-mesh-improvements-for-GLOBE-3D (peterdsharpe, Mar 11, 2026)
- 1f471ff: Add dual-tree traversal algorithm to GLOBE model for O(N) kernel eval… (peterdsharpe, Mar 11, 2026)
- 1cc74e0: Adds DTT-related changes to AirFRANS train.py (peterdsharpe, Mar 11, 2026)
- a984c12: Squashed commit of the following: (peterdsharpe, Mar 12, 2026)
- 137905f: Merge branch 'main' into psharpe/stacked-add-GLOBE-3D-DTT-model-changes (peterdsharpe, Mar 12, 2026)
- fd1bcaf: Grammar fix (peterdsharpe, Mar 12, 2026)
- 553cf08: Docstring fix (peterdsharpe, Mar 12, 2026)
- b45ca1b: Update BarnesHutKernel to use appropriate dtype for zero-valued tenso… (peterdsharpe, Mar 12, 2026)
- 9f5a252: Update MetaData class in model.py to disable JIT and CUDA graphs for … (peterdsharpe, Mar 12, 2026)
- 2722e64: Merge branch 'main' into psharpe/stacked-add-GLOBE-3D-DTT-model-changes (peterdsharpe, Mar 12, 2026)
- ad859f3: Enhance GLOBE model to support cross-boundary condition (BC) interact… (peterdsharpe, Mar 12, 2026)
- 963df9b: Merge branch 'main' into psharpe/stacked-add-GLOBE-3D-DTT-model-changes (peterdsharpe, Mar 12, 2026)
- 6f5982b: Update run.sh script for improved configuration and compatibility (peterdsharpe, Mar 13, 2026)
- b06dfab: Enhance ClusterTree and BarnesHutKernel for improved internal node ha… (peterdsharpe, Mar 15, 2026)
- ca41fad: Adds theta and leaf_size forwarding (peterdsharpe, Mar 15, 2026)
- 7201fc6: Docs fixes, and properly abstracts _ragged.py to deduplicate code. (peterdsharpe, Mar 16, 2026)
- a26e7d4: Merge branch 'main' into psharpe/stacked-add-GLOBE-3D-DTT-model-changes (peterdsharpe, Mar 16, 2026)
- 6466b54: Merge branch 'main' into psharpe/stacked-add-GLOBE-3D-DTT-model-changes (peterdsharpe, Mar 19, 2026)
- fd84ec0: Merge branch 'main' into psharpe/stacked-add-GLOBE-3D-DTT-model-changes (peterdsharpe, Mar 20, 2026)
- 55122f0: Enhance global data handling in MultiscaleKernel by adding a copy ope… (peterdsharpe, Mar 20, 2026)
- bf6c139: Merge branch 'main' into psharpe/stacked-add-GLOBE-3D-DTT-model-changes (peterdsharpe, Mar 20, 2026)
- 5571f2a: Merge branch 'main' into psharpe/stacked-add-GLOBE-3D-DTT-model-changes (peterdsharpe, Mar 20, 2026)
- 976decd: Merge branch 'main' into psharpe/stacked-add-GLOBE-3D-DTT-model-changes (peterdsharpe, Mar 23, 2026)
- dff836c: Adds ragged arange tests (peterdsharpe, Mar 23, 2026)
- f850ef7: Adds traceable ragged_arange variant (peterdsharpe, Mar 24, 2026)
- 251400a: Refactor Kernel class to simplify network evaluation and enhance perf… (peterdsharpe, Mar 24, 2026)
- 629b137: Always use tensorclass, not dataclass (peterdsharpe, Mar 24, 2026)
- 9fe6fc9: Merge branch 'main' into psharpe/stacked-add-GLOBE-3D-DTT-model-changes (peterdsharpe, Apr 13, 2026)
- e72467c: formatting (peterdsharpe, Apr 20, 2026)
- 76fef05: Merge branch 'main' into psharpe/stacked-add-GLOBE-3D-DTT-model-changes (peterdsharpe, Apr 20, 2026)
- 9754f99: Adds minor type hint annotation (peterdsharpe, Apr 20, 2026)
- 10130cf: Adds in option to do far-field 1st-order expansion (default off, togg… (peterdsharpe, Apr 20, 2026)
- 2ec4edb: changelog wording (peterdsharpe, Apr 20, 2026)
- f4b3b97: formatting (peterdsharpe, Apr 20, 2026)
- b2ac138: Refactor inference and training scripts to remove chunk_size paramete… (peterdsharpe, Apr 20, 2026)
5 changes: 4 additions & 1 deletion CHANGELOG.md
@@ -10,7 +10,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Adds GLOBE model (`physicsnemo.experimental.models.globe.model.GLOBE`)
- Adds GLOBE model (`physicsnemo.experimental.models.globe.model.GLOBE`),
including a new variant that uses a dual-tree traversal algorithm to
fundamentally reduce the complexity of the kernel evaluations from O(N^2) to
O(N).
- Adds GLOBE AirFRANS example case (`examples/cfd/external_aerodynamics/globe/airfrans`)
- Adds automatic support for `FSDP` and/or `ShardTensor` models in checkpoint save/load
functionality
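For context on the changelog entry above: the O(N^2) to O(N) claim rests on a tree-based far-field approximation. Below is a minimal single-tree Barnes-Hut sketch for a plain 1/r kernel, a toy stand-in (all names hypothetical), not the PhysicsNeMo implementation, which applies a dual-tree traversal to learned kernels with configurable `theta` and `leaf_size`:

```python
# Toy Barnes-Hut: approximate sum_j m_j / |x - p_j| by replacing
# well-separated clusters with their aggregate (total mass at centroid).
import numpy as np


class Node:
    """Binary cluster tree over source points (leaf_size = 1)."""

    def __init__(self, points: np.ndarray, masses: np.ndarray):
        self.center = np.average(points, axis=0, weights=masses)
        self.mass = float(masses.sum())
        self.size = float(np.ptp(points, axis=0).max())  # cluster extent
        if len(points) <= 1:
            self.children: list["Node"] = []
            self.points, self.masses = points, masses
        else:
            # Median split along the widest axis.
            axis = int(np.argmax(np.ptp(points, axis=0)))
            order = np.argsort(points[:, axis])
            mid = len(points) // 2
            self.children = [
                Node(points[order[:mid]], masses[order[:mid]]),
                Node(points[order[mid:]], masses[order[mid:]]),
            ]


def potential(node: Node, x: np.ndarray, theta: float = 0.5) -> float:
    """Evaluate the 1/r sum at x, opening clusters by the theta criterion."""
    d = float(np.linalg.norm(x - node.center))
    if not node.children:  # leaf: evaluate exactly
        r = np.linalg.norm(x - node.points, axis=1)
        return float((node.masses / r).sum())
    if node.size < theta * d:  # well separated: one aggregate interaction
        return node.mass / d
    return sum(potential(c, x, theta) for c in node.children)
```

With `theta = 0` every cluster is opened and the evaluation is exact (O(N) per target, O(N^2) overall); larger `theta` trades accuracy for fewer interactions. A dual-tree traversal additionally clusters the targets, which is what removes the per-target linear factor.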
20 changes: 14 additions & 6 deletions examples/cfd/external_aerodynamics/globe/airfrans/run.sh
@@ -14,9 +14,15 @@
set -euo pipefail

### [User Configuration]
OUTPUT_NAME="${SLURM_JOB_NAME:-globe_airfrans_local}"
SCRIPT_DIR="${SLURM_SUBMIT_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)}"
OUTPUT_DIR="${SCRIPT_DIR}/output/${OUTPUT_NAME}"

TRAIN_ARGS=(
--output-name ${SLURM_JOB_NAME:-globe_airfrans_local}
--output-name "${OUTPUT_NAME}"
--airfrans-task "scarce"
--no-use-compile
--amp
)

export AIRFRANS_DATA_DIR="${HOME}/datasets/airfrans/Dataset" # Set this to your AirFRANS dataset
@@ -37,10 +43,12 @@ CUDA_MAJOR=$(sed -n 's/.*CUDA Version: \([0-9]*\).*/\1/p' <<< "$NVIDIA_SMI_OUTPU
echo "Number of GPUs per node detected: $NUM_GPUS_PER_NODE"

### [Thread Configuration]
# OMP_NUM_THREADS=1: DataLoader workers use process-level parallelism
# (num_workers auto-computed as n_cpus/n_gpus), so per-process threading
# is unnecessary and causes thread oversubscription.
CPUS_PER_NODE=${SLURM_CPUS_ON_NODE:-$(nproc)}
export OMP_NUM_THREADS=$((CPUS_PER_NODE / NUM_GPUS_PER_NODE))
OMP_NUM_THREADS=$((OMP_NUM_THREADS > 0 ? OMP_NUM_THREADS : 1))
echo "OMP_NUM_THREADS=$OMP_NUM_THREADS (${CPUS_PER_NODE} CPUs / ${NUM_GPUS_PER_NODE} GPUs)"
export OMP_NUM_THREADS=1
echo "OMP_NUM_THREADS=$OMP_NUM_THREADS (process-level parallelism via DataLoader workers; ${CPUS_PER_NODE} CPUs / ${NUM_GPUS_PER_NODE} GPUs)"

### [Sync Dependencies]
if [ -z "$CUDA_MAJOR" ]; then
@@ -66,8 +74,8 @@ rm -f "$OUTPUT_DIR/SHUTDOWN"

if [ "${SLURM_NNODES:-1}" -gt 1 ]; then
echo "Running multi-node training..."
head_node=$(scontrol show hostnames $SLURM_NODELIST | head -n1)
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
head_node=$(hostname -s)
head_node_ip=$(hostname --ip-address)
echo "Head node: $head_node"
echo "Head node IP: $head_node_ip"
srun uv run --no-sync torchrun \
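The `OMP_NUM_THREADS=1` comment in the run.sh diff above relies on process-level parallelism being sized elsewhere; a sketch of the implied worker-count arithmetic (the function name is hypothetical, not an API from this repo):

```python
# With OMP_NUM_THREADS=1, each DataLoader worker process uses one CPU,
# so sizing workers as n_cpus // n_gpus keeps a node fully utilized
# without thread oversubscription across the per-GPU trainer processes.
def auto_num_workers(n_cpus: int, n_gpus: int) -> int:
    return max(1, n_cpus // max(1, n_gpus))
```

On a 64-CPU, 8-GPU node this yields 8 workers per rank, i.e. 64 single-threaded worker processes in total.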
62 changes: 19 additions & 43 deletions examples/cfd/external_aerodynamics/globe/airfrans/train.py
@@ -30,8 +30,8 @@
import torch
import torch.nn.functional as F
import torchinfo
from dataset import AirFRANSDataSet, AirFRANSSample, compute_max_mesh_sizes
from jaxtyping import Float, Int
from dataset import AirFRANSDataSet, AirFRANSSample
from jaxtyping import Float
from mlflow.tracking.fluent import (
active_run,
log_artifact,
@@ -71,8 +71,8 @@ def main(
amp: bool = False,
use_compile: bool = True,
compile_mode: Literal[
"default", "max-autotune-no-cudagraphs", "reduce-overhead", "max-autotune"
] = "max-autotune",
"default", "max-autotune-no-cudagraphs"
] = "max-autotune-no-cudagraphs",
points_per_iter: int = 2048,
learning_rate: float = 1e-3,
weight_decay: float = 1e-4,
@@ -87,6 +87,8 @@
n_latent_scalars: int = 12,
n_latent_vectors: int = 6,
n_spherical_harmonics: int = 1,
theta: float = 1.0,
leaf_size: int = 1,
airfrans_task: Literal["full", "scarce", "reynolds", "aoa"] = "full",
use_profiler: bool = True,
make_images: bool = True,
@@ -115,6 +117,8 @@ def main(
n_latent_scalars: Number of scalar latent channels propagated between hyperlayers.
n_latent_vectors: Number of vector latent channels propagated between hyperlayers.
n_spherical_harmonics: Number of Legendre polynomial terms for angle features.
theta: Barnes-Hut opening angle. Larger = more aggressive approximation.
leaf_size: Maximum sources per leaf node in the Barnes-Hut tree.
airfrans_task: Which AirFRANS dataset task to train on.
use_profiler: Enable PyTorch profiler for performance analysis.
make_images: Whether to make images for visualization.
@@ -235,6 +239,8 @@ def main(
n_latent_scalars=n_latent_scalars,
n_latent_vectors=n_latent_vectors,
n_spherical_harmonics=n_spherical_harmonics,
theta=theta,
leaf_size=leaf_size,
).to(device)

if dist.rank == 0:
@@ -269,24 +275,6 @@ def main(
static_graph=True,
)

### [Compute Maximum Mesh Sizes Per BC Type and Split]
max_sizes: dict[
Split,
TensorDict[
str, TensorDict[Literal["n_points", "n_cells"], Int[torch.Tensor, ""]]
],
] = {
split: compute_max_mesh_sizes(
dataloaders[split],
device,
face_downsampling_ratio=(
train_face_downsampling_ratio if split == "train" else 1.0
),
rank=dist.rank,
)
for split in splits
}

### [Optimizer and Scheduler Setup]
# Square-root batch-size scaling: when the effective batch size grows
# (more GPUs or more points), gradient variance decreases proportionally,
@@ -401,7 +389,7 @@ def main(

### [Training and Testing]
@torch.compile(
dynamic=False,
dynamic=True,
mode=compile_mode,
disable=not use_compile,
)
@@ -462,29 +450,17 @@ def run_epoch(split: Split) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
)
sample.boundary_meshes[bc_type] = mesh

### Pad boundary meshes to fixed size for static compilation
split_max_sizes = max_sizes[split]
for bc_type, mesh in sample.boundary_meshes.items():
padded = mesh.pad(
target_n_points=int(split_max_sizes[bc_type, "n_points"]),
target_n_cells=int(split_max_sizes[bc_type, "n_cells"]),
data_padding_value=0.0,
)
### Pre-cache all geometry on the *padded* mesh so that
# the cache structure is fully populated before torch.compile
# ever sees it. Mesh.pad() creates a new Mesh with an empty
# cache, so caching must happen *after* padding. Without
# this, lazy computation during the compiled forward pass
# grows the cache dict, triggering Dynamo guard failures.
### Pre-cache geometry so lazy computation doesn't trigger
# Dynamo guard failures during compiled forward passes.
for mesh in sample.boundary_meshes.values():
if training and train_randomize_face_centers:
padded._cache["cell", "centroids"] = (
padded.sample_random_points_on_cells()
mesh._cache["cell", "centroids"] = (
mesh.sample_random_points_on_cells()
)
else:
_ = padded.cell_centroids
_ = padded.cell_areas
_ = padded.cell_normals
sample.boundary_meshes[bc_type] = padded
_ = mesh.cell_centroids
_ = mesh.cell_areas
_ = mesh.cell_normals

with record_function("data_transfer"):
sample = sample.to(device)
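The optimizer comment in the train.py diff above cites square-root batch-size scaling for the learning rate; a minimal sketch of that rule (base values hypothetical):

```python
import math


def scaled_lr(base_lr: float, base_batch: int, effective_batch: int) -> float:
    # Gradient variance shrinks like 1/B as the effective batch B grows,
    # so the step size can grow like sqrt(B) at roughly constant update noise.
    return base_lr * math.sqrt(effective_batch / base_batch)
```

Quadrupling the effective batch (e.g. 4x the GPUs) thus doubles the learning rate, rather than quadrupling it as linear scaling would.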
12 changes: 10 additions & 2 deletions physicsnemo/experimental/models/globe/__init__.py
@@ -14,8 +14,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from physicsnemo.experimental.models.globe.cluster_tree import (
ClusterTree,
DualInteractionPlan,
SourceAggregates,
)
from physicsnemo.experimental.models.globe.field_kernel import (
ChunkedKernel,
BarnesHutKernel,
Kernel,
MultiscaleKernel,
)
@@ -24,6 +29,9 @@
__all__ = [
"GLOBE",
"Kernel",
"ChunkedKernel",
"BarnesHutKernel",
"MultiscaleKernel",
"ClusterTree",
"DualInteractionPlan",
"SourceAggregates",
]