diff --git a/CHANGELOG.md b/CHANGELOG.md
index dc77f10c4..6ba69bc19 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - MPS support
 
+### Changed
+
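+- `build_memmap_dataset` now selects `memmap_dtype` from the model's vocab size (`uint32` when `vocab_size > 2**16`, otherwise `uint16`) instead of `DataConfig.effective_memmap_dtype`, for compatibility with OLMo-2-1124.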
+
 ## [v0.6.0](https://github.com/allenai/OLMo/releases/tag/v0.6.0) - 2024-12-17
 
 ### Added
diff --git a/olmo/data/__init__.py b/olmo/data/__init__.py
index 5bc68670c..464786935 100644
--- a/olmo/data/__init__.py
+++ b/olmo/data/__init__.py
@@ -3,6 +3,7 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, cast
 
+import numpy as np
 from torch.utils.data import DataLoader, DistributedSampler
 
 from ..aliases import PathOrStr
@@ -38,10 +39,12 @@ def build_memmap_dataset(
             metadata.extend([{"label": label}] * len(label_paths))
     else:
         raise OLMoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required")
+
+    optimal_memmap_dtype = np.uint32 if train_config.model.vocab_size > 2**16 else np.uint16
     return MemMapDataset(
         *paths,
         chunk_size=train_config.model.max_sequence_length,
-        memmap_dtype=data_config.effective_memmap_dtype,
+        memmap_dtype=optimal_memmap_dtype,
         metadata=metadata,
         include_instance_metadata=include_instance_metadata,
         pad_token_id=train_config.model.pad_token_id,