diff --git a/CHANGELOG.md b/CHANGELOG.md index dc77f10c4..6ba69bc19 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - MPS support +### Changed + +- `build_memmap_dataset` now picks `memmap_dtype` from the model's vocabulary size (`uint32` when `vocab_size > 2**16`, otherwise `uint16`) for compatibility with OLMo-2-1124. + ## [v0.6.0](https://github.com/allenai/OLMo/releases/tag/v0.6.0) - 2024-12-17 ### Added diff --git a/olmo/data/__init__.py b/olmo/data/__init__.py index 5bc68670c..464786935 100644 --- a/olmo/data/__init__.py +++ b/olmo/data/__init__.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, cast +import numpy as np from torch.utils.data import DataLoader, DistributedSampler from ..aliases import PathOrStr @@ -38,10 +39,12 @@ def build_memmap_dataset( metadata.extend([{"label": label}] * len(label_paths)) else: raise OLMoConfigurationError("One of DataConfig.paths or DataConfig.datasets is required") + + optimal_memmap_dtype = np.uint32 if train_config.model.vocab_size > 2**16 else np.uint16 return MemMapDataset( *paths, chunk_size=train_config.model.max_sequence_length, - memmap_dtype=data_config.effective_memmap_dtype, + memmap_dtype=optimal_memmap_dtype, metadata=metadata, include_instance_metadata=include_instance_metadata, pad_token_id=train_config.model.pad_token_id,