Skip to content
Open
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion olmo/tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import os
import inspect
from pathlib import Path
from typing import List, Optional, Union

Expand Down Expand Up @@ -180,7 +181,12 @@ def encode_batch(self, inputs: List[str], add_special_tokens: bool = True) -> Li
if truncate_to is not None and add_special_tokens:
truncate_to -= self.num_special_tokens_to_add(False)

batch_encoding = self.base_tokenizer.encode_batch(inputs)
# Check if the base tokenizer's encode_batch method supports add_special_tokens parameter
if 'add_special_tokens' in inspect.signature(self.base_tokenizer.encode_batch).parameters:
batch_encoding = self.base_tokenizer.encode_batch(inputs, add_special_tokens=False)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This hard-codes `add_special_tokens` to `False`. Did you mean to pass through the value of the method's `add_special_tokens` argument instead?

else:
# Fallback to original behavior if the parameter isn't supported
batch_encoding = self.base_tokenizer.encode_batch(inputs)

all_input_ids = []
for encoding in batch_encoding:
Expand Down
Loading