Token ID Out of Range & Indexing Assertion Errors During Training #1609

haseebrj17 opened this issue Aug 9, 2024 · 4 comments

@haseebrj17 commented Aug 9, 2024


Title: Token ID Out of Range & Indexing Assertion Errors During Training

Description:
I'm encountering several issues while training a model using the Meta-Llama-3.1-8B-Instruct tokenizer and my dataset-processing script. The main issues are as follows:

  1. Token ID Out of Range:
    During tokenization, I'm consistently receiving the following error:

    ERROR:__main__:Token ID 128256 out of range, adjusting to 127999
    

    This occurs even after attempting to handle out-of-range token IDs by capping them at the maximum valid token ID (127999). This issue might be affecting the overall model performance and data integrity.

  2. Indexing Assertion Error:
    When generating the training split, the following error is triggered:

    /opt/conda/conda-bld/pytorch_1716905969073/work/aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [462,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
    

    This assertion failure suggests that indices are being selected out of range during training, potentially because of misaligned tensor dimensions or token IDs that exceed the embedding table (a standalone sketch of this failure mode follows below).
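
For reference, this class of assertion is what an embedding lookup typically produces when it receives an index at or above the embedding table size. Below is a standalone sketch with hypothetical sizes chosen to mirror the IDs in this report; it is not the actual training code:

import torch
import torch.nn as nn

# Hypothetical sizes: an embedding table with 128256 rows (valid IDs 0..128255)
# and a batch that contains the ID 128256.
embedding = nn.Embedding(num_embeddings=128256, embedding_dim=8)

good_ids = torch.tensor([[128000, 3923, 374]])  # all within range
bad_ids = torch.tensor([[128256]])              # one past the last valid row

print(embedding(good_ids).shape)  # torch.Size([1, 3, 8])

try:
    embedding(bad_ids)
except IndexError as err:
    # On CPU this raises an IndexError; on CUDA the same lookup surfaces
    # asynchronously as the indexSelectLargeIndex device-side assertion.
    print("out-of-range lookup:", err)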

Code:
Here is the script I'm using for tokenization and dataset processing:

import os
import json
import re
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer
from multiprocessing import Pool, cpu_count
import logging
from tqdm import tqdm
import psutil
from retry import retry
import random
import glob

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define paths
input_data_dir = './ShardedData/SmallShards'
output_data_dir = './processed_data'
train_dir = os.path.join(output_data_dir, 'train')
test_dir = os.path.join(output_data_dir, 'test')
val_dir = os.path.join(output_data_dir, 'val')
hf_token = '***************************************'

# Create directories if they don't exist
os.makedirs(output_data_dir, exist_ok=True)
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

# Load tokenizer
model_name = 'meta-llama/Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, use_fast=True)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def clean_text(text):
    # Remove special characters and irregularities
    text = re.sub(r'[^A-Za-z0-9\s]+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def split_large_text(text, max_length=4096):
    # Split the text into smaller chunks
    words = text.split()
    chunks = [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]
    return chunks

def tokenize_function(examples):
    try:
        examples["text"] = [clean_text(text) for text in examples["text"]]
        tokenized_output = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
        # Validate token IDs
        vocab_size = tokenizer.vocab_size
        for token_id_list in tokenized_output['input_ids']:
            for token_id in token_id_list:
                if token_id >= vocab_size:
                    logger.error(f"Token ID {token_id} out of range")
        return tokenized_output
    except Exception as e:
        logger.error(f"Tokenization error: {e}")
        return {"input_ids": [], "attention_mask": []}

def preprocess_data(chunk_data):
    try:
        if isinstance(chunk_data, dict):
            chunk_data['text'] = str(chunk_data.get('text', ''))
        else:
            chunk_data = {"text": str(chunk_data)}
        chunk_data['text'] = clean_text(chunk_data['text'])
        if len(chunk_data['text'].split()) > 4096:
            chunk_data['text'] = split_large_text(chunk_data['text'])
        return chunk_data
    except json.JSONDecodeError as e:
        logger.error(f"JSON decode error: {e}")
        return {"text": ""}

def save_chunk(data, split_dir, chunk_index):
    output_shard = os.path.join(split_dir, f"tokenized_chunk_{chunk_index}.jsonl")
    with open(output_shard, 'a', encoding='utf-8') as f:
        for item in data:
            json_str = json.dumps(item) + "\n"
            f.write(json_str)

def validate_tokenized_data(tokenized_datasets, vocab_size):
    """
    Validate that all token IDs in the tokenized datasets are within the valid range.
    """
    for example in tokenized_datasets:
        input_ids = example['input_ids']
        if any(token_id >= vocab_size for token_id in input_ids):
            return False
    return True

def process_chunk(chunk_data, chunk_index, split_dir):
    all_data = [preprocess_data(json.loads(line)) for line in chunk_data]
    dataset = Dataset.from_dict({"text": [d["text"] for d in all_data]})
    tokenized_datasets = dataset.map(tokenize_function, batched=True, batch_size=2048, remove_columns=["text"], num_proc=1)

    # Verify token IDs are within the valid range
    vocab_size = tokenizer.vocab_size
    valid = validate_tokenized_data(tokenized_datasets, vocab_size)
    
    if not valid:
        logger.error(f"Token IDs out of range in chunk {chunk_index}. Adjusting token IDs.")
        for example in tokenized_datasets:
            input_ids = example['input_ids']
            adjusted_input_ids = []
            for token_id in input_ids:
                if token_id >= vocab_size:
                    logger.warning(f"Token ID {token_id} out of range, adjusting to {vocab_size - 1}")
                    token_id = vocab_size - 1  # Adjust out-of-range token IDs
                adjusted_input_ids.append(token_id)
            example['input_ids'] = adjusted_input_ids[:tokenizer.model_max_length]
            example['attention_mask'] = example['attention_mask'][:tokenizer.model_max_length]

    save_chunk(tokenized_datasets, split_dir, chunk_index)

def load_and_tokenize_in_chunks(file_path, chunk_size=50000):
    chunk_index = 0
    chunk_data = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            chunk_data.append(line)
            if len(chunk_data) >= chunk_size:
                split_dir = select_split_dir()
                process_chunk(chunk_data.copy(), chunk_index, split_dir)
                chunk_data = []  # Reset the buffer
                chunk_index += 1

    # Ensure to save any remaining data
    if chunk_data:
        split_dir = select_split_dir()
        process_chunk(chunk_data, chunk_index, split_dir)

def select_split_dir():
    """
    Randomly select a directory (train, test, or val) based on the desired split ratio.
    """
    rand_num = random.random()
    if rand_num < 0.90:
        return train_dir
    elif rand_num < 0.95:
        return test_dir
    else:
        return val_dir

def process_file(file_path):
    try:
        load_and_tokenize_in_chunks(file_path)
        return file_path
    except Exception as e:
        logger.error(f"Error processing file {file_path}: {e}")
        return None

def main():
    all_files = glob.glob(os.path.join(input_data_dir, "shard_*.jsonl"))

    # Load processed files cache
    processed_files_cache = os.path.join(output_data_dir, 'processed_files_cache.json')
    if os.path.exists(processed_files_cache):
        with open(processed_files_cache, 'r') as f:
            processed_files = set(json.load(f))
    else:
        processed_files = set()

    # Filter out already processed files
    all_files = [f for f in all_files if f not in processed_files]

    # Shuffle the files for random processing
    random.shuffle(all_files)

    # Create a pool of worker processes
    num_workers = min(cpu_count(), 48)  # Use the number of vCPUs or 48, whichever is lower
    with Pool(num_workers) as pool:
        # Use imap_unordered to apply process_file to each file in parallel
        for processed_file in tqdm(pool.imap_unordered(process_file, all_files), total=len(all_files), desc="Processing Files"):
            if processed_file:
                processed_files.add(processed_file)
                with open(processed_files_cache, 'w') as f:
                    json.dump(list(processed_files), f)

if __name__ == "__main__":
    main()

Minimal Reproducible Example:
Here is a minimal code example to reproduce the token ID out-of-range issue:

import torch
from transformers import AutoTokenizer

# Your Hugging Face token
hf_token = '*****************************'  # Replace with your actual token

# Specify the model name or path
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Load the tokenizer without manually setting special tokens
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, trust_remote_code=True)

# Example text input
text = "What is the capital of France?"

# Tokenize the input text
tokens = tokenizer(text, return_tensors="pt")

# Print the tokenized output
print("Tokenized input:", tokens)

# Decode the tokens back to text (for verification)
decoded_text = tokenizer.decode(tokens['input_ids'][0])
print("Decoded text:", decoded_text)

# Check for out-of-range token IDs
vocab_size = tokenizer.vocab_size
print("Vocabulary Size:", vocab_size)
for i, token_id in enumerate(tokens["input_ids"][0]):
    if token_id >= vocab_size:
        print(f"Token ID {token_id} out of range at position {i} (Token: {tokenizer.decode([token_id])})")

Output:

Tokenized input: {'input_ids': tensor([[128000,   3923,    374,    279,   6864,    315,   9822,     30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
Decoded text: What is the capital of France?
Vocabulary Size: 128000
Token ID 128000 out of range at position 0 (Token: )
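
For context, the range check above compares against tokenizer.vocab_size, which in Transformers reports the base vocabulary only; special tokens such as <|begin_of_text|> (ID 128000) are assigned IDs above that boundary but are still valid model inputs. A minimal sketch of the distinction (same model name as above; the hf_... string is a placeholder):

from transformers import AutoTokenizer

# Assumes access to the same gated checkpoint; "hf_..." is a placeholder.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct", token="hf_..."
)

# vocab_size counts only the base vocabulary; special tokens such as
# <|begin_of_text|> (ID 128000) sit above that boundary.
print(tokenizer.vocab_size)  # 128000, as in the output above
print(len(tokenizer))        # full size, including special/added tokens

ids = tokenizer("What is the capital of France?")["input_ids"]
print(max(ids) < len(tokenizer))  # True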

Steps to Reproduce:

  1. Use the provided minimal example code to tokenize any input text.
  2. Observe the tokenization process and check the logs for "Token ID out of range" errors.
  3. Run the training script with gradient checkpointing enabled.
  4. Monitor for the Indexing.cu assertion error during the generation of the training split.

Environment:

  • Transformers Version: 4.44.0
  • CUDA Version: 12.6
  • PyTorch Version: 2.4.0
  • Python Version: 3.12.4
  • OS: TensorML Mumbaforge running on Ubuntu
  • Hardware Specs: 48 vCPUs, 128 GB RAM, running on Intel Xeon Platinum 8470

Expected Behavior:
Token IDs should be within the valid range after tokenization. The training process should proceed without assertion errors, and there should be no conflicts between gradient checkpointing and caching.
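
On the gradient checkpointing and caching point: the training script is not shown in this issue, but a common pattern for avoiding that conflict is to disable the KV cache when checkpointing is enabled. A sketch, assuming the model is loaded with AutoModelForCausalLM (the hf_... token is a placeholder):

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct", token="hf_..."
)

# Gradient checkpointing recomputes activations during the backward pass
# and does not play well with the generation KV cache, so turn the cache off.
model.gradient_checkpointing_enable()
model.config.use_cache = False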

Additional Context:
The data being processed includes a mix of Unicode and non-Unicode characters. The script attempts to clean it by stripping special characters and irregular sequences, but despite these precautions the issues above persist.
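
For reference, the clean_text function in the script above replaces every character outside A-Za-z0-9 and whitespace with a space, so accented and other non-ASCII characters are dropped rather than normalized. A quick standalone check of that behavior:

import re

def clean_text(text):
    # Same two substitutions as in the script above.
    text = re.sub(r'[^A-Za-z0-9\s]+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

print(clean_text("Café – naïve (тест) 123!"))  # -> "Caf na ve 123"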

Any guidance on resolving these issues or insights into potential causes would be greatly appreciated.


@yonikremer

Firstly, please revoke your token and edit it out of the issue so that attackers can't use it and bill their Hugging Face API usage to your account.

Secondly, it seems like the error is that tokenizer.vocab_size doesn't match the actual vocabulary size.
len(tokenizer.vocab) returns the full vocabulary size, 128256, which also matches the embedding size of the model.
You can try to fix that by adding tokenizer.vocab_size = len(tokenizer.vocab) after initializing the tokenizer.

The issue was probably caused by an error in the tokenizer files. Please report the bug to the llama repo.

@haseebrj17 (Author)

Thanks for telling me about the HF token, and I appreciate the quick response on the issue!

@haseebrj17 (Author)


It seems like there are two main issues here:

  1. Attempting to Modify tokenizer.vocab_size:
    The suggestion to set tokenizer.vocab_size = len(tokenizer.vocab) after initializing the tokenizer doesn't work, because vocab_size is a read-only property on the PreTrainedTokenizerFast object: it is derived from the tokenizer's internal configuration and cannot be assigned directly. The correct approach is to read the actual vocabulary size with len(tokenizer.get_vocab()) and use that value locally in the subsequent logic (a short sketch of this appears after the list).

  2. Token ID Out of Range:
    The "Token ID 128256 out of range" error persists even after I adjusted the code to obtain the vocabulary size via len(tokenizer.get_vocab()), so the problem isn't only how the vocabulary size is retrieved. Even with the vocabulary size aligned to the embedding size, the tokenizer still produces token IDs beyond the valid range, which shouldn't happen if everything is configured correctly. This points to a potential bug in the tokenizer's handling of tokens, and it would be prudent to report it to the llama repository as well.
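
A short standalone sketch of point 1 above (the hf_... token string is a placeholder, not a real credential):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct", token="hf_..."
)

try:
    # vocab_size is a read-only property on PreTrainedTokenizerFast,
    # so the suggested assignment fails instead of fixing anything.
    tokenizer.vocab_size = len(tokenizer.get_vocab())
except AttributeError as err:
    print("cannot assign vocab_size:", err)

# Work with the full vocabulary size locally instead.
vocab_size = len(tokenizer.get_vocab())  # includes special/added tokens
print(vocab_size)

The updated preprocessing script, which now reads vocab_size from len(tokenizer.get_vocab()), follows: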

import os
import json
import re
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from multiprocessing import Pool, cpu_count
import logging
from tqdm import tqdm
import random
import glob

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define paths
input_data_dir = './ShardedData/SmallShards'
output_data_dir = './processed_data'
train_dir = os.path.join(output_data_dir, 'train')
test_dir = os.path.join(output_data_dir, 'test')
val_dir = os.path.join(output_data_dir, 'val')
hf_token = "*****************************"

# Create directories if they don't exist
os.makedirs(output_data_dir, exist_ok=True)
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

# Load tokenizer
model_name = 'meta-llama/Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, use_fast=True)

# Get the correct vocabulary size
vocab_size = len(tokenizer.get_vocab())

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def clean_text(text):
    # Remove special characters and irregularities
    text = re.sub(r'[^A-Za-z0-9\s]+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def split_large_text(text, max_length=4096):
    # Split the text into smaller chunks
    words = text.split()
    chunks = [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]
    return chunks

def tokenize_function(examples):
    try:
        examples["text"] = [clean_text(text) for text in examples["text"]]
        tokenized_output = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
        
        # Adjust any out-of-range token IDs
        for token_id_list in tokenized_output['input_ids']:
            for i, token_id in enumerate(token_id_list):
                if token_id >= vocab_size:
                    logger.error(f"Token ID {token_id} out of range, adjusting to {vocab_size - 1}")
                    token_id_list[i] = vocab_size - 1  # Adjust to the maximum valid token ID

        return tokenized_output
    except Exception as e:
        logger.error(f"Tokenization error: {e}")
        return {"input_ids": [], "attention_mask": []}

def preprocess_data(chunk_data):
    try:
        if isinstance(chunk_data, dict):
            chunk_data['text'] = str(chunk_data.get('text', ''))
        else:
            chunk_data = {"text": str(chunk_data)}
        chunk_data['text'] = clean_text(chunk_data['text'])
        if len(chunk_data['text'].split()) > 4096:
            chunk_data['text'] = split_large_text(chunk_data['text'])
        return chunk_data
    except json.JSONDecodeError as e:
        logger.error(f"JSON decode error: {e}")
        return {"text": ""}

def save_chunk(data, split_dir, chunk_index):
    output_shard = os.path.join(split_dir, f"tokenized_chunk_{chunk_index}.jsonl")
    with open(output_shard, 'a', encoding='utf-8') as f:
        for item in data:
            json_str = json.dumps(item) + "\n"
            f.write(json_str)

def validate_tokenized_data(tokenized_datasets):
    """
    Validate that all token IDs in the tokenized datasets are within the valid range.
    """
    for example in tokenized_datasets:
        input_ids = example['input_ids']
        if any(token_id >= vocab_size for token_id in input_ids):
            return False
    return True

def process_chunk(chunk_data, chunk_index, split_dir):
    all_data = [preprocess_data(json.loads(line)) for line in chunk_data]
    dataset = Dataset.from_dict({"text": [d["text"] for d in all_data]})
    tokenized_datasets = dataset.map(tokenize_function, batched=True, batch_size=2048, remove_columns=["text"], num_proc=1)

    # Verify token IDs are within the valid range
    valid = validate_tokenized_data(tokenized_datasets)
    
    if not valid:
        logger.error(f"Token IDs out of range in chunk {chunk_index}. Adjusting token IDs.")
        for example in tokenized_datasets:
            input_ids = example['input_ids']
            adjusted_input_ids = []
            for token_id in input_ids:
                if token_id >= vocab_size:
                    logger.warning(f"Token ID {token_id} out of range, adjusting to {vocab_size - 1}")
                    token_id = vocab_size - 1  # Adjust out-of-range token IDs
                adjusted_input_ids.append(token_id)
            example['input_ids'] = adjusted_input_ids[:tokenizer.model_max_length]
            example['attention_mask'] = example['attention_mask'][:tokenizer.model_max_length]

    save_chunk(tokenized_datasets, split_dir, chunk_index)

def load_and_tokenize_in_chunks(file_path, chunk_size=50000):
    chunk_index = 0
    chunk_data = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            chunk_data.append(line)
            if len(chunk_data) >= chunk_size:
                split_dir = select_split_dir()
                process_chunk(chunk_data.copy(), chunk_index, split_dir)
                chunk_data = []  # Reset the buffer
                chunk_index += 1

    # Ensure to save any remaining data
    if chunk_data:
        split_dir = select_split_dir()
        process_chunk(chunk_data, chunk_index, split_dir)

def select_split_dir():
    """
    Randomly select a directory (train, test, or val) based on the desired split ratio.
    """
    rand_num = random.random()
    if rand_num < 0.90:
        return train_dir
    elif rand_num < 0.95:
        return test_dir
    else:
        return val_dir

def process_file(file_path):
    try:
        load_and_tokenize_in_chunks(file_path)
        return file_path
    except Exception as e:
        logger.error(f"Error processing file {file_path}: {e}")
        return None

def main():
    all_files = glob.glob(os.path.join(input_data_dir, "shard_*.jsonl"))

    # Load processed files cache
    processed_files_cache = os.path.join(output_data_dir, 'processed_files_cache.json')
    if os.path.exists(processed_files_cache):
        with open(processed_files_cache, 'r') as f:
            processed_files = set(json.load(f))
    else:
        processed_files = set()

    # Filter out already processed files
    all_files = [f for f in all_files if f not in processed_files]

    # Shuffle the files for random processing
    random.shuffle(all_files)

    # Use 40 worker processes (the machine has 48 vCPUs)
    num_workers = 40
    with Pool(num_workers) as pool:
        # Use imap_unordered to apply process_file to each file in parallel
        for processed_file in tqdm(pool.imap_unordered(process_file, all_files), total=len(all_files), desc="Processing Files"):
            if processed_file:
                processed_files.add(processed_file)
                with open(processed_files_cache, 'w') as f:
                    json.dump(list(processed_files), f)

if __name__ == "__main__":
    main()

@ArthurZucker (Collaborator)

Your vocab_size should just be len(tokenizer), and it should be computed after you add any new tokens:

# Load tokenizer
model_name = 'meta-llama/Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, use_fast=True)

-# Get the correct vocabulary size
-vocab_size = len(tokenizer.get_vocab())

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

+# Get the correct vocabulary size after adding padding
+vocab_size = len(tokenizer.get_vocab())
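
As a follow-up to the diff: when a brand-new [PAD] token is added, the model's embedding matrix presumably also has to grow, otherwise the new ID still indexes past the table and the Indexing.cu assertion from the original report can persist. A sketch of that, assuming the model is loaded with AutoModelForCausalLM (the model-loading code is not shown in this issue; hf_... is a placeholder):

from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, token="hf_...")
model = AutoModelForCausalLM.from_pretrained(model_name, token="hf_...")

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # Grow the input (and tied output) embeddings to cover the new ID.
    model.resize_token_embeddings(len(tokenizer))

# Compute the size after adding tokens, as in the diff above.
vocab_size = len(tokenizer)
assert model.get_input_embeddings().num_embeddings == vocab_size

An alternative that avoids resizing altogether is to reuse an existing special token as the pad token (for example, tokenizer.pad_token = tokenizer.eos_token), at the cost of conflating padding with end-of-text.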
