Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

qwen2 lora微调报错:RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn #172

Closed
ArlanCooper opened this issue Jun 18, 2024 · 3 comments

Comments

@ArlanCooper
Copy link

环境按照文档进行安装:https://github.com/datawhalechina/self-llm/blob/master/Qwen2/05-Qwen2-7B-Instruct%20Lora%20%E5%BE%AE%E8%B0%83.md

python版本:3.10.12
cuda: 12.1
os: ubuntu12
运行代码:

# -*- coding: utf-8 -*-

from datasets import Dataset
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    DataCollatorForSeq2Seq,
)
from peft import get_peft_model
from app.configs.config_llm import ConfigQwen, config_lora, training_args


class Qwen2trainer:
    """LoRA fine-tuning driver for Qwen2.

    Loads the training JSON, tokenizes it into (input_ids, attention_mask,
    labels) triples with the Qwen2 chat template, wraps the base model with
    a PEFT LoRA adapter, and runs a HuggingFace ``Trainer``.
    """

    def __init__(self):
        # Paths and limits come from the project config module.
        self.train_path = ConfigQwen.train_data_path
        self.model_path = ConfigQwen.model_path
        df = pd.read_json(self.train_path)
        ds = Dataset.from_pandas(df)
        self.max_length = ConfigQwen.MAX_LENGTH
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_path, use_fast=False, trust_remote_code=True
        )
        # Map each raw example to token ids; drop the original text columns.
        self.tokenized_id = ds.map(
            self.process_func, remove_columns=ds.column_names
        )
        base_model = AutoModelForCausalLM.from_pretrained(
            self.model_path, device_map="auto", trust_remote_code=True
        )
        # Required when gradient checkpointing is enabled: it makes the
        # embedding outputs require grad so the backward graph can reach the
        # trainable LoRA parameters even though the base weights are frozen.
        # Without this, training fails with:
        #   RuntimeError: element 0 of tensors does not require grad and
        #   does not have a grad_fn
        base_model.enable_input_require_grads()
        self.full_model = get_peft_model(base_model, config_lora)

    def process_func(self, example):
        """Tokenize one example into supervised-fine-tuning tensors.

        Builds the Qwen2 chat prompt (system + user turns) followed by the
        assistant response plus a trailing pad/eos token. Prompt tokens are
        masked out of the loss with -100; only the response (and the trailing
        token) contribute to the loss.

        Args:
            example: dict with ``instruction``, ``input`` and ``output`` keys.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``labels`` lists,
            each truncated to ``self.max_length``.
        """
        max_length = self.max_length  # cap on total sequence length
        instruction = self.tokenizer(
            f"<|im_start|>system\n你是一个有用的助手<|im_end|>\n<|im_start|>user\n{example['instruction'] + example['input']}<|im_end|>\n<|im_start|>assistant\n",
            add_special_tokens=False,
        )  # add_special_tokens=False: no BOS/EOS prepended to the prompt
        response = self.tokenizer(f"{example['output']}", add_special_tokens=False)
        input_ids = (
            instruction["input_ids"]
            + response["input_ids"]
            + [self.tokenizer.pad_token_id]
        )
        attention_mask = (
            instruction["attention_mask"] + response["attention_mask"] + [1]
        )  # the trailing eos/pad token must be attended to as well
        labels = (
            [-100] * len(instruction["input_ids"])  # mask prompt from the loss
            + response["input_ids"]
            + [self.tokenizer.pad_token_id]
        )
        if len(input_ids) > max_length:  # simple right-truncation
            input_ids = input_ids[:max_length]
            attention_mask = attention_mask[:max_length]
            labels = labels[:max_length]
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

    def train(self):
        """Run LoRA fine-tuning with the HuggingFace Trainer."""
        trainer = Trainer(
            model=self.full_model,
            args=training_args,
            train_dataset=self.tokenized_id,
            # Dynamic padding per batch; labels are padded with -100 by the
            # seq2seq collator so padding never contributes to the loss.
            data_collator=DataCollatorForSeq2Seq(
                tokenizer=self.tokenizer, padding=True
            ),
        )
        trainer.train()


if __name__ == "__main__":
    # Script entry point: build the fine-tuning driver and start training.
    runner = Qwen2trainer()
    runner.train()


报错信息:

Detected kernel version 4.19.118, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  0%|                                                                                   | 0/53850 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
/home/powerop/work/conda/envs/qwen2/lib/python3.10/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  warnings.warn(
/home/powerop/work/conda/envs/qwen2/lib/python3.10/site-packages/torch/utils/checkpoint.py:91: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
  warnings.warn(
Traceback (most recent call last):
  File "/home/powerop/work/rwq/llm_finetune/app/finetune_qwen2.py", line 92, in <module>
    qwen2.train()
  File "/home/powerop/work/rwq/llm_finetune/app/finetune_qwen2.py", line 87, in train
    trainer.train()
  File "/home/powerop/work/conda/envs/qwen2/lib/python3.10/site-packages/transformers/trainer.py", line 1885, in train
    return inner_training_loop(
  File "/home/powerop/work/conda/envs/qwen2/lib/python3.10/site-packages/transformers/trainer.py", line 2216, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/home/powerop/work/conda/envs/qwen2/lib/python3.10/site-packages/transformers/trainer.py", line 3250, in training_step
    self.accelerator.backward(loss)
  File "/home/powerop/work/conda/envs/qwen2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1966, in backward
    loss.backward(**kwargs)
  File "/home/powerop/work/conda/envs/qwen2/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
    torch.autograd.backward(
  File "/home/powerop/work/conda/envs/qwen2/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
    _engine_run_backward(
  File "/home/powerop/work/conda/envs/qwen2/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn



@KMnO4-zx
Copy link
Contributor

看样子应该是少了c++的什么环境,在教程制作过程中未曾遇到你的问题

@ArlanCooper
Copy link
Author

看样子应该是少了c++的什么环境,在教程制作过程中未曾遇到你的问题
请问一下有什么解决方案吗?

@ArlanCooper
Copy link
Author

我这边已经解决了哈,少敲了一行命令: model.enable_input_require_grads() # 开启梯度检查点时,要执行该方法

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants