[Question] Training with Qwen2 backend got loss 0 #1153

Open
lucasjinreal opened this issue Feb 20, 2024 · 42 comments

@lucasjinreal

Question

I got a loss of 0 when training with the Qwen2 backend:

{'loss': 0.0, 'learning_rate': 0.00015267175572519084, 'epoch': 0.0}
0%|▎ | 20/8720 [01:38<11:01:39, 4.56s/it]WARNING: tokenization mismatch: 47 vs. 48. (ignored)
WARNING: tokenization mismatch: 54 vs. 55. (ignored)
WARNING: tokenization mismatch: 46 vs. 47. (ignored)
WARNING: tokenization mismatch: 43 vs. 44. (ignored)

What could be the reason for this?
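For context, the mismatch warnings and the zero loss are linked: LLaVA's preprocessing masks out an entire sample whenever its length bookkeeping disagrees with the tokenizer output, so every label becomes IGNORE_INDEX and no token contributes to the loss. The pattern (simplified from the preprocessing code shown later in this thread) is:

# When the recomputed length does not match the tokenized length, the whole target
# is ignored, which makes the reported loss collapse to 0.
if cur_len < tokenizer.model_max_length:
    if cur_len != total_len:
        target[:] = IGNORE_INDEX
        print(
            f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
            f" (ignored)"
        )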

@yiyexy

yiyexy commented Feb 21, 2024

me too

@yiyexy

yiyexy commented Feb 21, 2024

I found that the reason for this problem is a difference in tokenizer rules.
The bos_token is null and the eos_token is set to "<|endoftext|>" in the Qwen tokenizer configuration.
So I added a Qwen tokenizer rule in /mnt2/yinxie/code/LLaVA/llava/conversation.py as follows:

class SeparatorStyle(Enum):
    """Different separator style."""
    SINGLE = auto()
    TWO = auto()
    MPT = auto()
    PLAIN = auto()
    LLAMA_2 = auto()
    QWEN_2 = auto()

# Branch added inside Conversation.get_prompt(), after the existing sep_style branches:
def get_prompt(self):
    messages = self.messages
    # ... existing branches for the other separator styles ...
    elif self.sep_style == SeparatorStyle.QWEN_2:
        seps = [self.sep, self.sep2]
        ret = self.system + seps[0]
        for i, (role, message) in enumerate(messages):
            if message:
                if type(message) is tuple:
                    message, _, _ = message
                ret += role + ": " + message + seps[i % 2]
            else:
                ret += role + ":"
    # ...
    return ret

conv_qwen_2 = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="qwen_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.QWEN_2,
    sep=" ",
    sep2="<|endoftext|>",
)
conv_templates = {
    "default": conv_vicuna_v0,
    "v0": conv_vicuna_v0,
    "v1": conv_vicuna_v1,
    "vicuna_v1": conv_vicuna_v1,
    "qwen_2": conv_qwen_2,
    "llama_2": conv_llama_2,
    "mistral_instruct": conv_mistral_instruct,
    "chatml_direct": conv_chatml_direct,
    "mistral_direct": conv_chatml_direct,

    "plain": conv_llava_plain,
    "v0_plain": conv_llava_plain,
    "llava_v0": conv_llava_v0,
    "v0_mmtag": conv_llava_v0_mmtag,
    "llava_v1": conv_llava_v1,
    "v1_mmtag": conv_llava_v1_mmtag,
    "llava_llama_2": conv_llava_llama_2,

    "mpt": conv_mpt,
}

Then I added the preprocess_qwen_2 method in train.py:

def preprocess_qwen_2(
    sources,
    tokenizer: transformers.PreTrainedTokenizer,
    has_image: bool = False
) -> Dict:
    conv = conversation_lib.default_conversation.copy()
    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}

    # Apply prompt templates
    conversations = []
    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != conv.roles[0]:
            # Skip the first one if it is not from human
            source = source[1:]

        conv.messages = []
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            assert role == conv.roles[j % 2], f"{i}"
            conv.append_message(role, sentence["value"])
        conversations.append(conv.get_prompt())

    # Tokenize conversations

    if has_image:
        input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
    else:
        input_ids = tokenizer(
            conversations,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        ).input_ids

    targets = input_ids.clone()

    assert conv.sep_style == conversation_lib.SeparatorStyle.QWEN_2

    # Mask targets
    sep = conv.sep + conv.roles[1] + ": "
    for conversation, target in zip(conversations, targets):
        total_len = int(target.ne(tokenizer.pad_token_id).sum())

        rounds = conversation.split(conv.sep2)
        rounds_len = len(rounds)
        cur_len = 0
        # target[:cur_len] = IGNORE_INDEX
        for i, rou in enumerate(rounds):
            if rou == "":
                break

            parts = rou.split(sep)
            if len(parts) != 2:
                break
            parts[0] += sep

            if has_image:
                round_ids = tokenizer_image_token(rou, tokenizer)
                instruction_ids = tokenizer_image_token(parts[0], tokenizer)
                equal_parts = [x == y for x, y in zip(round_ids, instruction_ids)]

                instruction_len = equal_parts.index(False) if False in equal_parts else len(equal_parts)
                round_len = len(round_ids)

            else:
                round_ids = tokenizer(rou).input_ids
                instruction_ids = tokenizer(parts[0]).input_ids
                equal_parts = [x == y for x, y in zip(round_ids, instruction_ids)]
            
                instruction_len = equal_parts.index(False) if False in equal_parts else len(equal_parts)
                round_len = len(round_ids)

            if i != 0 and not tokenizer.legacy and IS_TOKENIZER_GREATER_THAN_0_14:
                round_len += 1
                instruction_len += 1

            target[cur_len : cur_len + instruction_len] = IGNORE_INDEX

            cur_len += round_len
        target[cur_len:] = IGNORE_INDEX

        if cur_len < tokenizer.model_max_length:
            if cur_len != total_len + rounds_len - 2:
                target[:] = IGNORE_INDEX
                print(
                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
                    f" (ignored)"
                )

    return dict(
        input_ids=input_ids,
        labels=targets,
    )

def preprocess(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    has_image: bool = False
) -> Dict:
    if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN:
        return preprocess_plain(sources, tokenizer)
    if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.LLAMA_2:
        return preprocess_llama_2(sources, tokenizer, has_image=has_image)
    if conversation_lib.default_conversation.version.startswith("v1"):
        return preprocess_v1(sources, tokenizer, has_image=has_image)
    if conversation_lib.default_conversation.version == "mpt":
        return preprocess_mpt(sources, tokenizer, has_image=has_image)
    if conversation_lib.default_conversation.version.startswith("qwen_v2"):
        return preprocess_qwen_2(sources, tokenizer, has_image=has_image)

After these operations, the mismatch warning disappeared.

However, I must mention that I don't have GPUs for training now, so there may be other problems.

Hope this helps you.
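As a quick way to confirm the special-token difference described above, the tokenizer can be inspected directly (a minimal sketch; the checkpoint name is illustrative and should be replaced with the Qwen model actually being trained):

from transformers import AutoTokenizer

# Illustrative checkpoint name; substitute the Qwen model you are actually using.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat")
print(tok.bos_token)  # reported as null/None for the Qwen tokenizer in this thread
print(tok.eos_token)  # reported as "<|endoftext|>" in this thread
print(tok.pad_token)  # if this is None, padding-based length checks downstream will fail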

@yiyexy

yiyexy commented Feb 21, 2024

I found that the reason for this problem is a difference in tokenizer rules: the bos_token is null and the eos_token is set to "<|endoftext|>" in the Qwen tokenizer configuration. So I added a Qwen rule in conversation.py and a preprocess_qwen_2 method in train.py, as in the previous comment. After these operations, the mismatch warning disappeared.

Okay, after making this change, I trained the model; the loss appears to be normal and the mismatch warning disappeared. I trained the MM adapter from scratch on top of the pretrained Qwen-7B LLM.
(screenshot of training loss)

@lucasjinreal

@yiyexy hello, nice catch. Training is normal now.
Did you train on the LLaVA pretrain data? Is there any pretrain data that could be used for Chinese enhancement?

@yiyexy

yiyexy commented Feb 22, 2024

@yiyexy hello, nice catch. Training is normal now. Did you train on the LLaVA pretrain data? Is there any pretrain data that could be used for Chinese enhancement?

Yes, I trained on LLaVA pretrain data. Unfortunately, I don't have data to enhance the model's capability in Chinese. By the way, I'm currently developing a new data processing pipeline which may solve this problem one day.

@lucasjinreal

@yiyexy Would you consider sharing your processing pipeline? Which part of the problem does it solve? There is some Chinese data, but I think its quality is poor.

@yiyexy

yiyexy commented Feb 22, 2024

@lucasjinreal I will. But there are still some problems to be solved. It's a long way to go.

@lucasjinreal

@yiyexy Hello, your loss doesn't look like stage 1?

BTW, you should probably use the qwen1.5-7b-chat model. Otherwise you cannot do SFT efficiently.

However, Qwen uses the ChatML chat format, not the LLaVA default.

How do you change it?

@20191864218

I found that the reason for this problem is a difference in tokenizer rules: the bos_token is null and the eos_token is set to "<|endoftext|>" in the Qwen tokenizer configuration. So I added a Qwen rule in conversation.py and a preprocess_qwen_2 method in train.py, as in the earlier comments. After these operations, the mismatch warning disappeared and the training loss looked normal.

Hi, I hope to replace the LLM with Qwen, and I have added it according to your code, but I encountered the following error. How can I resolve this?

Original Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/root/miniconda3/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/root/miniconda3/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/root/LLaVA/llava/train/train.py", line 821, in __getitem__
    data_dict = preprocess(
  File "/root/LLaVA/llava/train/train.py", line 726, in preprocess
    return preprocess_qwen_2(sources, tokenizer, has_image=has_image)
  File "/root/LLaVA/llava/train/train.py", line 652, in preprocess_qwen_2
    total_len = int(target.ne(tokenizer.pad_token_id).sum())
TypeError: ne() received an invalid combination of arguments - got (NoneType), but expected one of:
 * (Tensor other)
      didn't match because some of the arguments have invalid types: (NoneType)
 * (Number other)
      didn't match because some of the arguments have invalid types: (NoneType)

@yiyexy

yiyexy commented Feb 23, 2024

@yiyexy Hello, your loss doesn't look like stage 1?

BTW, you should probably use the qwen1.5-7b-chat model. Otherwise you cannot do SFT efficiently.

However, Qwen uses the ChatML chat format, not the LLaVA default.

How do you change it?

You are right. The loss is from stage 2.

And I used the qwen1.5-7b-chat model for this stage.

BTW, I didn't run into problems with the format.

The SFT training is normal. Maybe I overlooked something.

@yiyexy

yiyexy commented Feb 23, 2024

@20191864218 Maybe you need to set some parameters for Qwen1.5. See #1146.
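The parameter in question here is most likely the missing pad token that triggered the traceback above. A minimal sketch of that kind of setting (an assumption, not code copied from #1146):

# Sketch: make sure the tokenizer has a pad token before target.ne(tokenizer.pad_token_id)
# is evaluated. Whether "<|endoftext|>" is the right choice depends on the Qwen checkpoint.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token is not None else "<|endoftext|>"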

@lucasjinreal

@yiyexy Using the LLaVA template on a Qwen chat model might introduce unwanted output during chat. This is a common issue. Qwen uses the ChatML format, which uses <|im_end|> as the separator.
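For illustration, a ChatML-style template in conversation.py would wrap each turn in <|im_start|>/<|im_end|> markers, along the lines of the existing conv_chatml_direct entry (a hypothetical sketch, not code taken from this thread):

# Hypothetical ChatML-style template mirroring LLaVA's existing conv_chatml_direct:
# the role strings carry <|im_start|>, and sep closes every turn with <|im_end|>.
conv_qwen_chatml = Conversation(
    system="<|im_start|>system\nYou are a helpful assistant.",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="qwen_chatml",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)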

@yiyexy

yiyexy commented Feb 23, 2024

@yiyexy Using the LLaVA template on a Qwen chat model might introduce unwanted output during chat. This is a common issue. Qwen uses the ChatML format, which uses <|im_end|> as the separator.

Thanks for your reminder. I will pay attention to this issue. I haven't trained a llava-qwen model due to a lack of GPU resources and other work commitments.

I will train a llava-qwen model as soon as possible and share the result with you.

@lucasjinreal

@yiyexy Thank you. I am doing the finetune stage now. Possibly I will try converting to the ChatML format to see what happens; hoping for your result.

@20191864218

@20191864218 Maybe you need to set some parameters for Qwen1.5. See #1146.

Thank you, but I've encountered some issues after making the changes. Could you help me with it?
I left a comment on the link you provided.

@BlueBlueFF

@yiyexy Thank you. I am doing the finetune stage now. Possibly I will try converting to the ChatML format to see what happens; hoping for your result.

So are you using qwen-chat for the LLaVA SFT?

@lucasjinreal

Yes, I am training with the ChatML format now and will update info here.

This is how the Qwen-1.8B stage 2 loss currently goes:

{'loss': 2.5544, 'learning_rate': 8.585365853658537e-06, 'epoch': 0.01}                                                                                                                                              
{'loss': 2.4306, 'learning_rate': 8.682926829268294e-06, 'epoch': 0.01}                                                                                                                                              
{'loss': 2.584, 'learning_rate': 8.78048780487805e-06, 'epoch': 0.01}                                                                                                                                                
{'loss': 2.6411, 'learning_rate': 8.878048780487806e-06, 'epoch': 0.01}                                                                                                                                              
{'loss': 2.4981, 'learning_rate': 8.975609756097562e-06, 'epoch': 0.01}                                                                                                                                              
{'loss': 2.4692, 'learning_rate': 9.073170731707319e-06, 'epoch': 0.01}                                                                                                                                              
{'loss': 2.3996, 'learning_rate': 9.170731707317075e-06, 'epoch': 0.01}   
{'loss': 2.3016, 'learning_rate': 9.170731707317075e-06, 'epoch': 0.01}   

@liuheng0111

liuheng0111 commented Feb 26, 2024

Question

I got a loss of 0 when training with the Qwen2 backend:

{'loss': 0.0, 'learning_rate': 0.00015267175572519084, 'epoch': 0.0} 0%|▎ | 20/8720 [01:38<11:01:39, 4.56s/it]WARNING: tokenization mismatch: 47 vs. 48. (ignored) WARNING: tokenization mismatch: 54 vs. 55. (ignored) WARNING: tokenization mismatch: 46 vs. 47. (ignored) WARNING: tokenization mismatch: 43 vs. 44. (ignored)

What could be the reason for this?

@lucasjinreal I met the same problem. Can you share your code for using the qwen1.5-chat LLM?

@lucasjinreal

Hi, I have finished training.

I found that Qwen-4B can get reasonable performance:

(screenshot of model output)

But the OCR ability is still not very good; any suggestions to enhance OCR ability? (Chinese open data)

@liuheng0111

I used qwen1.5-7b-chat; the pretrain stage is normal, but the SFT stage loss is zero. I checked that the conversations are aligned. Are there any suggestions, @lucasjinreal? During training I got a warning: checkpoint.py:61: UserWarning: None of the inputs have requires_grad=True. Gradients will be None. Can the warning be ignored?

@lucasjinreal

Seems like the inputs have None values. Check the data or add some assertions.
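A minimal sanity check of the kind suggested here might look like the following (hypothetical sketch; the batch keys assume LLaVA's usual input_ids/labels fields and its IGNORE_INDEX constant):

# Hypothetical guard before the forward pass: fail fast instead of silently training on
# missing or fully masked samples, which shows up as a loss of exactly 0.
assert batch["input_ids"] is not None and batch["labels"] is not None
assert (batch["labels"] != IGNORE_INDEX).any(), "every label in this batch is masked out"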

@lucasjinreal

For anyone who wants an immediate response and help with training LLaVA, you can join this group:

(QR code image)

If the QR code is outdated, add bojuebot for an invitation.

@20191864218

@20191864218 Maybe you need to set some parameters for Qwen1.5. See #1146.

Hello, do you have a link for replacing the visual encoder?

@20191864218

@yiyexy Using the LLaVA template on a Qwen chat model might introduce unwanted output during chat. This is a common issue. Qwen uses the ChatML format, which uses <|im_end|> as the separator.

Hello, if using the Qwen-7B-base model for finetuning, is it still necessary to use data in the ChatML format? Thank you for your help.

@lucasjinreal

lucasjinreal commented Mar 9, 2024

I think the base model cannot be used in a VLM; it doesn't have chat abilities.

@20191864218

I think the base model cannot be used in a VLM; it doesn't have chat abilities.

I want to create a model solely for generating reports, without requiring strong conversational abilities. Can I use the llava fine-tuning data format when fine-tuning?

@yiyexy

yiyexy commented Mar 19, 2024

I think the base model cannot be used in a VLM; it doesn't have chat abilities.

I want to create a model solely for generating reports, without requiring strong conversational abilities. Can I use the llava fine-tuning data format when fine-tuning?

Did you verify your method? The LLaVA SFT data is designed for QA tasks, so the results might not be good if you use a base model.

@20191864218

I think the base model cannot be used in a VLM; it doesn't have chat abilities.

I want to create a model solely for generating reports, without requiring strong conversational abilities. Can I use the llava fine-tuning data format when fine-tuning?

Did you verify your method? The LLaVA SFT data is designed for QA tasks, so the results might not be good if you use a base model.

I replaced both the LLM and vision encoder, then proceeded with pretraining and finetuning with LoRA. However, I encountered an issue during inference. The specific error is as follows:

(error screenshot)

Additionally, I am attempting to perform inference using the web interface, but it is also not functioning:
The error is that not all tensors are on the same device.
(web UI error screenshot)

I don't know how to handle this. I would be extremely grateful if you could help me.

@yiyexy

yiyexy commented Mar 19, 2024

@20191864218 This error appears to be due to a corrupted weight file. Please ensure that your weight file has been saved correctly.

@20191864218

@20191864218 This error appears to be due to a corrupted weight file. Please ensure that your weight file has been saved correctly.

Thank you for your response. I merged the LoRA weights according to the merge_lora_weights.py file in LLaVA. I will double-check where the error occurred. Thanks again.

@20191864218

20191864218 commented Mar 22, 2024

Hi, I have finished training.

I found that Qwen-4B can get reasonable performance:

(screenshot of model output)

But the OCR ability is still not very good; any suggestions to enhance OCR ability? (Chinese open data)

Hello, could I take a look at your cli.py file? I get a lot of errors when running inference. If possible, I would be very grateful.

(screenshot of errors)

@ScottishFold007

me too

me too!!!

@xsw1208

xsw1208 commented Mar 27, 2024

Thank you!

@VincentDENGP

#1146

Hi, thanks for sharing. I am working on stage 1 training using Qwen-1.8B and the training loss did not decrease, while other models of varying scale (1.1B - 34B) work fine. I wonder if any special change is needed for stage 1 training with Qwen?

@yiyexy

yiyexy commented Apr 7, 2024

@VincentDENGP You mean the loss did not decrease only on Qwen? Maybe you need a larger Qwen model? My loss decreased normally with Qwen-7B in stage 1. I will check out this PR later to avoid any differences.

@VincentDENGP

@VincentDENGP You mean the loss did not decrease only on Qwen? Maybe you need a larger Qwen model? My loss decreased normally with Qwen-7B in stage 1. I will check out this PR later to avoid any differences.

@yiyexy Thanks for the suggestion. I just did a quick experiment, and the loss decreases normally on Qwen-7B. However, regarding parameter size, I conducted two additional experiments; it is weird that the loss for both TinyLlama 1.1B and StableLM 1.6B decreases normally, while only Qwen1.5-0.5B and Qwen1.5-1.8B do not.

@Nastu-Ho

@yiyexy Thank you. I am doing the finetune stage now. Possibly I will try converting to the ChatML format to see what happens; hoping for your result.

Hey, can you share the code for making the LLM backend work with Qwen2?

@Nastu-Ho

Yes, I am training with the ChatML format now and will update info here. This is how the Qwen-1.8B stage 2 loss currently goes (loss log quoted above).

Hey, can you share the code for making the LLM backend work with qwen-7b?

@zealot52099

zealot52099 commented Jun 20, 2024

Hello! I used CC3M-Pretrain-595K to pretrain Qwen2-1.5B and a small amount of Chinese data (about 1000 samples) for finetuning. However, when I use the following code to run inference:
#################################################
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path
from llava.eval.run_llava import eval_model

model_path = "./checkpoints/finetune-llava-qwen2-1.5b-zhipu-img-token-v3"
prompt = "前面有什么?"  # "What is ahead?"
image_file = "test2.jpg"

args = type('Args', (), {
    "model_path": model_path,
    "model_name": get_model_name_from_path(model_path),
    "model_base": None,
    "query": prompt,
    "conv_mode": None,
    "image_file": image_file,
    "sep": ",",
    "temperature": 0,
    "top_p": None,
    "num_beams": 1,
    "max_new_tokens": 512
    # "max_new_tokens": 50
})()

eval_model(args)
#################################################

I got a nonsense response like:
daараметCharacterSet�[�觉民用琨网络游戏蜒男人inent NSMutable楽しい OnCollision ContinentTLremiumhawks部這╔公司在냅 Seleensive

Can anyone help? Thanks!

@TobyYang7

I recently trained with Qwen2. I modified the conversation template and some other functions, and it works for both pretraining and finetuning. Here is my working repository: https://github.com/TobyYang7/Llava_Qwen2

@Nastu-Ho

I recently trained with Qwen2. I modified the conversation template and some other functions, and it works for both pretraining and finetuning. Here is my working repository: https://github.com/TobyYang7/Llava_Qwen2

Have the results improved after fine-tuning using this template?

@TobyYang7

I recently trained with Qwen2. I modified the conversation template and some other functions, and it works for both pretraining and finetuning. Here is my working repository: https://github.com/TobyYang7/Llava_Qwen2

Have the results improved after fine-tuning using this template?

Due to the limitation of GPU resources, I do not have preliminary results yet. You can prepare the dataset and give it a try.
