
ImportError: cannot import name 'log' from 'torch.distributed.elastic.agent.server.api' during fine-tuning of ChatGLM3-6B #1308

Open
Scorponok31 opened this issue Aug 5, 2024 · 0 comments
Scorponok31 commented Aug 5, 2024

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/chatglm3/ChatGLM3/finetune_demo/finetune_h │
│ f.py:537 in main │
│ │
│ 534 │ ) │
│ 535 │ │
│   536 │   if auto_resume_from_checkpoint.upper() == "" or auto_resume_from_checkpoint is None: │
│ ❱ 537 │ │ trainer.train() │
│ 538 │ else: │
│ 539 │ │ def do_rf_checkpoint(sn): │
│ 540 │ │ │ model.gradient_checkpointing_enable() │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/transformers/ │
│ trainer.py:1885 in train │
│ │
│ 1882 │ │ │ finally: │
│ 1883 │ │ │ │ hf_hub_utils.enable_progress_bars() │
│ 1884 │ │ else: │
│ ❱ 1885 │ │ │ return inner_training_loop( │
│ 1886 │ │ │ │ args=args, │
│ 1887 │ │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1888 │ │ │ │ trial=trial, │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/transformers/ │
│ trainer.py:2022 in _inner_training_loop │
│ │
│ 2019 │ │ │ │
│ 2020 │ │ │ self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradi │
│ 2021 │ │ │
│ ❱ 2022 │ │ model = self._wrap_model(self.model_wrapped) │
│ 2023 │ │ │
│ 2024 │ │ # as the model is wrapped, don't use accelerator.prepare │
│ 2025 │ │ # this is for unhandled cases such as │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/transformers/ │
│ trainer.py:1640 in _wrap_model │
│ │
│ 1637 │ │ │ return smp.DistributedModel(model, backward_passes_per_step=self.args.gradie │
│ 1638 │ │ │
│ 1639 │ │ # train/eval could be run multiple-times - if already wrapped, don't re-wrap it │
│ ❱ 1640 │ │ if self.accelerator.unwrap_model(model) is not model: │
│ 1641 │ │ │ return model │
│ 1642 │ │ │
│ 1643 │ │ # Mixed precision training with apex (torch < 1.6) │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/accelerate/ac │
│ celerator.py:2540 in unwrap_model │
│ │
│ 2537 │ │ MyModel │
│ 2538 │ │ ``` │
│ 2539 │ │ """ │
│ ❱ 2540 │ │ return extract_model_from_parallel(model, keep_fp32_wrapper) │
│ 2541 │ │
│ 2542 │ def wait_for_everyone(self): │
│ 2543 │ │ """ │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/accelerate/ut │
│ ils/other.py:80 in extract_model_from_parallel │
│ │
│ 77 │ │ model = model._orig_mod │
│ 78 │ │
│ 79 │ if is_deepspeed_available(): │
│ ❱ 80 │ │ from deepspeed import DeepSpeedEngine │
│ 81 │ │ │
│ 82 │ │ options += (DeepSpeedEngine,) │
│ 83 │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/__i │
│ nit__.py:22 in <module> │
│ │
│ 19 │ HAS_TRITON = False │
│ 20 │
│ 21 from . import ops │
│ ❱ 22 from . import module_inject │
│ 23 │
│ 24 from .accelerator import get_accelerator │
│ 25 from .runtime.engine import DeepSpeedEngine, DeepSpeedOptimizerCallable, DeepSpeedSchedu │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/mod │
│ ule_inject/__init__.py:6 in <module> │
│ │
│ 3 │
│ 4 # DeepSpeed Team │
│ 5 │
│ ❱ 6 from .replace_module import replace_transformer_layer, revert_transformer_layer, Replace │
│ 7 from .module_quantize import quantize_transformer_layer │
│ 8 from .replace_policy import HFBertLayerPolicy │
│ 9 from .layers import LinearAllreduce, LinearLayer, EmbeddingLayer, Normalize │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/mod │
│ ule_inject/replace_module.py:607 in <module> │
│ │
│ 604 │ return replaced_module │
│ 605 │
│ 606 │
│ ❱ 607 from ..pipe import PipelineModule │
│ 608 │
│ 609 import re │
│ 610 │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/pip │
│ e/__init__.py:6 in <module> │
│ │
│ 3 │
│ 4 # DeepSpeed Team │
│ 5 │
│ ❱ 6 from ..runtime.pipe import PipelineModule, LayerSpec, TiedLayerSpec │
│ 7 │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/run │
│ time/pipe/__init__.py:6 in <module> │
│ │
│ 3 │
│ 4 # DeepSpeed Team │
│ 5 │
│ ❱ 6 from .module import PipelineModule, LayerSpec, TiedLayerSpec │
│ 7 from .topology import ProcessTopology │
│ 8 │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/run │
│ time/pipe/module.py:19 in <module> │
│ │
│ 16 │
│ 17 from deepspeed.utils import logger │
│ 18 from .. import utils as ds_utils │
│ ❱ 19 from ..activation_checkpointing import checkpointing │
│ 20 from .topology import PipeDataParallelTopology, PipelineParallelGrid │
│ 21 from deepspeed.runtime.state_dict_factory import SDLoaderFactory │
│ 22 from deepspeed.accelerator import get_accelerator │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/run │
│ time/activation_checkpointing/checkpointing.py:26 in <module> │
│ │
│ 23 import mmap │
│ 24 from torch import _C │
│ 25 │
│ ❱ 26 from deepspeed.runtime.config import DeepSpeedConfig │
│ 27 from deepspeed.utils import logger │
│ 28 from deepspeed.runtime.utils import copy_to_device, move_to_device, see_memory_usage, bw │
│ 29 from deepspeed.utils.timer import SynchronizedWallClockTimer as Timers, FORWARD_GLOBAL_T │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/run │
│ time/config.py:41 in <module> │
│ │
│    38 from ..git_version_info import version as __version__ │
│ 39 from ..utils import logger │
│ 40 │
│ ❱ 41 from ..elasticity import ( │
│ 42 │ elasticity_enabled, │
│ 43 │ compute_elastic_config, │
│ 44 │ ensure_immutable_elastic_config, │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/ela │
│ sticity/__init__.py:10 in <module> │
│ │
│ 7 from .utils import is_torch_elastic_compatible │
│ 8 from .constants import ENABLED, ENABLED_DEFAULT, ELASTICITY │
│ 9 if is_torch_elastic_compatible(): │
│ ❱ 10 │ from .elastic_agent import DSElasticAgent │
│ 11 │
│ │
│ /home/vivoadmin/miniconda3/envs/chatglm3_test20240726/lib/python3.12/site-packages/deepspeed/ela │
│ sticity/elastic_agent.py:9 in <module> │
│ │
│ 6 from torch.distributed.elastic.agent.server.local_elastic_agent import LocalElasticAgent │
│ 7 from typing import Any, Dict, Optional, Tuple │
│ 8 from datetime import datetime │
│ ❱ 9 from torch.distributed.elastic.agent.server.api import log, _get_socket_with_port │
│ 10 from torch.distributed.elastic.metrics import put_metric │
│ 11 from torch.distributed.elastic.agent.server.api import ( │
│ 12 │ RunResult, │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
ImportError: cannot import name 'log' from 'torch.distributed.elastic.agent.server.api'

ComplelteError.txt

I hit this error when fine-tuning with either LoRA or P-Tuning v2, using the official demo, and it occurs on both single-GPU and multi-GPU runs. Could someone help me figure out how to fix it? Environment: Linux server with two L40S GPUs.
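
For anyone triaging this: the bottom frame shows deepspeed's elasticity/elastic_agent.py importing `log` and `_get_socket_with_port` from `torch.distributed.elastic.agent.server.api`, and recent PyTorch releases no longer export those names. The failure is therefore a torch/deepspeed version mismatch rather than anything in the fine-tuning demo, and it can be reproduced without the demo at all. A minimal diagnostic sketch (the script name is made up; nothing here is official ChatGLM3 code):

```python
# check_elastic_api.py -- reproduce the failing import without deepspeed.
# If either attribute is missing, the installed torch no longer exports the
# names this deepspeed release expects at import time.
import torch
import torch.distributed.elastic.agent.server.api as elastic_api

print("torch:", torch.__version__)
print("has 'log':", hasattr(elastic_api, "log"))
print("has '_get_socket_with_port':", hasattr(elastic_api, "_get_socket_with_port"))
```

If both checks print False, aligning the two packages is the usual remedy: upgrade deepspeed to a release that supports the installed torch, or pin torch to an older version that the installed deepspeed supports.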

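If upgrading deepspeed is not an option, a local shim at the failing import site can bridge the rename. This is a hedged sketch, not a confirmed fix: it assumes newer torch exposes the same module-level logger under the name `logger`, which should be verified against the installed torch before relying on it, and the adjacent `_get_socket_with_port` import would need similar treatment.

```python
# Sketch of a compatibility shim for the removed 'log' symbol.
# Assumption (verify locally): newer torch renamed the module logger in
# torch/distributed/elastic/agent/server/api.py from 'log' to 'logger'.
try:
    from torch.distributed.elastic.agent.server.api import log  # older torch
except ImportError:
    from torch.distributed.elastic.agent.server.api import logger as log  # newer torch
```
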
@Scorponok31 Scorponok31 reopened this Aug 6, 2024
@zRzRzRzRzRzRzR zRzRzRzRzRzRzR self-assigned this Sep 4, 2024