-
Notifications
You must be signed in to change notification settings - Fork 17
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
大佬,lora多卡训练报错,帮忙看下 #24
Comments
pip list | grep -E "ing|torch|deep" |
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
大佬,还是不行,还是之前的报错。全量参数微调是没问题,但是lora多卡就报这个错了 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
训练命令如下:
CUDA_VISIBLE_DEVICES=0,1 python train.py
报错信息如下:
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /chatglm2-dev/train.py:122 in <module> │
│ │
│ 119 │ ) │
│ 120 │ │
│ 121 │ if train_datasets is not None: │
│ ❱ 122 │ │ trainer.fit(pl_model, train_dataloaders=train_datasets) │
│ 123 │
│ 124 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 520 in fit │
│ │
│ 517 │ │ """ │
│ 518 │ │ model = _maybe_unwrap_optimized(model) │
│ 519 │ │ self.strategy._lightning_module = model │
│ ❱ 520 │ │ call._call_and_handle_interrupt( │
│ 521 │ │ │ self, self._fit_impl, model, train_dataloaders, val_datal │
│ 522 │ │ ) │
│ 523 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/call.py:42 │
│ in _call_and_handle_interrupt │
│ │
│ 39 │ """ │
│ 40 │ try: │
│ 41 │ │ if trainer.strategy.launcher is not None: │
│ ❱ 42 │ │ │ return trainer.strategy.launcher.launch(trainer_fn, *args, │
│ 43 │ │ else: │
│ 44 │ │ │ return trainer_fn(*args, **kwargs) │
│ 45 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/launcher │
│ s/subprocess_script.py:92 in launch │
│ │
│ 89 │ │ """ │
│ 90 │ │ if not self.cluster_environment.creates_processes_externally: │
│ 91 │ │ │ self._call_children_scripts() │
│ ❱ 92 │ │ return function(*args, **kwargs) │
│ 93 │ │
│ 94 │ def kill(self, signum: _SIGNUM) -> None: │
│ 95 │ │ for proc in self.procs: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 559 in _fit_impl │
│ │
│ 556 │ │ │ model_provided=True, │
│ 557 │ │ │ model_connected=self.lightning_module is not None, │
│ 558 │ │ ) │
│ ❱ 559 │ │ self._run(model, ckpt_path=ckpt_path) │
│ 560 │ │ │
│ 561 │ │ assert self.state.stopped │
│ 562 │ │ self.training = False │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 935 in _run │
│ │
│ 932 │ │ # ---------------------------- │
│ 933 │ │ # RUN THE TRAINER │
│ 934 │ │ # ---------------------------- │
│ ❱ 935 │ │ results = self._run_stage() │
│ 936 │ │ │
│ 937 │ │ # ---------------------------- │
│ 938 │ │ # POST-Training CLEAN UP │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 978 in _run_stage │
│ │
│ 975 │ │ │ with isolate_rng(): │
│ 976 │ │ │ │ self._run_sanity_check() │
│ 977 │ │ │ with torch.autograd.set_detect_anomaly(self._detect_anoma │
│ ❱ 978 │ │ │ │ self.fit_loop.run() │
│ 979 │ │ │ return None │
│ 980 │ │ raise RuntimeError(f"Unexpected state {self.state}") │
│ 981 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/fit_loop.py:2 │
│ 01 in run │
│ │
│ 198 │ │ while not self.done: │
│ 199 │ │ │ try: │
│ 200 │ │ │ │ self.on_advance_start() │
│ ❱ 201 │ │ │ │ self.advance() │
│ 202 │ │ │ │ self.on_advance_end() │
│ 203 │ │ │ │ self._restarting = False │
│ 204 │ │ │ except StopIteration: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/fit_loop.py:3 │
│ 54 in advance │
│ │
│ 351 │ │ assert self._data_fetcher is not None │
│ 352 │ │ self._data_fetcher.setup(combined_loader) │
│ 353 │ │ with self.trainer.profiler.profile("run_training_epoch"): │
│ ❱ 354 │ │ │ self.epoch_loop.run(self._data_fetcher) │
│ 355 │ │
│ 356 │ def on_advance_end(self) -> None: │
│ 357 │ │ trainer = self.trainer │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/training_epoc │
│ h_loop.py:133 in run │
│ │
│ 130 │ │ self.on_run_start(data_fetcher) │
│ 131 │ │ while not self.done: │
│ 132 │ │ │ try: │
│ ❱ 133 │ │ │ │ self.advance(data_fetcher) │
│ 134 │ │ │ │ self.on_advance_end() │
│ 135 │ │ │ │ self._restarting = False │
│ 136 │ │ │ except StopIteration: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/training_epoc │
│ h_loop.py:218 in advance │
│ │
│ 215 │ │ │ with trainer.profiler.profile("run_training_batch"): │
│ 216 │ │ │ │ if trainer.lightning_module.automatic_optimization: │
│ 217 │ │ │ │ │ # in automatic optimization, there can only be one │
│ ❱ 218 │ │ │ │ │ batch_output = self.automatic_optimization.run(tra │
│ 219 │ │ │ │ else: │
│ 220 │ │ │ │ │ batch_output = self.manual_optimization.run(kwargs │
│ 221 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:185 in run │
│ │
│ 182 │ │ # ------------------------------ │
│ 183 │ │ # gradient update with accumulated gradients │
│ 184 │ │ else: │
│ ❱ 185 │ │ │ self._optimizer_step(kwargs.get("batch_idx", 0), closure) │
│ 186 │ │ │
│ 187 │ │ result = closure.consume_result() │
│ 188 │ │ if result.loss is None: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:261 in _optimizer_step │
│ │
│ 258 │ │ │ self.optim_progress.optimizer.step.increment_ready() │
│ 259 │ │ │
│ 260 │ │ # model hook │
│ ❱ 261 │ │ call._call_lightning_module_hook( │
│ 262 │ │ │ trainer, │
│ 263 │ │ │ "optimizer_step", │
│ 264 │ │ │ trainer.current_epoch, │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/call.py:142 │
│ in _call_lightning_module_hook │
│ │
│ 139 │ pl_module._current_fx_name = hook_name │
│ 140 │ │
│ 141 │ with trainer.profiler.profile(f"[LightningModule]{pl_module.__clas │
│ ❱ 142 │ │ output = fn(*args, **kwargs) │
│ 143 │ │
│ 144 │ # restore current_fx when nested context │
│ 145 │ pl_module._current_fx_name = prev_fx_name │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/core/module.py:1265 │
│ in optimizer_step │
│ │
│ 1262 │ │ │ │ │ for pg in optimizer.param_groups: │
│ 1263 │ │ │ │ │ │ pg["lr"] = lr_scale * self.learning_rate │
│ 1264 │ │ """ │
│ ❱ 1265 │ │ optimizer.step(closure=optimizer_closure) │
│ 1266 │ │
│ 1267 │ def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimiz │
│ 1268 │ │ """Override this method to change the default behaviour of
│ │ │ │ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/core/optimizer.py:1 │ │ 58 in step │ │ │ │ 155 │ │ │ raise MisconfigurationException("When `optimizer.step(clos │ │ 156 │ │ │ │ 157 │ │ assert self._strategy is not None │ │ ❱ 158 │ │ step_output = self._strategy.optimizer_step(self._optimizer, c │ │ 159 │ │ │ │ 160 │ │ self._on_after_step() │ │ 161 │ │ │ │ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/ddp.py:2 │ │ 59 in optimizer_step │ │ │ │ 256 │ │ │ model: reference to the model, optionally defining optimiz │ │ 257 │ │ │ **kwargs: Any extra arguments to
optimizer.step`` ││ 258 │ │ """ │
│ ❱ 259 │ │ optimizer_output = super().optimizer_step(optimizer, closure, │
│ 260 │ │ │
│ 261 │ │ if self._model_averager is None: │
│ 262 │ │ │ return optimizer_output │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/strategy │
│ .py:224 in optimizer_step │
│ │
│ 221 │ │ model = model or self.lightning_module │
│ 222 │ │ # TODO(fabric): remove assertion once strategy's optimizer_ste │
│ 223 │ │ assert isinstance(model, pl.LightningModule) │
│ ❱ 224 │ │ return self.precision_plugin.optimizer_step(optimizer, model=m │
│ 225 │ │
│ 226 │ def _setup_model_and_optimizers(self, model: Module, optimizers: L │
│ 227 │ │ """Setup a model and multiple optimizers together. │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/plugins/precision/d │
│ eepspeed.py:92 in optimizer_step │
│ │
│ 89 │ ) -> Any: │
│ 90 │ │ if isinstance(optimizer, LBFGS): │
│ 91 │ │ │ raise MisconfigurationException("DeepSpeed and the LBFGS o │
│ ❱ 92 │ │ closure_result = closure() │
│ 93 │ │ self._after_closure(model, optimizer) │
│ 94 │ │ skipped_backward = closure_result is None │
│ 95 │ │ # in manual optimization, the closure does not return a value │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:140 in __call__ │
│ │
│ 137 │ │ return step_output │
│ 138 │ │
│ 139 │ def __call__(self, *args: Any, **kwargs: Any) -> Optional[Tensor]: │
│ ❱ 140 │ │ self._result = self.closure(*args, **kwargs) │
│ 141 │ │ return self._result.loss │
│ 142 │
│ 143 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:126 in closure │
│ │
│ 123 │ │ self._zero_grad_fn = zero_grad_fn │
│ 124 │ │
│ 125 │ def closure(self, *args: Any, **kwargs: Any) -> ClosureResult: │
│ ❱ 126 │ │ step_output = self._step_fn() │
│ 127 │ │ │
│ 128 │ │ if step_output.closure_loss is None: │
│ 129 │ │ │ self.warning_cache.warn("`training_step` returned `None`. │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:308 in _training_step │
│ │
│ 305 │ │ trainer = self.trainer │
│ 306 │ │ │
│ 307 │ │ # manually capture logged metrics │
│ ❱ 308 │ │ training_step_output = call._call_strategy_hook(trainer, "trai │
│ 309 │ │ self.trainer.strategy.post_training_step() │
│ 310 │ │ │
│ 311 │ │ result = self.output_result_cls.from_training_step_output(trai │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/call.py:288 │
│ in _call_strategy_hook │
│ │
│ 285 │ │ return │
│ 286 │ │
│ 287 │ with trainer.profiler.profile(f"[Strategy]{trainer.strategy.__clas │
│ ❱ 288 │ │ output = fn(*args, **kwargs) │
│ 289 │ │
│ 290 │ # restore current_fx when nested context │
│ 291 │ pl_module._current_fx_name = prev_fx_name │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/ddp.py:3 │
│ 31 in training_step │
│ │
│ 328 │ def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: │
│ 329 │ │ assert self.model is not None │
│ 330 │ │ with self.precision_plugin.train_step_context(): │
│ ❱ 331 │ │ │ return self.model(*args, **kwargs) │
│ 332 │ │
│ 333 │ def validation_step(self, *args: Any, **kwargs: Any) -> Optional[S │
│ 334 │ │ with self.precision_plugin.val_step_context(): │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/deepspeed/utils/nvtx.py:15 in │
│ wrapped_fn │
│ │
│ 12 │ │
│ 13 │ def wrapped_fn(*args, **kwargs): │
│ 14 │ │ get_accelerator().range_push(func.__qualname__) │
│ ❱ 15 │ │ ret_val = func(*args, **kwargs) │
│ 16 │ │ get_accelerator().range_pop() │
│ 17 │ │ return ret_val │
│ 18 │
│ │
│ /usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py:1769 in │
│ forward │
│ │
│ 1766 │ │ if self.fp16_auto_cast(): │
│ 1767 │ │ │ inputs = self._cast_inputs_half(inputs) │
│ 1768 │ │ │
│ ❱ 1769 │ │ loss = self.module(*inputs, **kwargs) │
│ 1770 │ │ │
│ 1771 │ │ if self.zero_optimization_partition_weights(): │
│ 1772 │ │ │ # Disable automated discovery of external parameters │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/overrides/base.py:9 │
│ 0 in forward │
│ │
│ 87 │ │ │
│ 88 │ │ if trainer is not None: │
│ 89 │ │ │ if trainer.training: │
│ ❱ 90 │ │ │ │ output = self.forward_module.training_step(*inputs, * │
│ 91 │ │ │ │ # In manual_optimization, we need to prevent DDP reduc │
│ 92 │ │ │ │ # it is done manually in `LightningModule.manual_backw │
│ 93 │ │ │ │ # `require_backward_grad_sync` will be reset in the │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/transformer_ │
│ base.py:552 in training_step │
│ │
│ 549 │ │
│ 550 │ def training_step(self, batch): │
│ 551 │ │ if isinstance(batch, dict): │
│ ❱ 552 │ │ │ outputs = self.compute_loss(**batch) │
│ 553 │ │ else: │
│ 554 │ │ │ outputs = self.compute_loss(**dict(batch)) │
│ 555 │ │ loss = outputs[0] │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/transformer_ │
│ base.py:371 in compute_loss │
│ │
│ 368 │ def compute_loss(self, *args, **kwargs): │
│ 369 │ │ if len(args): │
│ 370 │ │ │ kwargs.update(dict(args)) │
│ ❱ 371 │ │ return self.model.compute_loss(**kwargs) │
│ 372 │ │
│ 373 │ def forward(self, *args, **kwargs): │
│ 374 │ │ if len(args): │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/transformer_ │
│ base.py:117 in compute_loss │
│ │
│ 114 │ │ return self.model(*args, **batch) │
│ 115 │ │
│ 116 │ def compute_loss(self, *args, **batch) -> tuple: │
│ ❱ 117 │ │ return self.model(*args, **batch) │
│ 118 │ │
│ 119 │ def post_init(self): │
│ 120 │ │ return self.model.post_init() │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/accelerate/hooks.py:165 in │
│ new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:953 in forward │
│ │
│ 950 │ │ use_cache = use_cache if use_cache is not None else self.conf │
│ 951 │ │ return_dict = return_dict if return_dict is not None else sel │
│ 952 │ │ │
│ ❱ 953 │ │ transformer_outputs = self.transformer( │
│ 954 │ │ │ input_ids=input_ids, │
│ 955 │ │ │ position_ids=position_ids, │
│ 956 │ │ │ attention_mask=attention_mask, │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:849 in forward │
│ │
│ 846 │ │ rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() │
│ 847 │ │ │
│ 848 │ │ # Run encoder. │
│ ❱ 849 │ │ hidden_states, presents, all_hidden_states, all_self_attentio │
│ 850 │ │ │ inputs_embeds, full_attention_mask, rotary_pos_emb=rotary │
│ 851 │ │ │ kv_caches=past_key_values, use_cache=use_cache, output_hi │
│ 852 │ │ ) │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:639 in forward │
│ │
│ 636 │ │ │ │
│ 637 │ │ │ layer = self._get_layer(index) │
│ 638 │ │ │ if self.gradient_checkpointing and self.training: │
│ ❱ 639 │ │ │ │ layer_ret = torch.utils.checkpoint.checkpoint( │
│ 640 │ │ │ │ │ layer, │
│ 641 │ │ │ │ │ hidden_states, │
│ 642 │ │ │ │ │ attention_mask, │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/utils/checkpoint.py:235 in │
│ checkpoint │
│ │
│ 232 │ │ raise ValueError("Unexpected keyword arguments: " + ",".join(a │
│ 233 │ │
│ 234 │ if use_reentrant: │
│ ❱ 235 │ │ return CheckpointFunction.apply(function, preserve, *args) │
│ 236 │ else: │
│ 237 │ │ return _checkpoint_without_reentrant( │
│ 238 │ │ │ function, │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/utils/checkpoint.py:96 in │
│ forward │
│ │
│ 93 │ │ ctx.save_for_backward(*tensor_inputs) │
│ 94 │ │ │
│ 95 │ │ with torch.no_grad(): │
│ ❱ 96 │ │ │ outputs = run_function(*args) │
│ 97 │ │ return outputs │
│ 98 │ │
│ 99 │ @staticmethod │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/accelerate/hooks.py:165 in │
│ new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:551 in forward │
│ │
│ 548 │ │ # hidden_states: [s, b, h] │
│ 549 │ │ │
│ 550 │ │ # Layer norm at the beginning of the transformer layer. │
│ ❱ 551 │ │ layernorm_output = self.input_layernorm(hidden_states) │
│ 552 │ │ # Self attention. │
│ 553 │ │ attention_output, kv_cache = self.self_attention( │
│ 554 │ │ │ layernorm_output, │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/accelerate/hooks.py:165 in │
│ new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:201 in forward │
│ │
│ 198 │ │ variance = hidden_states.to(torch.float32).pow(2).mean(-1, ke │
│ 199 │ │ hidden_states = hidden_states * torch.rsqrt(variance + self.e │
│ 200 │ │ │
│ ❱ 201 │ │ return (self.weight * hidden_states).to(input_dtype) │
│ 202 │
│ 203 │
│ 204 class CoreAttention(torch.nn.Module): │
╰──────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Expected all tensors to be on the same device, but found at least
two devices, cuda:0 and cuda:1!
sft.config.py中的train_info_args如下:
train_info_args = {
'devices': 2,
'data_backend': 'parquet', #one of record lmdb arrow_stream ,arrow_file, parquet, 超大数据集可以使用 lmdb , 注 lmdb 存储空间比record大
# 预训练模型路径
**train_model_config,
'convert_onnx': False, # 转换onnx模型
'do_train': True,
'train_file': [ '/chatglm2-dev/data/finetune_train_examples.json'],
'max_epochs': 20,
'max_steps': -1,
'optimizer': 'lion', # one of [lamb,adma,adamw_hf,adamw,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit]
}
main.py的信息如下:
# 模块配置, 默认启用lora
enable_deepspeed = True
enable_ptv2 = False
enable_lora = True
load_in_bit = 0 # 4 load_in_4bit, 8 load_in_8bit other 0
if enable_lora:
from config.sft_config_lora import *
elif enable_ptv2:
from config.sft_config_ptv2 import *
else:
from config.sft_config import *
if enable_lora:
enable_ptv2 = False
global_args['load_in_4bit'] = load_in_bit == 4
global_args['load_in_8bit'] = load_in_bit == 8
elif enable_ptv2:
enable_lora = False
global_args['load_in_4bit'] = False
global_args['load_in_8bit'] = False
train_info_args.pop('lora', None)
train_info_args.pop('adalora', None)
else:
enable_ptv2 = False
enable_lora = False
# global_args['load_in_4bit'] = False
# global_args['load_in_8bit'] = False
train_info_args.pop('lora',None)
train_info_args.pop('adalora', None)
train_info_args.pop('prompt', None)
#预处理
if 'rwkv' in train_info_args['tokenizer_name'].lower():
train_info_args['use_fast_tokenizer'] = True
def get_deepspeed_config():
'''
lora prompt finetuning 使用 deepspeed_offload.json
普通finetuning 使用deepspeed.json
'''
# 是否开启deepspeed
if not enable_deepspeed:
return None
The text was updated successfully, but these errors were encountered: