[rank0]:     raise e
[rank0]:   File "/app/main.py", line 165, in main
[rank0]:     trainer.train(params)
[rank0]:   File "/app/finetune/sft.py", line 173, in train
[rank0]:     trainer.train(resume_from_checkpoint=resume_from_checkpoint)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/trl/trainer/sft_trainer.py", line 440, in train
[rank0]:     output = super().train(*args, **kwargs)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2052, in train
[rank0]:     return inner_training_loop(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2388, in _inner_training_loop
[rank0]:     tr_loss_step = self.training_step(model, inputs)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3518, in training_step
[rank0]:     self.accelerator.backward(loss, **kwargs)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 2188, in backward
[rank0]:     self.deepspeed_engine_wrapped.backward(loss, **kwargs)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/accelerate/utils/deepspeed.py", line 166, in backward
[rank0]:     self.engine.backward(loss, **kwargs)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn
[rank0]:     ret_val = func(*args, **kwargs)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/engine.py", line 2020, in backward
[rank0]:     self.optimizer.backward(loss, retain_graph=retain_graph)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn
[rank0]:     ret_val = func(*args, **kwargs)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/stage3.py", line 2247, in backward
[rank0]:     self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
[rank0]:     scaled_loss.backward(retain_graph=retain_graph)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/_tensor.py", line 626, in backward
[rank0]:     torch.autograd.backward(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py", line 347, in backward
[rank0]:     _engine_run_backward(
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py", line 823, in _engine_run_backward
[rank0]:     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py", line 307, in apply
[rank0]:     return user_fn(self, *args)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/amp/autocast_mode.py", line 549, in decorate_bwd
[rank0]:     return bwd(*args, **kwargs)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/linear.py", line 80, in backward
[rank0]:     input, weight, bias = ctx.saved_tensors
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py", line 1129, in unpack_hook
[rank0]:     frame.check_recomputed_tensors_match(gid)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py", line 903, in check_recomputed_tensors_match
[rank0]:     raise CheckpointError(
[rank0]: torch.utils.checkpoint.CheckpointError: torch.utils.checkpoint: Recomputed values for the following tensors have different metadata than during the forward pass.
[rank0]: tensor at position 4:
[rank0]: saved metadata: {'shape': torch.Size([5120]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: recomputed metadata: {'shape': torch.Size([0]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: tensor at position 6:
[rank0]: saved metadata: {'shape': torch.Size([5120, 5120]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: recomputed metadata: {'shape': torch.Size([0]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: tensor at position 7:
[rank0]: saved metadata: {'shape': torch.Size([5120]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: recomputed metadata: {'shape': torch.Size([0]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: tensor at position 14:
[rank0]: saved metadata: {'shape': torch.Size([1024, 5120]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: recomputed metadata: {'shape': torch.Size([0]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: tensor at position 15:
[rank0]: saved metadata: {'shape': torch.Size([1024]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: recomputed metadata: {'shape': torch.Size([0]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: tensor at position 22:
[rank0]: saved metadata: {'shape': torch.Size([1024, 5120]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: recomputed metadata: {'shape': torch.Size([0]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: tensor at position 23:
[rank0]: saved metadata: {'shape': torch.Size([1024]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: recomputed metadata: {'shape': torch.Size([0]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: tensor at position 41:
[rank0]: saved metadata: {'shape': torch.Size([5120, 5120]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: recomputed metadata: {'shape': torch.Size([0]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: tensor at position 51:
[rank0]: saved metadata: {'shape': torch.Size([5120]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: recomputed metadata: {'shape': torch.Size([0]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: tensor at position 53:
[rank0]: saved metadata: {'shape': torch.Size([27648, 5120]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: recomputed metadata: {'shape': torch.Size([0]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: tensor at position 61:
[rank0]: saved metadata: {'shape': torch.Size([27648, 5120]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: recomputed metadata: {'shape': torch.Size([0]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: tensor at position 70:
[rank0]: saved metadata: {'shape': torch.Size([5120, 27648]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
[rank0]: recomputed metadata: {'shape': torch.Size([0]), 'dtype': torch.bfloat16, 'device': device(type='cuda', index=0)}
0%| | 0/43 [00:15<?, ?it/s]
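Every mismatch in the error above follows the same pattern: the saved metadata carries a real weight shape (what look like one transformer block's [5120, 5120] and [1024, 5120] attention projections, [27648, 5120] and [5120, 27648] MLP matrices, and [5120]/[1024] vectors), while the recomputed metadata is always torch.Size([0]). That is the signature of DeepSpeed ZeRO-3: outside of an active gather, each parameter is left as an empty numel-0 placeholder, so when torch's non-reentrant activation checkpointing replays the block during backward and compares the recomputed tensors against those saved in forward, it sees the deflated shards instead of the full weights. A hypothetical probe that shows the effect, assuming `engine` is a model already wrapped by `deepspeed.initialize()` under ZeRO-3 (`lm_head` is a placeholder parameter name):

```python
# Sketch only: inspect one parameter inside and outside a ZeRO-3 gather.
import deepspeed

weight = engine.module.lm_head.weight      # hypothetical parameter
print(weight.shape)                        # torch.Size([0]) while partitioned
with deepspeed.zero.GatheredParameters(weight):
    print(weight.shape)                    # full shape while gathered
print(weight.shape)                        # torch.Size([0]) again afterwards
```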
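The frame that actually raises is `check_recomputed_tensors_match`, which only exists in torch's non-reentrant checkpoint path (`use_reentrant=False`); the reentrant implementation never performs this metadata comparison. The workaround most often suggested for this ZeRO-3 combination is therefore to force the reentrant variant when enabling gradient checkpointing. A minimal sketch, assuming the script configures training through transformers' `TrainingArguments` as the traceback suggests; every value except the two checkpointing arguments is a placeholder:

```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="out",                      # placeholder
    bf16=True,                             # matches the bfloat16 tensors in the log
    deepspeed="ds_zero3_config.json",      # placeholder path to the ZeRO-3 config
    gradient_checkpointing=True,
    # Skip the non-reentrant metadata check that trips over ZeRO-3's
    # numel-0 partitioned parameters during recomputation.
    gradient_checkpointing_kwargs={"use_reentrant": True},
)
```

Alternatively, DeepSpeed's own activation checkpointing (the `activation_checkpointing` block of the ZeRO config) is ZeRO-3-aware, and temporarily disabling gradient checkpointing altogether is a quick way to confirm the diagnosis: if the step stuck at 0/43 then completes, the checkpoint/ZeRO-3 interaction is the culprit.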