跳到主要内容

训练

import torch
import torch_npu
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# 创建一个简单的神经网络模型
class SimpleModel(nn.Module):
def __init__(self):
super(SimpleModel, self).__init__()
self.fc1 = nn.Linear(1000, 1000) # 大模型可以通过增加层数和参数来调整
self.relu = nn.ReLU()
self.fc2 = nn.Linear(1000, 10) # 假设我们的任务是分类,有10个类

def forward(self, x):
x = self.fc1(x)
x = self.relu(x)
x = self.fc2(x)
return x

# 生成模拟数据
def generate_data(num_samples):
X = np.random.randn(num_samples, 1000)
y = np.random.randint(0, 10, num_samples)
return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.long)

# 主训练函数
def main():
# 设置设备
device = torch.device("npu:0")

# 数据
X, y = generate_data(10000) # 生成10000个样本
dataset = TensorDataset(X, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# 模型
model = SimpleModel().to(device)

# 损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
model.train()
for epoch in range(10): # 训练10个epoch
for inputs, labels in dataloader:
inputs, labels = inputs.to(device), labels.to(device)

# 前向传播
outputs = model(inputs)
loss = criterion(outputs, labels)

# 反向传播和优化
optimizer.zero_grad()
loss.backward()
optimizer.step()

print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# 保存模型
torch.save(model.state_dict(), "model.pth")
print("Model saved to model.pth")

if __name__ == "__main__":
main()
accelerate launch --config_file accelerate_multiNode_config.yaml train.py
torchrun --master_port 29500 --nproc_per_node=1 --nnodes=2 --node_rank=0 --master_addr=10.233.97.242 train.py
torchrun --master_port 29500 --nproc_per_node=1 --nnodes=2 --node_rank=1 --master_addr=10.233.97.242 train.py
import json
from transformers import Trainer, TrainingArguments, AutoModel

# 加载配置
config = {
"epochs": 10,
"gradient_accumulation": 1,
"lr": 0.00005,
"model": "/mnt/model/base_model",
"output": "/mnt/data/output",
"resume_from_checkpoint": False,
"reward_model": "",
"save_total_limit": 10,
"train_data_dir": "/mnt/data/train",
"trainer": "pretrain",
"valid_data_dir": "",
"valid_data_split": 0.1,
"warmup_ratio": 0.05
}

# 加载模型
model = AutoModel.from_pretrained(config['model'])

# 加载数据
from datasets import load_dataset
train_dataset = load_dataset('jsonl', data_files=f"{config['train_data_dir']}/train-cu6bo7gjcoaappe60s70.jsonl")

# 设置训练参数
training_args = TrainingArguments(
output_dir=config['output'],
num_train_epochs=config['epochs'],
per_device_train_batch_size=8,
per_device_eval_batch_size=8,
warmup_steps=int(len(train_dataset) * config['warmup_ratio']),
weight_decay=0.01,
logging_dir=f"{config['output']}/logs",
learning_rate=config['lr'],
save_total_limit=config['save_total_limit'],
evaluation_strategy="epoch",
load_best_model_at_end=False,
metric_for_best_model="loss",
greater_is_better=False,
gradient_accumulation_steps=config['gradient_accumulation'],
resume_from_checkpoint=config['resume_from_checkpoint'] if config['resume_from_checkpoint'] else None
)

# 初始化训练器
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
)

# 开始训练
trainer.train()