Basic features of the framework:
1. Model definition, training, and testing
2. Data generation and a data iterator
3. Training-log recording
4. Real-time monitoring of the training process
With this framework in place, later and more complex AI projects can all be developed by extending it.
Basic project structure:
Four files:
sequence_mean_generate.py  generates the data and builds the data iterator
mean_mlp_model.py  defines the network model, plus the training and test functions
log.py  defines the logging functions
mean_mlp_main.py  the main program
The code for the four files follows.
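The imports in mean_mlp_main.py (model.mean_mlp_model, data.sequence_mean_generate, tools.log) imply that the three helper files live in sub-packages. The layout below is an assumption inferred from those import paths, not something stated explicitly above:
project_root/
    mean_mlp_main.py
    model/
        mean_mlp_model.py
    data/
        sequence_mean_generate.py
    tools/
        log.py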
mean_mlp_main.py
import torch
import model.mean_mlp_model as mean_mlp_model
from data import sequence_mean_generate as ds
import os
import shutil
from tools import log
import argparse


def creat_args():
    # Check whether CUDA is available
    if torch.cuda.is_available():
        print("CUDA is available!")
        # If CUDA is available, print the GPU device being used
        print("Using GPU:", torch.cuda.get_device_name())
    else:
        print("CUDA is not available. Using CPU instead.")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Start every run from a clean log directory
    if os.path.exists('./log/'):
        shutil.rmtree('./log/')
    model_log_trace = log.creat_log('./log/', 'model_parameters', '.log')
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--device', type=torch.device, help='cpu or gpu', default=device)
    # Note: the default here is the logger object itself, so args.model_log_trace can be used directly
    arg_parser.add_argument('--model_log_trace', type=str, help='model trace log path and name',
                            default=model_log_trace)
    arg_parser.add_argument('--epochs', type=int, help='training epochs', default=200)
    arg_parser.add_argument('--training_num_samples', type=int, help='training num samples', default=10000)
    arg_parser.add_argument('--training_batch_size', type=int, help='training batch size', default=24)
    arg_parser.add_argument('--training_max_range', type=int, help='training max range', default=100)
    arg_parser.add_argument('--training_min_range', type=int, help='training min range', default=-100)
    arg_parser.add_argument('--test_num_samples', type=int, help='test num samples', default=100)
    arg_parser.add_argument('--test_batch_size', type=int, help='test batch size', default=1)
    arg_parser.add_argument('--test_max_range', type=int, help='test max range', default=10000)
    arg_parser.add_argument('--test_min_range', type=int, help='test min range', default=-10000)
    args = arg_parser.parse_args()
    return args


# Main program
def main():
    args = creat_args()
    # Initialize the model (loss function and optimizer are created inside train_model)
    print('start')
    model = mean_mlp_model.MLPModel().to(args.device)
    # Build the datasets
    training_set = ds.create_dataloader(args, mode='training')
    test_set = ds.create_dataloader(args, mode='test')
    # Train the model
    mean_mlp_model.train_model(model, training_set, test_set, args)
    # Test the model
    mean_mlp_model.test_model(model, test_set, args)
    # Log the final model parameters
    for name, param in model.named_parameters():
        args.model_log_trace.info('%s: %s', name, param)
    log.close_log(args.model_log_trace)


if __name__ == '__main__':
    main()
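Because every hyperparameter above is exposed through argparse, it can be overridden from the command line without touching the code, for example python mean_mlp_main.py --epochs 50 --training_batch_size 64 (the values here are just an illustration; the relative paths assume the script is run from the project root).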
mean_mlp_model.py
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.tensorboard import SummaryWriter


# Define the MLP model
class MLPModel(nn.Module):
    def __init__(self):
        super(MLPModel, self).__init__()
        input_size = 4
        mean_out = 1
        self.fc = nn.Linear(input_size, mean_out)

    def forward(self, x):
        x = self.fc(x)
        return x


# Test the model
def test_model(model, test_set, args):
    model.eval()
    total = 0
    with torch.no_grad():
        for inputs, targets in test_set:
            inputs, targets = inputs.to(args.device), targets.to(args.device)
            outputs = model(inputs)
            # Move tensors to the CPU before converting to NumPy (required when running on CUDA)
            targets_array = targets.cpu().numpy()
            outputs_array = outputs.cpu().numpy()
            mse = np.mean((targets_array - outputs_array) ** 2)
            total += mse
    # Average the per-batch MSE over the number of batches
    test_set_mse = total / len(test_set)
    print(f'MSE on the test data: {test_set_mse:.2f}')
    return test_set_mse


# Train the model
def train_model(model, training_set, test_set, args):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    writer = SummaryWriter('./log')
    iteration = 0
    model.train()
    for epoch in range(args.epochs):
        for inputs, targets in training_set:
            iteration += 1
            inputs, targets = inputs.to(args.device), targets.to(args.device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # Log the current parameters and plot the four weights in TensorBoard every iteration
            for name, param in model.named_parameters():
                args.model_log_trace.debug('%s: %s', name, param)
            writer.add_scalars('Weight',
                               {'fc(0,0)': model.fc.weight[0, 0], 'fc(0,1)': model.fc.weight[0, 1],
                                'fc(0,2)': model.fc.weight[0, 2], 'fc(0,3)': model.fc.weight[0, 3]},
                               iteration)
        args.model_log_trace.info(f'Epoch {epoch + 1}/{args.epochs}, Loss: {loss.item()}')
        # Evaluate on the test set after each epoch, then switch back to training mode
        test_loss = test_model(model, test_set, args)
        model.train()
        writer.add_scalars('Loss', {'train': loss.item(), 'test': test_loss}, epoch)
    writer.close()
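Since the label is the exact mean of the four (noise-free) numbers, the single Linear(4, 1) layer has an exact target solution: all four weights equal to 0.25 and a zero bias. A minimal sanity-check sketch (assuming the package layout described earlier and run from the project root; not part of the framework itself):

import torch
from model import mean_mlp_model

# Hand-set the linear layer to the exact averaging solution
model = mean_mlp_model.MLPModel()
with torch.no_grad():
    model.fc.weight.fill_(0.25)
    model.fc.bias.zero_()

x = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
print(model(x))  # prints 2.5, the mean of the four inputs

After training, the learned weights should land close to these values (slightly off, because the inputs carry additive noise while the label is the noise-free mean), which is exactly what the 'Weight' curves logged to TensorBoard visualize.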
sequence_mean_generate.py
import random
import torch
from torch.utils.data import DataLoader, TensorDataset


def generate_random_sequence(min_range, max_range):
    # Generate a list of four random integers between min_range and max_range
    random_numbers = [random.randint(min_range, max_range) for _ in range(4)]
    # Additive noise at roughly 1% of the value range (randint requires integer bounds)
    random_noise = [random.randint(int(min_range * 0.01), int(max_range * 0.01)) for _ in range(4)]
    # Create a new list to hold the element-wise sums
    summed_numbers = []
    # Walk both lists and add the corresponding elements
    for num, noise in zip(random_numbers, random_noise):
        summed_numbers.append(num + noise)
    # Compute the mean of the noise-free numbers
    average = [sum(random_numbers) / len(random_numbers)]
    # Return the noisy inputs and the mean as the label
    return summed_numbers, average


# Generate the dataset
def generate_dataset(min_range, max_range, num_samples, device):
    quadruples = []
    labels = []
    for _ in range(num_samples):
        quadruple, label = generate_random_sequence(min_range, max_range)
        quadruples.append(quadruple)
        labels.append(label)
    quadruples_tensor = torch.tensor(quadruples, dtype=torch.float32).to(device)
    labels_tensor = torch.tensor(labels, dtype=torch.float32).to(device)
    return quadruples_tensor, labels_tensor


# Create the DataLoader
def create_dataloader(args, mode):
    if mode == 'training':
        min_range = args.training_min_range
        max_range = args.training_max_range
        num_samples = args.training_num_samples
        batch_size = args.training_batch_size
    else:
        min_range = args.test_min_range
        max_range = args.test_max_range
        num_samples = args.test_num_samples
        batch_size = args.test_batch_size
    quadruples, labels = generate_dataset(min_range, max_range, num_samples, args.device)
    dataset = TensorDataset(quadruples, labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader
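A quick smoke test for the data pipeline, useful for confirming the tensor shapes the model expects (a sketch only; the sample counts and ranges here are arbitrary, and the import assumes the package layout described earlier):

import torch
from torch.utils.data import DataLoader, TensorDataset
from data import sequence_mean_generate as ds

# Build a tiny dataset directly with generate_dataset and inspect one batch
quadruples, labels = ds.generate_dataset(min_range=-100, max_range=100,
                                         num_samples=8, device=torch.device('cpu'))
loader = DataLoader(TensorDataset(quadruples, labels), batch_size=4, shuffle=True)
inputs, targets = next(iter(loader))
print(inputs.shape, targets.shape)  # torch.Size([4, 4]) torch.Size([4, 1])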
log.py
import os
import logging


def creat_log(log_path, logging_name, suf_name):
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    log_full_path = log_path + logging_name + suf_name
    logger = logging.getLogger(logging_name)
    logger.setLevel(level=logging.DEBUG)
    # File handler: INFO and above go to the log file
    handler = logging.FileHandler(log_full_path, encoding='UTF-8', mode='w')
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    # Console handler: DEBUG and above go to the console
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    logger.addHandler(handler)
    logger.addHandler(console)
    return logger


# Close the log
def close_log(log_trace):
    for handler in list(log_trace.handlers):
        log_trace.removeHandler(handler)
        handler.close()
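A minimal usage sketch for log.py on its own (the logger name 'demo' is just an example, and the import assumes the package layout described earlier):

from tools import log

logger = log.creat_log('./log/', 'demo', '.log')  # writes to ./log/demo.log
logger.info('hello from the logging demo')        # goes to both the file and the console
logger.debug('console only')                      # below the file handler's INFO threshold
log.close_log(logger)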
Below is a training demo:
Real-time monitoring demo with TensorBoard:
Setting up TensorBoard:
Step 1: install TensorBoard as usual
pip install tensorboard
Step 2: add the required system environment variable (so the tensorboard executable can be found)
Step 3: add the monitoring code
Step 4: set up a run configuration for TensorBoard (a command-line alternative is noted after these steps)
Configure it in "1", fill in the corresponding options, and once everything is correct click Apply -> OK -> Run
Step 5: run the main program mean_mlp_main.py to generate data; the server started by TensorBoard will then pick up this data and render the monitoring results on the dashboard
Step 6: switch back to the TensorBoard run window, as shown in the figure below, and click the URL in the output to open the monitoring results; press F5 on the page to refresh the view
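If you prefer not to use an IDE run configuration for Step 4, the command-line equivalent is to start TensorBoard from the project root with tensorboard --logdir ./log and open the printed URL (by default http://localhost:6006) in a browser; ./log is the directory passed to SummaryWriter in mean_mlp_model.py, so the Loss and Weight curves appear there as training runs.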