俗话说得好:“如果看不懂别人的代码,那一定是别人的代码写得不好!”
既然公开了代码,就要对代码的可读性和正确性负责。但是看了很多源码,总是会遇到一些坑,比如:环境配置没讲清楚 -> (好不容易跑通了)效果并没有论文中说的那么好 -> (想分析代码吧)代码写得乱七八糟,根本看不懂……
为了让小白能够快速上手,代码肯定需要由总到分叙述,就像讲故事一样,循序渐进才能够一目了然。下面的模板是我根据自己的理解总结的网络训练最基本的模板,大家根据自己需要再添加:
if __name__ == "__main__":
    # Train on the first CUDA device when one is available, otherwise on CPU.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Basic training hyper-parameters.
    batch_size = 16  # mini-batch size
    num_epochs = 100  # number of training epochs
    learning_rate = 0.0001  # optimizer learning rate
    root = 'data/train'  # dataset location

    # Load the dataset; get_loader yields the training and validation loaders.
    train_loader, val_loader = get_loader(root, batch_size, shuffle=True)

    # Build the model and move its parameters to the selected device.
    model = Model(device).to(device)

    # Start training.
    train(device, model, num_epochs, learning_rate, train_loader, val_loader)
下面将具体讲解:
# Step 1: pick the compute device (first CUDA device if available, else CPU).
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Step 2: basic training hyper-parameters. Each assignment sits on its own
# line so that none of them is swallowed by the preceding trailing comment.
batch_size = 16  # mini-batch size
num_epochs = 100  # number of training epochs
learning_rate = 0.0001  # optimizer learning rate
root = 'data/train'  # dataset location

# Step 3: load the dataset as a (training, validation) pair of loaders.
train_loader, val_loader = get_loader(root, batch_size, shuffle=True)

# Step 4: build the model and move its parameters to the selected device.
model = Model(device).to(device)

# Step 5: run the training loop.
train(device, model, num_epochs, learning_rate, train_loader, val_loader)
搭建模型框架:模型由编码器(Encoder)和解码器(Decoder)两部分组成。
class Model(nn.Module):
    """Network that passes its input through an encoder and then a decoder.

    Args:
        device: Forwarded unchanged to the ``Encoder`` and ``Decoder``
            constructors.
    """

    def __init__(self, device):
        # Initialise nn.Module's own state first so that the sub-modules
        # assigned below are registered on this module.
        super().__init__()
        self.encoder = Encoder(device)  # encoder
        self.decoder = Decoder(device)  # decoder

    def forward(self, x):
        """Cast ``x`` to float, encode it, decode the result and return it."""
        x = x.float()
        p1 = self.encoder(x)
        p2 = self.decoder(p1)
        return p2
def train(device, model, num_epochs, learning_rate, train_loader, val_loader):
    """Train ``model`` and keep the weights with the lowest validation loss.

    Each epoch runs one training pass and one validation pass through
    ``fit``. Whenever the validation loss is at least as good as the best
    seen so far, the model's ``state_dict`` is written to
    ``output/weight.pth``. The loss histories are passed to
    ``write_figures`` and the current epoch's losses to ``write_log`` after
    every epoch.

    Args:
        device: Device handed to the loss module and to ``fit``.
        model: Module whose parameters are optimised.
        num_epochs: Number of epochs to run.
        learning_rate: Learning rate for the Adam optimizer.
        train_loader: Data loader used for the training pass.
        val_loader: Data loader used for the validation pass.
    """
    import os

    print('start training ...........')

    # torch.save does not create missing directories, so make sure the
    # checkpoint folder exists before the first save.
    os.makedirs('output', exist_ok=True)

    # Optimizer and loss function.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = LossCalculation(device)

    train_losses, val_losses = [], []
    best_val_loss = None  # lowest validation loss seen so far

    # The loop must iterate over the ``num_epochs`` parameter; ``epochs`` is
    # not defined in this scope.
    for epoch in range(num_epochs):
        # Training pass.
        train_epoch_loss = fit(epoch, model, optimizer, criterion, device,
                               train_loader, phase='training')
        # Validation pass.
        val_epoch_loss = fit(epoch, model, optimizer, criterion, device,
                             val_loader, phase='validation')
        print('-----------------------------------------')

        # Checkpoint on the first epoch and whenever validation improves or
        # ties the best value.
        if best_val_loss is None or val_epoch_loss <= best_val_loss:
            torch.save(model.state_dict(), 'output/weight.pth')
            best_val_loss = val_epoch_loss

        # Record the losses so they can be plotted.
        train_losses.append(train_epoch_loss)
        val_losses.append(val_epoch_loss)

        # Refresh the loss figures and the log after every epoch.
        write_figures('output', train_losses, val_losses)
        write_log('output', epoch, train_epoch_loss, val_epoch_loss)
优化器可以根据自己的需求选择 Adam 或 SGD;学习率既可以设为固定值,也可以采用自适应的调整策略。
# Optimizer: Adam over all of the model's parameters at the configured
# learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# Loss function: the custom LossCalculation module.
criterion = LossCalculation(device=device)
class LossCalculation(nn.Module):
    """Template for a custom loss that sums a per-sample term over a batch.

    Args:
        device: Accepted for interface compatibility; not used by this
            template.
    """

    def __init__(self, device):
        super().__init__()

    def forward(self, inputs, outputs, targets):
        """Return the loss accumulated over every sample in the batch.

        Args:
            inputs: Input images.
            outputs: Predicted images; must be 4-D, since the shape is
                unpacked into exactly four values.
            targets: Labels.

        Returns:
            The accumulated loss, ``0.0`` when the batch is empty.
        """
        # NOTE(review): PyTorch image tensors are usually laid out as
        # (N, C, H, W) -- confirm the width/height order before using them.
        batch_size, _, width, height = outputs.shape

        total_loss = 0.0  # loss accumulated over the samples of this batch
        for b in range(batch_size):
            # TODO: template placeholder -- replace ``xxx`` with the loss of
            # sample ``b``; it is intentionally left undefined here.
            total_loss += xxx
        return total_loss
这里面最重要的就是自定义的 fit 函数:
def fit(epoch, model, optimizer, criterion, device, data_loader, phase='training'):
    """Run one pass over ``data_loader`` and return the per-sample loss.

    In the ``'training'`` phase the model is put in train mode and the
    optimizer is stepped after every batch. In any other phase the model is
    put in eval mode and no gradients are computed.

    Args:
        epoch: Epoch index, used only in the printed summary.
        model: Module to run.
        optimizer: Optimizer stepped during the training phase.
        criterion: Callable invoked as ``criterion(inputs, outputs, targets)``
            whose result exposes ``.item()`` and, when training,
            ``.backward()``.
        device: Device each batch is moved to.
        data_loader: Iterable of ``(inputs, targets)`` batches exposing a
            ``dataset`` attribute with a length.
        phase: ``'training'`` to update weights; anything else evaluates only.

    Returns:
        The summed batch losses divided by ``len(data_loader.dataset)``.
    """
    is_training = phase == 'training'
    if is_training:
        # Put layers such as BatchNorm and Dropout in training mode.
        model.train()
    else:
        # Put layers such as BatchNorm and Dropout in evaluation mode.
        model.eval()

    running_loss = 0
    for inputs, targets in tqdm(data_loader):
        # The batch must live on the same device as the model.
        inputs = inputs.to(device)
        targets = targets.to(device)

        if is_training:
            # Clear gradients left over from the previous batch.
            optimizer.zero_grad()

        # Build the autograd graph only when training; during validation
        # both the forward pass and the loss are computed without it.
        with torch.set_grad_enabled(is_training):
            outputs = model(inputs)
            # The criterion takes exactly (inputs, outputs, targets); passing
            # an extra ``separate_loss`` keyword would raise a TypeError.
            loss = criterion(inputs, outputs, targets)

        # Accumulate the loss of this batch into the epoch total.
        running_loss += loss.item()

        if is_training:
            loss.backward()  # back-propagate
            optimizer.step()  # update the parameters

    # Normalise by the number of samples in the training/validation set.
    epoch_loss = running_loss / len(data_loader.dataset)
    print('[%d][%s] loss: %.4f' % (epoch, phase, epoch_loss))
    return epoch_loss