说起来挺扎心的,之前做深度学习的时候,正在发愁没有数据,好不容易找到了数据,问题来了,将这些数据全部放在GPU中显然是不理智的,脑袋一拍:要不放在内存中,喂进模型的时候再转入显存?我的机器内存是128G的,但是仔细一想,这个好像也不科学,总有一天数据会超过128G。所以参考了很多大佬的做法,将自己的工作过程做个记录。
问题描述:我原来的训练代码是先把数据加载到内存中,在训练前把这些数据放到GPU上,然后训练。
我用2400张彩色眼底图像做分割训练,然后就出现了报错,具体代码如下:
def train(x_train, Fovea_x, Fovea_y, label_img, shapes, lr, epoch):
    """Train the UNet with the entire dataset preloaded onto the GPU.

    NOTE(review): this is the "before" version — it moves *all* samples to
    device memory up front, which fails once the dataset exceeds GPU/host
    memory (the problem this post is about).

    Args:
        x_train: sequence of HxWx3 images (converted to a single array).
        Fovea_x, Fovea_y: per-image fovea coordinates (regression targets).
        label_img: per-image segmentation masks.
        shapes: per-image original shapes, passed to the model.
        lr: Adam learning rate.
        epoch: number of training epochs.

    Returns:
        List of per-epoch summed loss values.
    """
    loss_train = []
    x_train = np.array(x_train)
    Fovea_x = np.array(Fovea_x)
    Fovea_y = np.array(Fovea_y)
    shapes = np.array(shapes)
    label_img = np.array(label_img)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # NHWC -> NCHW for the convolutional layers.
    x = torch.tensor(x_train, dtype=torch.float32).permute(0, 3, 1, 2)
    label_img = torch.tensor(label_img, dtype=torch.float32).long()
    shapes = torch.tensor(shapes, dtype=torch.float32)
    y_x = torch.tensor(Fovea_x, dtype=torch.float32)
    y_y = torch.tensor(Fovea_y, dtype=torch.float32)
    # Move the full dataset into device memory (the step that runs out of
    # memory on large datasets).  The original delimited this note with
    # curly quotes (‘‘‘ … ’’’), which is a Python SyntaxError.
    x = x.to(device)
    y_x = y_x.to(device)
    y_y = y_y.to(device)
    shapes = shapes.to(device)
    label_img = label_img.to(device)
    torch_dataset = data.TensorDataset(x, y_x, y_y, label_img, shapes)
    loader = data.DataLoader(
        dataset=torch_dataset,
        # Fix: worker processes cannot receive CUDA tensors, so a dataset
        # already resident on the GPU must be loaded with num_workers=0.
        batch_size=12,
        shuffle=True,
        num_workers=0,
        drop_last=False,
    )
    model = UNet([3, 16, 32, 64, 128], [128, 256, 128, 64, 32])
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0)
    loss_function1 = loss_fun()
    loss_function2 = Dice()  # NOTE(review): constructed but never used below
    for i in range(epoch):
        loss_plt = 0
        for step, (batch_x, batch_yx, batch_yy, batch_label_img, batch_shapes) in enumerate(loader):
            label_img, y_x = model(batch_x, batch_shapes)
            loss = loss_function1(y_x, batch_yx, batch_yy)
            # Removed loss.requires_grad_(): if the loss were detached from
            # the graph, that call would only mask the real bug; a loss
            # computed from model outputs already requires grad.
            loss_plt += loss.item()  # .item() instead of .data.cpu().numpy()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print("Epoch:{}, Loss:{:.4f}".format(i, loss))
        loss_train.append(loss_plt)
        if i % 10 == 9:
            torch.save(model, 'model.pkl')
    return loss_train
解决方案:
出现这种情况的原因就是数据太多,根本加载不到GPU上,甚至放在内存里都不现实,这时候需要使用 PyTorch 的 DataLoader 按需加载图像。我重新写了 load_data.py,主要是实现自定义 Dataset 子类,重写其中的 `__len__()` 和 `__getitem__(item)` 方法,主要内容如下。
import os

import cv2
import numpy as np
from torch.utils.data import DataLoader, Dataset
def load_data_train(fundus_img_path, label_path, image_size=512):
    """Load one fundus image / ground-truth mask pair from disk.

    Args:
        fundus_img_path: path to the RGB fundus photograph.
        label_path: path to the ground-truth mask image.
        image_size: output side length in pixels.  The original relied on an
            undefined global ``image_size`` (NameError); it is now an
            explicit, defaulted parameter — confirm 512 matches the
            project's configured size.

    Returns:
        (img, gt_img): the resized RGB image and the resized binary {0, 1}
        mask (dark pixels <= 10 are foreground).
    """
    fundus_img = cv2.imread(fundus_img_path)  # OpenCV loads BGR
    fundus_img = cv2.cvtColor(fundus_img, cv2.COLOR_BGR2RGB)
    gt_img = cv2.imread(label_path)
    gt_img = cv2.cvtColor(gt_img, cv2.COLOR_RGB2GRAY)
    # Binarize in one vectorized pass.  This replaces the original
    # copy.deepcopy()-then-overwrite dance — ``copy`` was never imported in
    # this file, so the original raised NameError.
    gt_img = np.where(gt_img <= 10, 1, 0).astype(gt_img.dtype)
    gt_img = cv2.resize(gt_img, (image_size, image_size))
    img = cv2.resize(fundus_img, (image_size, image_size))
    return img, gt_img
class mydata(Dataset):
    """Map-style dataset that lazily loads one (image, mask) pair per index.

    Fix: inherit from ``torch.utils.data.Dataset`` instead of ``DataLoader``.
    A dataset is what gets *passed to* a DataLoader, not a DataLoader itself;
    the original only appeared to work because ``__init__`` was overridden
    and never called ``super().__init__``.
    """

    def __init__(self, image_path, label_path):
        # Directory of fundus images and directory of mask images.
        self.image_path = image_path
        self.label_path = label_path
        self.images = os.listdir(image_path)

    def __len__(self):
        # One sample per file in the image directory.
        return len(self.images)

    def __getitem__(self, item):
        image_name = os.path.join(self.image_path, self.images[item])
        # The mask shares the image's base name but always has a .png suffix.
        label_name = os.path.join(self.label_path, self.images[item][:-4] + '.png')
        return load_data_train(image_name, label_name)
然后就是修改用于训练的函数
修改为
def train(img_path, vessel_path, lebel_path, lr, epoch):
    """Train the UNet using the lazily-loading ``mydata`` Dataset.

    Args:
        img_path: directory of fundus images.
        vessel_path: NOTE(review): currently unused — the visible ``mydata``
            accepts only two paths.  Presumably vessel maps were meant to be
            a 4th input channel (UNet in-channels is 4); confirm against the
            real dataset class.
        lebel_path: directory of label masks (typo'd parameter name kept for
            backward compatibility with keyword callers).
        lr: Adam learning rate.
        epoch: number of training epochs.

    Returns:
        List of per-epoch summed loss values.
    """
    loss_train = []
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Load samples on demand via the Dataset subclass.
    # Fix: the original called mydata(img_path, vessel_path, label_path) —
    # ``label_path`` was a NameError (the parameter is ``lebel_path``) and
    # mydata.__init__ accepts only two paths.
    torch_dataset = mydata(img_path, lebel_path)
    loader = data.DataLoader(
        dataset=torch_dataset,
        batch_size=16,
        shuffle=True,
        num_workers=4,  # disk reads run in worker processes
        drop_last=False,
    )
    model = UNet([4, 16, 32, 64, 128], [128, 256, 128, 64, 32])
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0)
    loss_function = Dice()
    pool = nn.MaxPool2d(2, stride=2)  # NOTE(review): constructed but unused
    for i in range(epoch):
        loss_plt = 0
        for step, (batch_x, batch_y) in enumerate(loader):
            # The default collate_fn already yields tensors: cast/permute
            # them instead of re-wrapping with torch.tensor (which copies
            # and emits a warning).  NHWC -> NCHW.
            batch_x = batch_x.float().permute(0, 3, 1, 2).to(device)
            batch_y = batch_y.long().to(device)
            y_pred = model(batch_x)
            loss = loss_function(y_pred, batch_y)
            # Removed loss.requires_grad_() — it only hides a detached
            # graph; a loss computed from model outputs already requires
            # grad.
            loss_plt += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print("Epoch:{}, Loss:{:.4f}".format(i, loss))
        loss_train.append(loss_plt)
        if i % 10 == 9:
            torch.save(model, 'model.pkl')
    return loss_train
然后再运行代码,就没问题了。



