手写数字识别之多GPU训练

发布时间:2024-11-23 20:54

技能训练9: 数字技能训练:数据分析 #生活技巧# #学习技巧# #技能训练指南#

内容都是百度AIstudio的内容,我只是在这里做个笔记,不是原创。

单GPU训练

从前几节的训练来看,我们无论是训练房价预测模型还是MNIST手写字符识别模型,训练好一个模型不会超过十分钟,主要原因是我们所使用的神经网络比较简单。但现实生活中,我们可能会遇到更复杂的机器学习、深度学习任务,需要运算速度更高的硬件(GPU、TPU),甚至同时使用多个机器共同训练一个任务(多卡训练和多机训练)。

飞桨动态图通过fluid.dygraph.guard(place=None)里的place参数,设置在GPU上训练还是CPU上训练,比如:

with fluid.dygraph.guard(place=fluid.CPUPlace()) #设置使用CPU资源训练神经网络。

with fluid.dygraph.guard(place=fluid.CUDAPlace(0)) #设置使用GPU资源训练神经网络,默认使用机器的第一个GPU。

import os

import random

import paddle

import paddle.fluid as fluid

from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC

import numpy as np

from PIL import Image

import gzip

import json

def load_data(mode='train', datafile='./work/mnist.json.gz', batch_size=100):
    """Build a batch generator over one split of a gzipped MNIST JSON archive.

    The archive is expected to decode to three splits (train, validation,
    evaluation); element 0 of a split holds the images and element 1 the
    labels.

    Args:
        mode: Which split to read: 'train', 'valid' or 'eval'. Only the
            'train' split is shuffled.
        datafile: Path to the gzipped JSON archive.
        batch_size: Number of samples per yielded batch (must be >= 1).

    Returns:
        A zero-argument generator function. Each call iterates the split once
        and yields ``(images, labels)`` numpy arrays: images are float32 with
        shape ``(n, 1, 28, 28)``, labels are int64 with shape ``(n, 1)``.
        The final batch may be smaller than ``batch_size``.

    Raises:
        ValueError: If ``mode`` is not a known split name or ``batch_size``
            is smaller than 1.
    """
    # Validate up front: the original fell through the if/elif chain and
    # crashed later with an unrelated UnboundLocalError on `imgs`.
    if mode not in ('train', 'valid', 'eval'):
        raise ValueError(
            "mode should be 'train', 'valid' or 'eval', but got {!r}".format(mode))
    if batch_size < 1:
        raise ValueError(
            "batch_size should be at least 1, but got {}".format(batch_size))

    print('loading mnist dataset from {} ......'.format(datafile))
    # The context manager closes the gzip handle even if decoding fails.
    with gzip.open(datafile) as archive:
        data = json.load(archive)
    train_set, val_set, eval_set = data

    IMG_ROWS = 28
    IMG_COLS = 28

    selected = {'train': train_set, 'valid': val_set, 'eval': eval_set}[mode]
    imgs = selected[0]
    labels = selected[1]

    imgs_length = len(imgs)
    assert len(imgs) == len(labels), \
        "length of train_imgs({}) should be the same as train_labels({})".format(
            len(imgs), len(labels))

    index_list = list(range(imgs_length))

    def data_generator():
        # Reshuffle on every pass so each training epoch sees a new order.
        if mode == 'train':
            random.shuffle(index_list)
        imgs_list = []
        labels_list = []
        for i in index_list:
            img = np.reshape(imgs[i], [1, IMG_ROWS, IMG_COLS]).astype('float32')
            label = np.reshape(labels[i], [1]).astype('int64')
            imgs_list.append(img)
            labels_list.append(label)
            if len(imgs_list) == batch_size:
                yield np.array(imgs_list), np.array(labels_list)
                imgs_list = []
                labels_list = []

        # Emit the leftover samples as a final, smaller batch.
        if len(imgs_list) > 0:
            yield np.array(imgs_list), np.array(labels_list)

    return data_generator

class MNIST(fluid.dygraph.Layer):
    """Small convolutional classifier for the MNIST digits.

    The network is two convolution + max-pooling stages followed by a
    fully connected layer of size 10 with a softmax activation.
    """

    def __init__(self, name_scope):
        """Create the sub-layers under the layer's full name scope."""
        super(MNIST, self).__init__(name_scope)
        scope = self.full_name()

        # Both convolution layers and both pooling layers share one setup.
        conv_cfg = dict(num_filters=20, filter_size=5, stride=1, padding=2, act='relu')
        pool_cfg = dict(pool_size=2, pool_stride=2, pool_type='max')

        # Creation order is kept as conv1, pool1, conv2, pool2, fc.
        self.conv1 = Conv2D(scope, **conv_cfg)
        self.pool1 = Pool2D(scope, **pool_cfg)
        self.conv2 = Conv2D(scope, **conv_cfg)
        self.pool2 = Pool2D(scope, **pool_cfg)
        self.fc = FC(scope, size=10, act='softmax')

    def forward(self, inputs):
        """Run ``inputs`` through both conv/pool stages and the softmax layer."""
        features = inputs
        for stage in (self.conv1, self.pool1, self.conv2, self.pool2):
            features = stage(features)
        return self.fc(features)

# Device selection: set use_gpu to True to train on CUDA device 0.
use_gpu = False
if use_gpu:
    place = fluid.CUDAPlace(0)
else:
    place = fluid.CPUPlace()

with fluid.dygraph.guard(place):
    # Build the network and switch it to training mode.
    model = MNIST("mnist")
    model.train()

    # load_data returns a generator function; calling it starts a new pass.
    train_loader = load_data('train')
    optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.01)

    EPOCH_NUM = 2
    for epoch_id in range(EPOCH_NUM):
        for batch_id, (image_data, label_data) in enumerate(train_loader()):
            # Wrap the numpy batches as dygraph variables.
            image = fluid.dygraph.to_variable(image_data)
            label = fluid.dygraph.to_variable(label_data)

            # Forward pass and mean cross-entropy loss over the batch.
            predict = model(image)
            loss = fluid.layers.cross_entropy(predict, label)
            avg_loss = fluid.layers.mean(loss)

            # Report progress every 200 batches.
            if batch_id % 200 == 0:
                print("epoch: {}, batch: {}, loss is: {}".format(epoch_id, batch_id, avg_loss.numpy()))

            # Backward pass, parameter update, then clear the gradients
            # before the next batch.
            avg_loss.backward()
            optimizer.minimize(avg_loss)
            model.clear_gradients()

    # Save the model's state dict once all epochs have finished.
    fluid.save_dygraph(model.state_dict(), 'mnist')

实际运行后,可以明显感受到GPU和CPU之间的速度差异,实测数据如下:

GPU跑,每次都在20秒左右

CPU跑,每次都在105秒左右

高下立见

网址:手写数字识别之多GPU训练 https://www.yuejiaxmz.com/news/view/219048

相关内容

智能生活垃圾检测与分类系统(UI界面+YOLOv5+训练数据集)
gpu压力测试工具
视频语音识别文字
一种实时语音识别数字的方法
基于深度学习的生活垃圾检测与分类系统(网页版+YOLOv8/v7/v6/v5代码+训练数据集)
目标检测算法: 对Faster RCNN论文的理解与实践
什么是递增组训练?递减组训练和金字塔训练法
代谢训练(MT)=代谢阻力训练(MRT) 有氧间歇训练(CRT)
智能AI语音识别算法:实现高精度语音转文字技术解析
放松训练

随便看看