Hi, I am training a basic convnet on FashionMNIST.
I run this directly on an AWS EC2 p3.2xlarge (via a SageMaker notebook with the mxnet_p36 kernel). During the training loop, the GPU is less than 4% busy while all 8 CPUs are close to fully loaded, so I suspect training is not actually happening on the GPU. What is going on? How can I make use of the GPU? Cheers, Olivier
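For context, here is the kind of quick sanity check one can run first (separate from the training script below; the array size is arbitrary). A CPU-only MXNet build would raise an MXNetError on the gpu(0) allocation, so this at least confirms the installed build sees the GPU:

import mxnet as mx

print(mx.__version__)
# mx.context.num_gpus() is available in MXNet 1.3+; it should report 1 on a p3.2xlarge
print(mx.context.num_gpus())
# Allocating and computing on gpu(0) fails outright on a CPU-only build
x = mx.nd.ones((1024, 1024), ctx=mx.gpu(0))
print((x * 2).sum().asscalar())  # asscalar() forces the asynchronous GPU op to actually run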
# inspired by https://gluon.mxnet.io/chapter04_convolutional-neural-networks/cnn-gluon.html
import time
import mxnet as mx
from mxnet import nd, autograd, gluon
import numpy as np
ctx = mx.gpu(0)
# prepare dataset
batch_size = 256
def transform(data, label):
    return nd.transpose(data.astype(np.float32), (2,0,1))/255, label.astype(np.float32)
train_data = gluon.data.DataLoader(
    gluon.data.vision.FashionMNIST(train=True, transform=transform),
    batch_size, shuffle=True)
test_data = gluon.data.DataLoader(
    gluon.data.vision.FashionMNIST(train=False, transform=transform),
    batch_size, shuffle=False)
# define CNN
num_inputs = 784
num_outputs = 10
num_fc = 256
net = gluon.nn.Sequential()
with net.name_scope():
    net.add(gluon.nn.Conv2D(channels=20, kernel_size=3, activation='relu'))
    net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))
    net.add(gluon.nn.Conv2D(channels=50, kernel_size=3, activation='relu'))
    net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))
    # The Flatten layer collapses all axes, except the first one, into one axis.
    net.add(gluon.nn.Flatten())
    net.add(gluon.nn.Dense(num_fc, activation="relu"))
    net.add(gluon.nn.Dropout(.3))
    net.add(gluon.nn.Dense(num_outputs))
# Evaluation loop
def evaluate_accuracy(data_iterator, net):
    acc = mx.metric.Accuracy()
    for i, (data, label) in enumerate(data_iterator):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        output = net(data)
        predictions = nd.argmax(output, axis=1)
        acc.update(preds=predictions, labels=label)
    return acc.get()[1]
# Parameter initialization
net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
# Softmax cross-entropy loss
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
# Optimizer
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': .1})
# Training loop
epochs = 10
smoothing_constant = .01
for e in range(epochs):
    for i, (data, label) in enumerate(train_data):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        with autograd.record():
            output = net(data)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        trainer.step(data.shape[0])  # gradient step, scaled by the batch size
        ##########################
        # Keep a moving average of the losses
        ##########################
        curr_loss = nd.mean(loss).asscalar()
        moving_loss = (curr_loss if ((i == 0) and (e == 0))
                       else ((1 - smoothing_constant)
                             * moving_loss + smoothing_constant * curr_loss))
    test_accuracy = evaluate_accuracy(test_data, net)
    train_accuracy = evaluate_accuracy(train_data, net)
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s"
          % (e, moving_loss, train_accuracy, test_accuracy))
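And in case it helps, this is the kind of quick matmul comparison (matrix size and repetition count are just illustrative) that should rule out a broken CUDA setup: the GPU timing should come out far below the CPU one if the device is actually being used.

import time
import mxnet as mx
from mxnet import nd

def bench(ctx, n=2048, reps=5):
    # time an n x n matrix multiply on the given context
    a = nd.random.uniform(shape=(n, n), ctx=ctx)
    b = nd.random.uniform(shape=(n, n), ctx=ctx)
    nd.dot(a, b).wait_to_read()   # warm-up (covers lazy cuBLAS init on the GPU)
    start = time.time()
    for _ in range(reps):
        nd.dot(a, b)
    mx.nd.waitall()               # MXNet ops are asynchronous; block until all have finished
    return (time.time() - start) / reps

print("cpu: %.4f s per matmul" % bench(mx.cpu()))
print("gpu: %.4f s per matmul" % bench(mx.gpu(0)))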