Gluon CNN training with GPU inactive while ctx = mx.gpu(0)

Hi, I am training a basic convnet on fashionmnist.

I run this on an AWS EC2 p3.2xlarge (via a SageMaker notebook with the mxnet_p36 kernel). During the training loop the GPU is less than 4% busy while all 8 CPUs are firing, so I suspect training is not actually happening on the GPU. What is going on, and how can I leverage the GPU? Cheers, Olivier
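
For reference, a quick sanity check (illustrative only) that MXNet can actually see the GPU; allocating a small array on mx.gpu(0) raises an MXNetError right away if the device is not available:

    import mxnet as mx

    # Allocating on the GPU fails immediately if the device is not visible
    # to MXNet, so this is a cheap way to rule that out.
    x = mx.nd.ones((2, 2), ctx=mx.gpu(0))
    x.wait_to_read()    # force the asynchronous allocation to complete
    print(x.context)    # expected output: gpu(0)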

    # inspired from https://gluon.mxnet.io/chapter04_convolutional-neural-networks/cnn-gluon.html
     
    import time
     
    import mxnet as mx
    from mxnet import nd, autograd, gluon
    import numpy as np
     
     
    ctx = mx.gpu(0)
     
     
    # prepare dataset
    batch_size = 256
     
    def transform(data, label):
        return nd.transpose(data.astype(np.float32), (2,0,1))/255, label.astype(np.float32)
     
    train_data = gluon.data.DataLoader(
        gluon.data.vision.FashionMNIST(train=True, transform=transform),
        batch_size, shuffle=True)
     
    test_data = gluon.data.DataLoader(
        gluon.data.vision.FashionMNIST(train=False, transform=transform),
        batch_size, shuffle=False)
     
     
     
    # define CNN
    num_inputs = 784
    num_outputs = 10
    num_fc = 256
     
    net = gluon.nn.Sequential()
     
    with net.name_scope():
        
        net.add(gluon.nn.Conv2D(channels=20, kernel_size=3, activation='relu'))
        net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))
        net.add(gluon.nn.Conv2D(channels=50, kernel_size=3, activation='relu'))
        net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))
        
        # The Flatten layer collapses all axes, except the first one, into one axis.
        net.add(gluon.nn.Flatten())
        net.add(gluon.nn.Dense(num_fc, activation="relu"))
        net.add(gluon.nn.Dropout(.3))
        net.add(gluon.nn.Dense(num_outputs))
     
     
     
    # Evaluation loop
    def evaluate_accuracy(data_iterator, net):
        acc = mx.metric.Accuracy()
        for i, (data, label) in enumerate(data_iterator):
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)
            output = net(data)
            predictions = nd.argmax(output, axis=1)
            acc.update(preds=predictions, labels=label)
        return acc.get()[1]
     
     
     
    # Parameter initialization
    net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
     
    # Softmax cross-entropy loss
    softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
     
    # Optimizer
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': .1})
     
    # Training loop
    epochs = 10
    smoothing_constant = .01
     
    for e in range(epochs):
        
        for i, (data, label) in enumerate(train_data):
            
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)
            
            with autograd.record():
                output = net(data)
                loss = softmax_cross_entropy(output, label)
                
            loss.backward()
            trainer.step(data.shape[0])
     
            ##########################
            #  Keep a moving average of the losses
            ##########################
            curr_loss = nd.mean(loss).asscalar()
            moving_loss = (curr_loss if ((i == 0) and (e == 0))
                           else ((1 - smoothing_constant) 
                                 * moving_loss + smoothing_constant * curr_loss))
     
        test_accuracy = evaluate_accuracy(test_data, net)
        train_accuracy = evaluate_accuracy(train_data, net)
        
        print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" 
              % (e, moving_loss, train_accuracy, test_accuracy))

Some additional numbers:

with ctx = mx.cpu():
all 8 CPUs firing up to 100%, GPU util = 0%, 1 epoch takes approx. 70 s

with ctx = mx.gpu(0):
all 8 CPUs busy, though less than in the CPU case, GPU util oscillating between 0% and 5%, 1 epoch takes approx. 50 s

Is this expected behavior? I am surprised that using the GPU does not significantly improve runtime.
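
A note on how these per-epoch times can be measured: MXNet executes operations asynchronously, so wall-clock numbers are only reliable if you synchronize before reading the clock. A minimal sketch, reusing net, trainer, train_data, ctx, autograd and softmax_cross_entropy from the script above:

    import time
    import mxnet as mx

    tick = time.time()
    for data, label in train_data:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        with autograd.record():
            loss = softmax_cross_entropy(net(data), label)
        loss.backward()
        trainer.step(data.shape[0])
    mx.nd.waitall()    # block until all pending CPU/GPU work has finished
    print("epoch time: %.1f s" % (time.time() - tick))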

I can refer you to this repo and the accompanying video for optimizing Gluon performance. In the meantime, here is your script with a couple of optimizations applied:


import time

import mxnet as mx
from mxnet import nd, autograd, gluon
import numpy as np


ctx = mx.gpu(0)


# prepare dataset
batch_size = 256

def transform(data, label):
    return nd.transpose(data.astype(np.float32), (2,0,1))/255, label.astype(np.float32)

train_data = gluon.data.DataLoader(
    gluon.data.vision.FashionMNIST(train=True, transform=transform),
    batch_size, shuffle=True, num_workers=8)

test_data = gluon.data.DataLoader(
    gluon.data.vision.FashionMNIST(train=False, transform=transform),
    batch_size, shuffle=False, num_workers=8)



# define CNN
num_inputs = 784
num_outputs = 10
num_fc = 256

net = gluon.nn.HybridSequential()

with net.name_scope():

    net.add(gluon.nn.Conv2D(channels=20, kernel_size=3, activation='relu'))
    net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))
    net.add(gluon.nn.Conv2D(channels=50, kernel_size=3, activation='relu'))
    net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))

    # The Flatten layer collapses all axes, except the first one, into one axis.
    net.add(gluon.nn.Flatten())
    net.add(gluon.nn.Dense(num_fc, activation="relu"))
    net.add(gluon.nn.Dropout(.3))
    net.add(gluon.nn.Dense(num_outputs))



# Evaluation loop
def evaluate_accuracy(data_iterator, net):
    acc = mx.metric.Accuracy()
    for i, (data, label) in enumerate(data_iterator):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        output = net(data)
        predictions = nd.argmax(output, axis=1)
        acc.update(preds=predictions, labels=label)
    return acc.get()[1]



# Parameter initialization
net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)

# Softmax cross-entropy loss
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()

# Optimizer
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': .1})

# Training loop
epochs = 10
smoothing_constant = .01

net.hybridize()

for e in range(epochs):
    tick = time.time()
    # reset the running loss at the start of every epoch so the printed
    # value is the mean batch loss over this epoch only
    curr_loss = mx.nd.zeros((1,), ctx=ctx)
    for i, (data, label) in enumerate(train_data):

        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)

        with autograd.record():
            output = net(data)
            loss = softmax_cross_entropy(output, label)

        loss.backward()
        trainer.step(data.shape[0])

        ##########################
        #  Keep a moving average of the losses
        ##########################
        curr_loss += nd.mean(loss)

    test_accuracy = evaluate_accuracy(test_data, net)
    train_accuracy = evaluate_accuracy(train_data, net)

    print("Epoch {}. Loss: {}, Train_acc {}, Test_acc {}, {:.4f}" 
          .format(e, curr_loss.asscalar()/len(train_data), train_accuracy, test_accuracy, time.time()-tick))

Simply adding hybridization and multiprocessing DataLoader workers increased GPU utilization by 3x for me, and throughput by 3x as well.
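
In case it helps to see them in isolation, these are the only two changes relative to the original script (a sketch; num_workers=8 simply matches the 8 vCPUs of the p3.2xlarge):

    # 1. Load and transform batches in background worker processes
    #    (same change for test_data)
    train_data = gluon.data.DataLoader(
        gluon.data.vision.FashionMNIST(train=True, transform=transform),
        batch_size, shuffle=True, num_workers=8)

    # 2. Build the model as a HybridSequential and, once the layers are
    #    added and the parameters initialized, compile it to a static graph
    net = gluon.nn.HybridSequential()
    # ... same net.add(...) calls as before ...
    net.hybridize()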

For very small networks like this one on a big GPU, you can get further improvements by increasing the batch size (and the learning rate).
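
For example (the concrete numbers below are only an illustration of scaling the two together, not values taken from the runs above):

    batch_size = 1024    # 4x the original 256, keeps the GPU busier per step
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': .4})    # scaled roughly with the batch size
    # rebuild train_data / test_data with the new batch_size before training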