Hi,
I heard that MXNet enables near-linear scaling of training and I'm trying to reproduce that. A float16 ResNet-50 trained on one GPU of a p3.8xlarge, with batch size 1024, on FashionMNIST runs at 6 s/epoch. The same model on the 4 GPUs of the same p3.8xlarge, with batch size 4096, runs at 4.3 s/epoch. That's an unimpressive 28% reduction in epoch time (roughly a 1.4x speedup) for 4x the compute. Am I getting something wrong? What are the prerequisites for MXNet to demonstrate good training scaling?
Below is my code.
Single GPU:
import time

import mxnet as mx
from mxnet import autograd, gluon

ctx = mx.gpu(0)

# Parameter initialization
net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx, force_reinit=True)
net.hybridize(static_alloc=True, static_shape=True)
net.cast('float16')

# Softmax cross-entropy loss and accuracy metric
sce = gluon.loss.SoftmaxCrossEntropyLoss()
accuracy = mx.metric.Accuracy()

# Optimizer
trainer = gluon.Trainer(
    params=net.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': .1,
                      'multi_precision': True})

# Training loop
for e in range(epochs):
    tick = time.time()
    for i, (data, label) in enumerate(train_data):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        with autograd.record():
            output = net(data)
            loss = sce(output, label)
        loss.backward()
        trainer.step(data.shape[0])
        accuracy.update(label, output)
    print("Epoch {}, Acc {}, {:.4f}s".format(e, accuracy.get(), time.time() - tick))
    accuracy.reset()
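
One thing I'm not sure about: MXNet executes asynchronously, so maybe my time.time() readings are off. Would it be more correct to put an explicit mx.nd.waitall() barrier around each epoch before reading the clock, roughly like the sketch below (metric updates dropped for brevity, not what I actually ran)?

# Sketch only: block on the async engine before and after each epoch so the
# wall-clock measurement covers all GPU work that was queued during the epoch.
for e in range(epochs):
    mx.nd.waitall()          # make sure nothing from the previous epoch is still pending
    tick = time.time()
    for i, (data, label) in enumerate(train_data):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        with autograd.record():
            output = net(data)
            loss = sce(output, label)
        loss.backward()
        trainer.step(data.shape[0])
    mx.nd.waitall()          # wait for the last batch to actually finish on the GPU
    print("Epoch {}, {:.4f}s".format(e, time.time() - tick))
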
All GPUs:
ctx = [mx.gpu(i) for i in mx.test_utils.list_gpus()]

# Parameter initialization
net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx, force_reinit=True)
net.hybridize(static_alloc=True, static_shape=True)
net.cast('float16')

# Softmax cross-entropy loss and accuracy metric
sce = gluon.loss.SoftmaxCrossEntropyLoss()
accuracy = mx.metric.Accuracy()

# Optimizer
trainer = gluon.Trainer(
    params=net.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': .1,
                      'multi_precision': True})

# Training loop
for e in range(epochs):
    tick = time.time()
    for i, (data, label) in enumerate(train_data):
        # Split the global batch evenly across the GPUs
        data = gluon.utils.split_and_load(data, ctx, even_split=True)
        label = gluon.utils.split_and_load(label, ctx, even_split=True)
        with autograd.record():
            # Forward pass on each GPU; keep (prediction, label) pairs together
            outputs = [(net(d), l) for d, l in zip(data, label)]
            losses = [sce(pred, lab) for pred, lab in outputs]
        # Backward pass on each GPU
        for loss in losses:
            loss.backward()
        # Gradient update, normalized by the full (global) batch size
        trainer.step(batch_size)
        for pred, lab in outputs:
            accuracy.update(lab, pred)
    print("Epoch {}, Acc {}, {:.4f}s".format(e, accuracy.get(), time.time() - tick))
    accuracy.reset()
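
Also, to rule out the input pipeline as the bottleneck, would it make sense to benchmark on synthetic batches that already live on each GPU? Something like the sketch below; per_gpu, batches_per_epoch and the (per_gpu, 1, 28, 28) shape are placeholders (I'd match whatever train_data actually yields), not code I ran.

# Sketch only: pre-generate one random float16 batch per GPU so that the loop
# below measures forward/backward/update and gradient sync, not data loading.
per_gpu = 1024                               # 4096 total across the 4 GPUs
synth_data = [mx.nd.random.uniform(shape=(per_gpu, 1, 28, 28),
                                   ctx=c, dtype='float16') for c in ctx]
synth_label = [mx.nd.zeros((per_gpu,), ctx=c, dtype='float16')   # all class 0, speed only
               for c in ctx]

for e in range(epochs):
    mx.nd.waitall()
    tick = time.time()
    for i in range(batches_per_epoch):       # placeholder iteration count
        with autograd.record():
            losses = [sce(net(d), l) for d, l in zip(synth_data, synth_label)]
        for loss in losses:
            loss.backward()
        trainer.step(batch_size)
    mx.nd.waitall()
    print("Epoch {}, {:.4f}s".format(e, time.time() - tick))
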