Hi.
I am trying to modify the gluon.ai chapter 14 pix2pix GAN to run on multiple GPUs, following the instructions from chapter 7. However, my code is unstable and produces visually poor output. The official (single-GPU) code is in gluon.ai chapter 14 pix2pix.
Here are my modified code, the error, and a sample output picture:
m_ctx contains 4 GPUs
ctx contains 1 GPU
batch size with 4 GPUs = 40
batch size with 1 GPU = 10
def train():
    """Train the pix2pix GAN with data parallelism across the GPUs in m_ctx.

    Each batch is scattered over the contexts in ``m_ctx`` with
    ``split_and_load``; the discriminator and generator are each updated once
    per batch, gradients being aggregated automatically by the Trainers.

    Relies on module-level globals (defined elsewhere in the file): netG, netD,
    GAN_loss, L1_loss, train_data, m_ctx, batch_size, epochs, pool_size,
    ImagePool, facc, visualize.
    """
    image_pool = ImagePool(pool_size)
    metric = mx.metric.CustomMetric(facc)
    stamp = datetime.now().strftime('%Y_%m_%d-%H_%M')
    logging.basicConfig(level=logging.DEBUG)

    # Replicate the parameters onto every GPU before building the trainers.
    netG.collect_params().reset_ctx(m_ctx)
    netD.collect_params().reset_ctx(m_ctx)
    trainerG = gluon.Trainer(netG.collect_params(), 'adam',
                             {'learning_rate': 0.0002, 'beta1': 0.5})
    trainerD = gluon.Trainer(netD.collect_params(), 'adam',
                             {'learning_rate': 0.0002, 'beta1': 0.5})

    for epoch in range(epochs):
        tic = time.time()
        btic = time.time()
        train_data.reset()
        iter = 0
        for batch in train_data:
            # Scatter the batch: one shard per context in m_ctx.
            real_in = gluon.utils.split_and_load(batch.data[0], ctx_list=m_ctx)
            real_out = gluon.utils.split_and_load(batch.data[1], ctx_list=m_ctx)

            ############################
            # (1) Update D network: maximize log(D(x, y)) + log(1 - D(x, G(x, z)))
            ###########################
            fake_out = [netG(X) for X in real_in]
            fake_concat = [image_pool.query(nd.concat(X, Y, dim=1))
                           for X, Y in zip(real_in, fake_out)]
            with autograd.record():
                output_fake = [netD(X) for X in fake_concat]
                # BUG FIX: the original had a missing closing parenthesis on the
                # nd.zeros(...) call and hard-coded range(4); build the label
                # list from the outputs so any GPU count works.
                fake_label = [nd.zeros(out.shape, ctx=out.context)
                              for out in output_fake]
                errD_fake = [GAN_loss(X, Y)
                             for X, Y in zip(output_fake, fake_label)]

                real_concat = [nd.concat(X, Y, dim=1)
                               for X, Y in zip(real_in, real_out)]
                output_real = [netD(X) for X in real_concat]
                real_label = [nd.ones(out.shape, ctx=out.context)
                              for out in output_real]
                errD_real = [GAN_loss(X, Y)
                             for X, Y in zip(output_real, real_label)]
                errD = [(X + Y) * 0.5 for X, Y in zip(errD_real, errD_fake)]
                autograd.backward(errD)
            trainerD.step(batch.data[0].shape[0])
            # BUG FIX: update the metric OUTSIDE autograd.record().
            # CustomMetric.update calls .asnumpy(), which forces a device sync;
            # doing that while recording on multiple GPUs is what produced the
            # "invalid resource handle" CUDA crash in the traceback.
            metric.update(fake_label, output_fake)
            metric.update(real_label, output_real)

            ############################
            # (2) Update G network: maximize log(D(x, G(x, z))) - lambda1 * L1(y, G(x, z))
            ###########################
            with autograd.record():
                fake_out = [netG(X) for X in real_in]
                fake_concat = [nd.concat(X, Y, dim=1)
                               for X, Y in zip(real_in, fake_out)]
                output = [netD(X) for X in fake_concat]
                real_label = [nd.ones(shape=out.shape, ctx=out.context)
                              for out in output]
                # NOTE(review): the official pix2pix uses lambda1 = 100; the
                # factor 1000 here is kept as written — confirm it is intended.
                errG = [(GAN_loss(A, B) + L1_loss(C, D) * 1000)
                        for A, B, C, D in zip(output, real_label,
                                              real_out, fake_out)]
                autograd.backward(errG)
            trainerG.step(batch.data[0].shape[0])

            # Print log information every ten batches
            if iter % 10 == 0:
                name, acc = metric.get()
                logging.info('speed: {} samples/s'.format(
                    batch_size / (time.time() - btic)))
                logging.info(
                    'discriminator loss = %f, generator loss = %f, binary training acc = %f at iter %d epoch %d'
                    % (nd.mean(errD[0]).asscalar(),
                       nd.mean(errG[0]).asscalar(), acc, iter, epoch))
            iter = iter + 1
            btic = time.time()

        name, acc = metric.get()
        metric.reset()
        logging.info('\nbinary training acc at epoch %d: %s=%f' % (epoch, name, acc))
        logging.info('time: %f' % (time.time() - tic))
        # Visualize one generated image from the last batch of the epoch.
        fake_img = fake_out[0]
        visualize(fake_img[0])
        plt.show()
Error (occurs intermittently):
INFO:root:
binary training acc at epoch 22: facc=0.999336
INFO:root:time: 7.901645
Traceback (most recent call last):
File “/root/PycharmProjects/untitled/ssd_denoising/denoiser_mult.py”, line 415, in
train()
File “/root/PycharmProjects/untitled/ssd_denoising/denoiser_mult.py”, line 329, in train
metric.update([x for x in fake_label], [x for x in output])
File “/root/anaconda3/lib/python3.6/site-packages/mxnet/metric.py”, line 1376, in update
pred = pred.asnumpy()
File “/root/anaconda3/lib/python3.6/site-packages/mxnet/ndarray/ndarray.py”, line 1972, in asnumpy
ctypes.c_size_t(data.size)))
File “/root/anaconda3/lib/python3.6/site-packages/mxnet/base.py”, line 252, in check_call
raise MXNetError(py_str(_LIB.MXGetLastError()))
mxnet.base.MXNetError: [03:41:31] src/operator/tensor/./…/…/common/…/operator/mxnet_op.h:622: Check failed: err == cudaSuccess (33 vs. 0) Name: mxnet_generic_kernel ErrStr:invalid resource handle
Stack trace returned 10 entries:
[bt] (0) /root/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x382eea) [0x7f6d4dcdbeea]
[bt] (1) /root/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x383521) [0x7f6d4dcdc521]
[bt] (2) /root/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x44bcd35) [0x7f6d51e15d35]
[bt] (3) /root/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x44bddc6) [0x7f6d51e16dc6]
[bt] (4) /root/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x44c552b) [0x7f6d51e1e52b]
[bt] (5) /root/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2b88698) [0x7f6d504e1698]
[bt] (6) /root/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2ae9137) [0x7f6d50442137]
[bt] (7) /root/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2ae9137) [0x7f6d50442137]
[bt] (8) /root/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2ae9189) [0x7f6d50442189]
[bt] (9) /root/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2af2bc4) [0x7f6d5044bbc4]
Both my output and the official output below are after 100 epochs.
Here is my output; the official output is in post #2.