I’m unable to achieve any performance increase when using fp16 for inference. Some details:
Model: Gluon-based ResNet FCN (see here)
GPU: GTX 1080 ti
MXNet version: 1.3.1 (cu92 build)
CUDA version: 9.2
cuDNN version: 7
Wondering what I’m missing…
My code is:
...
net = instantiate_model(model, params, stage, class_count, checkpoint_filepath, exec_contexts)
# net is a subclass of mxnet.gluon.Block, with context == gpu(0)
net.cast(np.float16)
...
# images is a list of PIL images.
image_count = len(images)
original_size = images[0].size
batch_data = []
for idx, image in enumerate(images):
    # Remove alpha channel if necessary.
    if image.mode == 'RGBA':
        r, g, b, a = image.split()
        image = Image.merge('RGB', (r, g, b))
    image = np.array(image).astype(np.float32)
    image = mx.nd.array(image, ctx=self.exec_contexts[0])  # self.exec_contexts[0] == gpu(0)
    image = mx.image.color_normalize(image, self.channelMeans, self.channelStdDevs)
    image = mx.nd.transpose(image, (2, 0, 1))  # (h, w, c) => (c, h, w)
    image = image.astype(np.float16).expand_dims(axis=0)
    batch_data.append(image)
batch_data = mx.ndarray.concat(*batch_data, dim=0)
pred = self.net(batch_data)
# Same resample function as used in training/validation.
pred = mx.nd.contrib.BilinearResize2D(pred, original_size[1], original_size[0])
labels = []
for idx in range(image_count):
    label = np.uint8(np.squeeze(pred[idx].asnumpy().argmax(axis=0)))
    label = Image.fromarray(label)
    label.putpalette(self.palette)
    labels.append(label)
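
For completeness, here is a minimal sketch of one way to time the fp32 vs fp16 comparison (not my exact script; the batch shape, repeat count, and the time_forward helper are placeholders). The important part is calling mx.nd.waitall() around the timed region, since MXNet queues GPU work asynchronously:

import time

import mxnet as mx
import numpy as np

ctx = mx.gpu(0)

def time_forward(net, dtype, runs=50):
    # Dummy batch; the shape is a placeholder for the real input size.
    x = mx.nd.random.uniform(shape=(4, 3, 512, 512), ctx=ctx).astype(dtype)
    net.cast(dtype)
    net(x)              # warm-up pass
    mx.nd.waitall()     # make sure the warm-up has actually finished
    start = time.time()
    for _ in range(runs):
        net(x)
    mx.nd.waitall()     # wait for all queued GPU work before stopping the clock
    return (time.time() - start) / runs

# print('fp32: %.4f s/batch' % time_forward(net, np.float32))
# print('fp16: %.4f s/batch' % time_forward(net, np.float16))

Without the waitall() calls the numbers would only reflect how fast operations are queued, not how fast they execute on the GPU.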