Could anyone point me to the right way of enabling FP16 (float16) training together with 2-bit gradient compression? Each works on its own, but I can't get them to work simultaneously.
I tried the following command:
python train_imagenet.py --network resnet-v1 --num-layers 50 --benchmark 1 --gpus 0 --batch-size 256 --dtype float16 --disp-batches 1 --gc-type 2bit
but got the following output and error traceback:
INFO:root:start with arguments Namespace(batch_size=256, benchmark=1, brightness=0, contrast=0, data_nthreads=4, data_train=None, data_train_idx='', data_val=None, data_val_idx='', disp_batches=1, dtype='float16', fill_value=127, gc_threshold=0.5, gc_type='2bit', gpus='0', image_shape='3,224,224', initializer='default', kv_store='device', load_epoch=None, loss='', lr=0.1, lr_factor=0.1, lr_step_epochs='30,60', macrobatch_size=0, max_crop_size=-1, max_random_area=1, max_random_aspect_ratio=0, max_random_h=0, max_random_l=0, max_random_rotate_angle=0, max_random_s=0, max_random_scale=1, max_random_shear_ratio=0, min_crop_size=-1, min_random_area=1, min_random_aspect_ratio=None, min_random_scale=1, model_prefix=None, mom=0.9, monitor=0, network='resnet-v1', num_classes=1000, num_epochs=1, num_examples=2000, num_layers=50, optimizer='sgd', pad_size=0, pca_noise=0, profile_server_suffix='', profile_worker_suffix='', random_crop=0, random_mirror=0, random_resized_crop=0, rgb_mean='123.68,116.779,103.939', rgb_std='1,1,1', saturation=0, save_period=1, test_io=0, top_k=0, warmup_epochs=5, warmup_strategy='linear', wd=0.0001)
[21:07:40] /home/ubuntu/incubator-mxnet/src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:97: Running performance tests to find the best convolution algorithm, this can take a while... (set the environment variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)
Traceback (most recent call last):
File "train_imagenet.py", line 66, in <module>
fit.fit(args, sym, data.get_rec_iter)
File "/home/ubuntu/incubator-mxnet/example/image-classification/common/fit.py", line 333, in fit
monitor=monitor)
File "/home/ubuntu/incubator-mxnet/python/mxnet/module/base_module.py", line 533, in fit
self.update_metric(eval_metric, data_batch.label)
File "/home/ubuntu/incubator-mxnet/python/mxnet/module/module.py", line 775, in update_metric
self._exec_group.update_metric(eval_metric, labels, pre_sliced)
File "/home/ubuntu/incubator-mxnet/python/mxnet/module/executor_group.py", line 639, in update_metric
eval_metric.update_dict(labels_, preds)
File "/home/ubuntu/incubator-mxnet/python/mxnet/metric.py", line 350, in update_dict
metric.update_dict(labels, preds)
File "/home/ubuntu/incubator-mxnet/python/mxnet/metric.py", line 133, in update_dict
self.update(label, pred)
File "/home/ubuntu/incubator-mxnet/python/mxnet/metric.py", line 496, in update
pred_label = pred_label.asnumpy().astype('int32')
File "/home/ubuntu/incubator-mxnet/python/mxnet/ndarray/ndarray.py", line 1996, in asnumpy
ctypes.c_size_t(data.size)))
File "/home/ubuntu/incubator-mxnet/python/mxnet/base.py", line 253, in check_call
raise MXNetError(py_str(_LIB.MXGetLastError()))
mxnet.base.MXNetError: [21:07:45] /home/ubuntu/incubator-mxnet/include/mxnet/././tensor_blob.h:236: Check failed: mshadow::DataType<DType>::kFlag == type_flag_: TBlob.get_with_shape: data type do not match specified type.Expected: 2 v.s. given 0
Stack trace:
[bt] (0) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x32) [0x7f384525a832]
[bt] (1) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(float* mxnet::TBlob::dptr<float>() const+0xf3) [0x7f384527e973]
[bt] (2) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(mxnet::kvstore::Quantize2BitImpl(mshadow::Stream<mshadow::gpu>*, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, float)+0x26) [0x7f38452df226]
[bt] (3) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(+0x3085758) [0x7f38471f6758]
[bt] (4) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(+0x2f3d88e) [0x7f38470ae88e]
[bt] (5) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(mxnet::engine::ThreadedEngine::ExecuteOprBlock(mxnet::RunContext, mxnet::engine::OprBlock*)+0x53f) [0x7f38470b9a3f]
[bt] (6) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(void mxnet::engine::ThreadedEnginePerDevice::GPUWorker<(dmlc::ConcurrentQueueType)0>(mxnet::Context, bool, mxnet::engine::ThreadedEnginePerDevice::ThreadWorkerBlock<(dmlc::ConcurrentQueueType)0>*, std::shared_ptr<dmlc::ManualEvent> const&)+0x11d) [0x7f38470bd16d]
[bt] (7) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(std::_Function_handler<void (std::shared_ptr<dmlc::ManualEvent>), mxnet::engine::ThreadedEnginePerDevice::PushToExecute(mxnet::engine::OprBlock*, bool)::{lambda()#4}::operator()() const::{lambda(std::shared_ptr<dmlc::ManualEvent>)#1}>::_M_invoke(std::_Any_data const&, std::shared_ptr<dmlc::ManualEvent>&&)+0x4e) [0x7f38470bd42e]
[bt] (8) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(std::thread::_Impl<std::_Bind_simple<std::function<void (std::shared_ptr<dmlc::ManualEvent>)> (std::shared_ptr<dmlc::ManualEvent>)> >::_M_run()+0x4a) [0x7f38470b87aa]
It looks like the 2-bit quantization kernel expects float32 gradients but receives float16 ones when --dtype float16 is set — is this combination supported? Thanks!