Hello everyone.
I’m trying to train a simple network (SimpNet) on ImageNet using MXNet’s GluonCV ImageNet training script.
I have followed all instructions here (dive_deep_imagenet) and used MobileNet1_0 configs for training.
This is the model that I wrote (I edited an existing mobilenet.py):
# coding: utf-8
# pylint: disable= arguments-differ,unused-argument,missing-docstring
"""SimpNet , implemented in Gluon."""
__all__ = [
'SimpNet',
'simpnet1_0',
'simpnet0_75',
'simpnet0_5',
'simpnet0_25',
'get_simpnet']
__modify__ = 'dwSun'
__modified_date__ = '18/04/18'
from mxnet.gluon import nn
from mxnet.context import cpu
from mxnet.gluon.block import HybridBlock
from mxnet import gluon
# Helpers
class RELU6(nn.HybridBlock):
    """ReLU6 activation, i.e. min(max(x, 0), 6), as used in SimpNetV2."""

    def __init__(self, **kwargs):
        super(RELU6, self).__init__(**kwargs)

    def hybrid_forward(self, F, x):
        # Clamp activations into the [0, 6] range.
        return F.clip(x, a_min=0, a_max=6, name="relu6")
# pylint: disable= too-many-arguments
def _add_conv(out, filters=1, kernel=3, stride=1, pad=1,
              active=True, relu6=False, num_sync_bn_devices=-1, dropout=False,
              dropout_rate=0):
    """Append a Conv-BN[-activation][-dropout] stack to the sequential `out`.

    Parameters
    ----------
    out : nn.HybridSequential
        Container the layers are appended to (modified in place).
    filters : int
        Number of convolution output channels.
    kernel : int
        Convolution kernel size.
    stride : int
        Convolution stride.
    pad : int
        Convolution padding.
    active : bool
        Whether to append an activation after the batch norm.
    relu6 : bool
        Use RELU6 instead of plain ReLU when `active` is True.
    num_sync_bn_devices : int
        Use SyncBatchNorm across this many devices when >= 2; plain
        BatchNorm otherwise.
    dropout : bool
        Whether to append a Dropout layer after the activation.
    dropout_rate : float, default 0
        Drop probability for the optional Dropout layer. The default of 0
        keeps the previous (no-op) behavior; pass a value > 0 to actually
        drop units.
    """
    out.add(nn.Conv2D(channels=filters,
                      kernel_size=kernel,
                      strides=stride,
                      padding=pad,
                      use_bias=False))
    # SyncBatchNorm only pays off when training spans 2+ devices.
    if num_sync_bn_devices <= 1:
        out.add(nn.BatchNorm(scale=True))
    else:
        out.add(gluon.contrib.nn.SyncBatchNorm(scale=True, num_devices=num_sync_bn_devices))
    if active:
        out.add(RELU6() if relu6 else nn.Activation('relu'))
    if dropout:
        out.add(nn.Dropout(dropout_rate))
# Net
class SimpNet(HybridBlock):
    r"""SimpNet model.

    Parameters
    ----------
    multiplier : float, default 1.0
        The width multiplier for controlling the model size. Only multipliers that are no
        less than 0.25 are supported. The actual number of channels is equal to the original
        channel size multiplied by this multiplier.
    classes : int, default 1000
        Number of classes for the output layer.
    num_sync_bn_devices : int, default is -1
        Number of devices for training. If `num_sync_bn_devices < 2`, SyncBatchNorm is disabled.
    network_idx : int, default 0
        Selects the configuration: 0 -> 'simpnet5m', 1 -> 'simpnet8m'.
    s_mode : int, default 2
        Selects one of the predefined per-layer stride schedules in `self.strides`.
    """

    def __init__(self, multiplier=1.0, classes=1000, num_sync_bn_devices=-1, network_idx=0, s_mode=2, **kwargs):
        super(SimpNet, self).__init__(**kwargs)
        # Layer plan: ['C', channels] is a 3x3 conv block, ['P'] is a max-pool marker.
        self.cfg = {
            'simpnet5m': [['C', 66], ['C', 128], ['C', 128], ['C', 128], ['C', 192], ['C', 192], ['C', 192], ['C', 192], ['C', 192], ['C', 288], ['P'], ['C', 288], ['C', 355], ['C', 432]],
            'simpnet8m': [['C', 128], ['C', 182], ['C', 182], ['C', 182], ['C', 182], ['C', 182], ['C', 182], ['C', 182], ['C', 182], ['C', 430], ['P'], ['C', 430], ['C', 455], ['C', 600]]}
        self.scale = multiplier
        self.networks = ['simpnet5m', 'simpnet8m']
        self.network_idx = network_idx
        self.mode = s_mode
        # Stride schedules; convs past the end of the chosen schedule use stride 1.
        self.strides = {1: [2, 2, 2, 1, 1],     # s1
                        2: [2, 2, 1, 2, 1, 1],  # s4
                        3: [2, 2, 1, 1, 2, 1],  # s3
                        4: [2, 1, 2, 1, 2, 1],  # s5
                        5: [2, 1, 2, 1, 2, 1, 1]}  # s6
        with self.name_scope():
            self.features = nn.HybridSequential(prefix='')
            with self.features.name_scope():
                idx = 0  # counts conv ('C') entries emitted so far
                for x in self.cfg[self.networks[self.network_idx]]:
                    # Insert a pooling stage once the stride schedule is exhausted,
                    # or wherever the config places an explicit 'P' marker.
                    if idx == len(self.strides[self.mode]) or x[0] == 'P':
                        self.features.add(nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2), padding=0, layout='NCHW', ceil_mode=False))
                        self.features.add(nn.Dropout(0))
                    if x[0] != 'C':
                        continue
                    filters = round(x[1] * self.scale)
                    if idx < len(self.strides[self.mode]):
                        stride = self.strides[self.mode][idx]
                    else:
                        stride = 1
                    # The last scheduled conv and convs 9 and 12 skip the trailing dropout.
                    if idx in (len(self.strides[self.mode]) - 1, 9, 12):
                        _add_conv(self.features, filters=int(filters), kernel=3, pad=1, stride=stride,
                                  num_sync_bn_devices=num_sync_bn_devices)
                    else:
                        _add_conv(self.features, filters=int(filters), kernel=3, pad=1, stride=stride,
                                  num_sync_bn_devices=num_sync_bn_devices, dropout=True)
                    idx += 1
                self.features.add(nn.GlobalMaxPool2D())
                self.features.add(nn.Dropout(0))
                self.features.add(nn.Flatten())
            self.output = nn.Dense(classes)

    def hybrid_forward(self, F, x):
        x = self.features(x)
        x = self.output(x)
        return x
# Constructor
def get_simpnet(multiplier, pretrained=False, ctx=cpu(),
                root='~/.mxnet/models', num_sync_bn_devices=-1, **kwargs):
    r"""Construct a SimpNet with the given width multiplier.

    Parameters
    ----------
    multiplier : float
        The width multiplier for controlling the model size. Only multipliers that are no
        less than 0.25 are supported. The actual number of channels is equal to the original
        channel size multiplied by this multiplier.
    pretrained : bool or str
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_sync_bn_devices : int, default is -1
        Number of devices for training. If `num_sync_bn_devices < 2`, SyncBatchNorm is disabled.
    """
    net = SimpNet(multiplier, num_sync_bn_devices=num_sync_bn_devices, **kwargs)
    if pretrained:
        from .model_store import get_model_file
        # Model files are tagged '1.0' / '0.5' rather than '1.00' / '0.50'.
        version_suffix = '%.2f' % multiplier
        if version_suffix in ('1.00', '0.50'):
            version_suffix = version_suffix[:-1]
        param_file = get_model_file('SimpNet%s' % version_suffix,
                                    tag=pretrained, root=root)
        net.load_parameters(param_file, ctx=ctx)
        # Attach ImageNet class metadata to the loaded model.
        from ..data import ImageNet1kAttr
        attrib = ImageNet1kAttr()
        net.synset = attrib.synset
        net.classes = attrib.classes
        net.classes_long = attrib.classes_long
    return net
def simpnet1_0(**kwargs):
    r"""SimpNet with width multiplier 1.0.

    Parameters
    ----------
    pretrained : bool or str
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    num_sync_bn_devices : int, default is -1
        Number of devices for training. If `num_sync_bn_devices < 2`, SyncBatchNorm is disabled.
    """
    return get_simpnet(1.0, **kwargs)
def simpnet0_75(**kwargs):
    r"""SimpNet with width multiplier 0.75.

    Parameters
    ----------
    pretrained : bool or str
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    num_sync_bn_devices : int, default is -1
        Number of devices for training. If `num_sync_bn_devices < 2`, SyncBatchNorm is disabled.
    """
    return get_simpnet(0.75, **kwargs)
def simpnet0_5(**kwargs):
    r"""SimpNet with width multiplier 0.5.

    Parameters
    ----------
    pretrained : bool or str
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    num_sync_bn_devices : int, default is -1
        Number of devices for training. If `num_sync_bn_devices < 2`, SyncBatchNorm is disabled.
    """
    return get_simpnet(0.5, **kwargs)
def simpnet0_25(**kwargs):
    r"""SimpNet with width multiplier 0.25.

    Parameters
    ----------
    pretrained : bool or str
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    num_sync_bn_devices : int, default is -1
        Number of devices for training. If `num_sync_bn_devices < 2`, SyncBatchNorm is disabled.
    """
    return get_simpnet(0.25, **kwargs)
I have put the new network definition next to the other networks inside model_zoo,
and then executed the following command from
/media/master/tmpstore/MxNet/gluon-cv-master/scripts/classification/imagenet
like this:
SMODE=2
NetworkIDX=0
python train_imagenet.py \
--rec-train /media/master/SSD/ImageNet_DataSet/train/rec_train/train.rec --rec-train-idx /media/master/SSD/ImageNet_DataSet/train/rec_train/train.idx \
--rec-val /media/master/SSD/ImageNet_DataSet/train/rec_val/val.rec --rec-val-idx /media/master/SSD/ImageNet_DataSet/train/rec_val/val.idx \
--model simpnet1.0 --mode hybrid \
--lr 0.4 --lr-mode cosine --num-epochs 200 --batch-size 256 --num-gpus 1 -j 20 \
--use-rec --dtype float16 --warmup-epochs 5 --no-wd --label-smoothing --mixup \
--save-dir params_simpnet1.0_mixup \
--logging-file simpnet1.0_mixup.log --netidx $NetworkIDX --smode $SMODE
However, whenever I try to run the training I get the following error:
master@master:/media/master/tmpstore/MxNet/gluon-cv-master/scripts/classification/imagenet$ bash imagenet_train.sh
/home/master/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
from ._conv import register_converters as _register_converters
Namespace(batch_norm=False, batch_size=256, crop_ratio=0.875, data_dir='~/.mxnet/datasets/imagenet', dtype='float16', input_size=224, label_smoothing=True, last_gamma=False, log_interval=50, logging_file='simpnet1.0_mixup.log', lr=0.4, lr_decay=0.1, lr_decay_epoch='40,60', lr_decay_period=0, lr_mode='cosine', mixup=True, mixup_alpha=0.2, mixup_off_epoch=0, mode='hybrid', model='simpnet1.0', momentum=0.9, netidx=0, no_wd=True, num_epochs=200, num_gpus=1, num_workers=20, rec_train='/media/master/SSD/ImageNet_DataSet/train/rec_train/train.rec', rec_train_idx='/media/master/SSD/ImageNet_DataSet/train/rec_train/train.idx', rec_val='/media/master/SSD/ImageNet_DataSet/train/rec_val/val.rec', rec_val_idx='/media/master/SSD/ImageNet_DataSet/train/rec_val/val.idx', resume_epoch=0, resume_params='', resume_states='', save_dir='params_simpnet1.0_mixup', save_frequency=10, smode=2, use_pretrained=False, use_rec=True, use_se=False, warmup_epochs=5, warmup_lr=0.0, wd=0.0001)
[08:57:34] src/io/iter_image_recordio_2.cc:170: ImageRecordIOParser2: /media/master/SSD/ImageNet_DataSet/train/rec_train/train.rec, use 3 threads for decoding..
[08:57:36] src/io/iter_image_recordio_2.cc:170: ImageRecordIOParser2: /media/master/SSD/ImageNet_DataSet/train/rec_val/val.rec, use 3 threads for decoding..
infer_shape error. Arguments:
data: (256, 3, 224, 224)
Traceback (most recent call last):
File "/home/master/anaconda3/lib/python3.6/site-packages/mxnet/gluon/block.py", line 805, in _call_cached_op
for is_arg, i in self._cached_op_args]
File "/home/master/anaconda3/lib/python3.6/site-packages/mxnet/gluon/block.py", line 805, in <listcomp>
for is_arg, i in self._cached_op_args]
File "/home/master/anaconda3/lib/python3.6/site-packages/mxnet/gluon/parameter.py", line 494, in data
return self._check_and_get(self._data, ctx)
File "/home/master/anaconda3/lib/python3.6/site-packages/mxnet/gluon/parameter.py", line 208, in _check_and_get
"num_features, etc., for network layers."%(self.name))
mxnet.gluon.parameter.DeferredInitializationError: Parameter 'simpnet0_conv0_weight' has not been initialized yet because initialization was deferred. Actual initialization happens during the first forward pass. Please pass one batch of data through the network before accessing Parameters. You can also avoid deferred initialization by specifying in_units, num_features, etc., for network layers.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/master/anaconda3/lib/python3.6/site-packages/mxnet/gluon/block.py", line 791, in _deferred_infer_shape
self.infer_shape(*args)
File "/home/master/anaconda3/lib/python3.6/site-packages/mxnet/gluon/block.py", line 864, in infer_shape
self._infer_attrs('infer_shape', 'shape', *args)
File "/home/master/anaconda3/lib/python3.6/site-packages/mxnet/gluon/block.py", line 853, in _infer_attrs
**{i.name: getattr(j, attr) for i, j in zip(inputs, args)})
File "/home/master/anaconda3/lib/python3.6/site-packages/mxnet/symbol/symbol.py", line 996, in infer_shape
res = self._infer_shape_impl(False, *args, **kwargs)
File "/home/master/anaconda3/lib/python3.6/site-packages/mxnet/symbol/symbol.py", line 1126, in _infer_shape_impl
ctypes.byref(complete)))
File "/home/master/anaconda3/lib/python3.6/site-packages/mxnet/base.py", line 251, in check_call
raise MXNetError(py_str(_LIB.MXGetLastError()))
mxnet.base.MXNetError: Error in operator simpnet0_conv1_fwd: [08:57:37] src/operator/nn/convolution.cc:145: Check failed: dshp.ndim() == 4U (2 vs. 4) Input data should be 4D in batch-num_filter-y-x
Stack trace returned 10 entries:
[bt] (0) /home/master/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x39008a) [0x7f290a5e608a]
[bt] (1) /home/master/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x3906c1) [0x7f290a5e66c1]
[bt] (2) /home/master/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x621961) [0x7f290a877961]
[bt] (3) /home/master/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2b528fa) [0x7f290cda88fa]
[bt] (4) /home/master/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2b5528c) [0x7f290cdab28c]
[bt] (5) /home/master/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(MXSymbolInferShape+0x15ba) [0x7f290cd1f12a]
[bt] (6) /home/master/anaconda3/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f2933c8cec0]
[bt] (7) /home/master/anaconda3/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7f2933c8c87d]
[bt] (8) /home/master/anaconda3/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7f2933ea1dee]
[bt] (9) /home/master/anaconda3/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x12825) [0x7f2933ea2825]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "train_imagenet.py", line 412, in <module>
main()
File "train_imagenet.py", line 409, in main
train(context)
File "train_imagenet.py", line 360, in train
outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
File "train_imagenet.py", line 360, in <listcomp>
outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
File "/home/master/anaconda3/lib/python3.6/site-packages/mxnet/gluon/block.py", line 542, in __call__
out = self.forward(*args)
File "/home/master/anaconda3/lib/python3.6/site-packages/mxnet/gluon/block.py", line 909, in forward
return self._call_cached_op(x, *args)
File "/home/master/anaconda3/lib/python3.6/site-packages/mxnet/gluon/block.py", line 807, in _call_cached_op
self._deferred_infer_shape(*args)
File "/home/master/anaconda3/lib/python3.6/site-packages/mxnet/gluon/block.py", line 795, in _deferred_infer_shape
raise ValueError(error_msg)
ValueError: Deferred initialization failed because shape cannot be inferred. Error in operator simpnet0_conv1_fwd: [08:57:37] src/operator/nn/convolution.cc:145: Check failed: dshp.ndim() == 4U (2 vs. 4) Input data should be 4D in batch-num_filter-y-x
Stack trace returned 10 entries:
[bt] (0) /home/master/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x39008a) [0x7f290a5e608a]
[bt] (1) /home/master/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x3906c1) [0x7f290a5e66c1]
[bt] (2) /home/master/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x621961) [0x7f290a877961]
[bt] (3) /home/master/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2b528fa) [0x7f290cda88fa]
[bt] (4) /home/master/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2b5528c) [0x7f290cdab28c]
[bt] (5) /home/master/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(MXSymbolInferShape+0x15ba) [0x7f290cd1f12a]
[bt] (6) /home/master/anaconda3/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f2933c8cec0]
[bt] (7) /home/master/anaconda3/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7f2933c8c87d]
[bt] (8) /home/master/anaconda3/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7f2933ea1dee]
[bt] (9) /home/master/anaconda3/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x12825) [0x7f2933ea2825]
GPU and Driver Status :
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 390.48 Driver Version: 390.48 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce GTX 1080 Off | 00000000:01:00.0 On | N/A |
| 0% 46C P8 13W / 200W | 368MiB / 8116MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 1091 G /usr/lib/xorg/Xorg 209MiB |
| 0 2867 G compiz 156MiB |
+-----------------------------------------------------------------------------+
mxnet version : 1.3.1
GluonCV version : 0.4.0
OS : Ubuntu 16.04.5 LTS
CPU : 4790K
RAM 16G
Any help is greatly appreciated.
Thanks a lot in advance