Implementing Faster RCNN training loop

Hi. I am new to Gluon and I was trying to implement Faster RCNN following the tutorial on the website, but I got errors I don’t understand. Here is my code:

# (Re-)initialise every parameter with Xavier and make sure all of them are
# placed on the training context(s).
net.initialize(mx.init.Xavier(), ctx=ctx, force_reinit=True)
net_params = net.collect_params()
net_params.reset_ctx(ctx)

# Plain SGD; the hyper-parameters are defined outside this snippet.
optimizer_params = {
    'learning_rate': learning_rate,
    'wd': wd,
    'momentum': momentum,
}
trainer = gluon.Trainer(net_params, 'sgd', optimizer_params)


# LOSSES

# RPN classification: penalises wrong foreground/background predictions.
# from_sigmoid=False, so the loss applies the sigmoid to its input itself.
rpn_cls_loss = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
# RPN regression: penalises inaccurate anchor boxes (Huber with rho = 1/9,
# used here as smooth-L1).
rpn_box_loss = gluon.loss.HuberLoss(rho=1.0 / 9)
# RCNN classification: penalises wrong class predictions.
rcnn_cls_loss = gluon.loss.SoftmaxCrossEntropyLoss()
# RCNN regression: penalises inaccurate proposals (Huber with its default
# rho, used here as smooth-L1).
rcnn_box_loss = gluon.loss.HuberLoss()


# Training loop: for every batch, run the network on each element yielded by
# zip(*batch), compute the four Faster RCNN loss terms, back-propagate all of
# them and take one optimizer step.
for epoch in range(50):
    net.hybridize(static_alloc=True, static_shape=True)

    for ib, batch in enumerate(train_loader):
        # Loss terms of the current batch, one entry per element of zip(*batch).
        rpn_cls_losses = []
        rpn_box_losses = []
        rcnn_cls_losses = []
        rcnn_box_losses = []

        print(ib)

        with autograd.record():
            for data, label, rpn_cls_targets, rpn_box_targets, rpn_box_masks in zip(*batch):
                # Split the label along its last axis: entries 0-3 are used as
                # the ground-truth box, entry 4 as the ground-truth class.
                gt_label = label[:, :, 4:5]
                gt_box = label[:, :, :4]
                # network forward
                cls_preds, box_preds, roi, samples, matches, rpn_score, rpn_box, anchors = net(data, gt_box)
                # generate targets for rcnn
                cls_targets, box_targets, box_masks = net.target_generator(roi, samples, matches, gt_label, gt_box)

                # losses of rpn
                # Targets < 0 are excluded through the sample weight; the
                # result is rescaled by size / (number of targets >= 0).
                # NOTE(review): presumably this undoes the loss's own
                # averaging over all entries - confirm against the Gluon docs.
                rpn_score = rpn_score.squeeze(axis=-1)
                num_rpn_pos = (rpn_cls_targets >= 0).sum()
                rpn_loss1 = rpn_cls_loss(rpn_score, rpn_cls_targets, rpn_cls_targets >= 0) * rpn_cls_targets.size / num_rpn_pos
                rpn_loss2 = rpn_box_loss(rpn_box, rpn_box_targets, rpn_box_masks) * rpn_box.size / num_rpn_pos

                # losses of rcnn (same rescaling, additionally divided by the
                # size of the first axis)
                num_rcnn_pos = (cls_targets >= 0).sum()
                rcnn_loss1 = rcnn_cls_loss(cls_preds, cls_targets, cls_targets >= 0) * cls_targets.size / cls_targets.shape[0] / num_rcnn_pos
                rcnn_loss2 = rcnn_box_loss(box_preds, box_targets, box_masks) * box_preds.size / box_preds.shape[0] / num_rcnn_pos

                # Keep every term so that all of them reach backward() below.
                rpn_cls_losses.append(rpn_loss1)
                rpn_box_losses.append(rpn_loss2)
                rcnn_cls_losses.append(rcnn_loss1)
                rcnn_box_losses.append(rcnn_loss2)

            # Back-propagate the terms of ALL elements. Passing only
            # rpn_loss1, rpn_loss2, rcnn_loss1 and rcnn_loss2 would cover just
            # the last element of zip(*batch), because those names are rebound
            # on every iteration.
            autograd.backward(rpn_cls_losses + rpn_box_losses + rcnn_cls_losses + rcnn_box_losses)
        trainer.step(batch_size)

Here is the error message I obtained:

0
---------------------------------------------------------------------------
MXNetError                                Traceback (most recent call last)
<ipython-input-42-152aca3bcdab> in <module>
     33     net.hybridize(static_alloc=True, static_shape=True)
     34 
---> 35     for ib, batch in enumerate(train_loader):
     36         rpn_cls_losses = []
     37         rpn_box_losses = []

c:\users\123\appdata\local\programs\python\python37\lib\site-packages\mxnet\gluon\data\dataloader.py in same_process_iter()
    572             def same_process_iter():
    573                 for batch in self._batch_sampler:
--> 574                     ret = self._batchify_fn([self._dataset[idx] for idx in batch])
    575                     if self._pin_memory:
    576                         ret = _as_in_context(ret, context.cpu_pinned(self._pin_device_id))

c:\users\123\appdata\local\programs\python\python37\lib\site-packages\mxnet\gluon\data\dataloader.py in <listcomp>(.0)
    572             def same_process_iter():
    573                 for batch in self._batch_sampler:
--> 574                     ret = self._batchify_fn([self._dataset[idx] for idx in batch])
    575                     if self._pin_memory:
    576                         ret = _as_in_context(ret, context.cpu_pinned(self._pin_device_id))

c:\users\123\appdata\local\programs\python\python37\lib\site-packages\mxnet\gluon\data\dataset.py in __getitem__(self, idx)
    122         item = self._data[idx]
    123         if isinstance(item, tuple):
--> 124             return self._fn(*item)
    125         return self._fn(item)
    126 

c:\users\123\appdata\local\programs\python\python37\lib\site-packages\gluoncv\data\transforms\presets\rcnn.py in __call__(self, src, label)
    222             anchor = self._anchors[:, :, :oshape[2], :oshape[3], :].reshape((-1, 4))
    223             cls_target, box_target, box_mask = self._target_generator(
--> 224                 gt_bboxes, anchor, img.shape[2], img.shape[1])
    225         return img, bbox.astype(img.dtype), cls_target, box_target, box_mask
    226 

c:\users\123\appdata\local\programs\python\python37\lib\site-packages\mxnet\gluon\block.py in __call__(self, *args)
    538             hook(self, args)
    539 
--> 540         out = self.forward(*args)
    541 
    542         for hook in self._forward_hooks.values():

c:\users\123\appdata\local\programs\python\python37\lib\site-packages\gluoncv\model_zoo\rpn\rpn_target.py in forward(self, bbox, anchor, width, height)
    160             ious = F.where(invalid_mask, mx.nd.ones_like(ious) * -1, ious)
    161 
--> 162             samples, matches = self._sampler(ious)
    163 
    164             # training targets for RPN

c:\users\123\appdata\local\programs\python\python37\lib\site-packages\mxnet\gluon\block.py in __call__(self, *args)
    538             hook(self, args)
    539 
--> 540         out = self.forward(*args)
    541 
    542         for hook in self._forward_hooks.values():

c:\users\123\appdata\local\programs\python\python37\lib\site-packages\gluoncv\model_zoo\rpn\rpn_target.py in forward(self, ious)
     71 
     72         # subsample fg labels
---> 73         samples = samples.asnumpy()
     74         num_pos = int((samples > 0).sum())
     75         if num_pos > self._max_pos:

c:\users\123\appdata\local\programs\python\python37\lib\site-packages\mxnet\ndarray\ndarray.py in asnumpy(self)
   1990             self.handle,
   1991             data.ctypes.data_as(ctypes.c_void_p),
-> 1992             ctypes.c_size_t(data.size)))
   1993         return data
   1994 

c:\users\123\appdata\local\programs\python\python37\lib\site-packages\mxnet\base.py in check_call(ret)
    250     """
    251     if ret != 0:
--> 252         raise MXNetError(py_str(_LIB.MXGetLastError()))
    253 
    254 

MXNetError: [11:43:51] C:\Jenkins\workspace\mxnet\mxnet\src\nnvm\legacy_op_util.cc:134: Check failed: fwd_init_ 

The 0 in the output indicates that the program goes through the first iteration of the train_loader, and if I comment out the last 2 lines (autograd and trainer) the code runs, but then it obviously does not do what I want it to do. What am I doing wrong here? Thank you in advance for your time!

Hi @conradkun,

Your stack trace is a little bit skewed by the fact that the first blocking call is in the gluoncv\model_zoo\rpn\rpn_target.py file, but there seems to be some issue with the region proposal network here. Set the MXNET_ENGINE_TYPE environment variable to NaiveEngine and unhybridize your network to help with debugging. The message "Check failed: fwd_init_" isn’t telling you much right now.