Problem with Faster R-CNN training

Hi,

I am facing some problems with the object detection models from GluonCV.

At first I successfully applied SSD, Faster R-CNN and YOLOv3 to my own dataset. After that, I updated my dataset with more images and more classes (it went from 3 to 6), and the YOLOv3 and Faster R-CNN models stopped training.

This is what happens with Faster R-CNN: I run a minimal training loop that I adapted from the tutorials and the code runs fine, with no errors or warnings. The problem is that during training there is no activity on my GPU; it looks like nothing is happening, and the model I obtain afterwards has very low mAP, as if it had never been trained.

Basically my code runs with no errors, but nothing happens on my hardware. The same happens when I change the context to CPU.

Does anyone have any idea of what might be happening?

I am using Ubuntu 18.04 with CUDA 9.2 (tested on 10.1, same behavior), MXNet 1.4.1, a Ryzen 7 2700 and an RTX 2070.

Also, the SSD model trains and performs as expected; the problem lies only with Faster R-CNN and YOLOv3.

Here is a minimal example that reproduces the issue on my machine.

import resource
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (2 * 4096, rlimit[1]))
import mxnet as mx
from mxnet import nd
from mxnet import autograd
from mxnet import gluon
from mxnet.gluon.data import DataLoader
from gluoncv import model_zoo
import gluoncv as gcv
from gluoncv.data.transforms import presets
from gluoncv.data.batchify import Tuple, Append
from gluoncv.utils.metrics import VOCMApMetric
from tqdm import tqdm

VOC = True

if VOC:
    from gluoncv.data import VOCDetection
    trn_ds = VOCDetection(splits=[(2007, 'trainval'), (2012, 'trainval')])
    val_ds = VOCDetection(splits=[(2007, 'test')])
else:
    from traindet.utils import Dataset
    root = '/path/to/dataset'
    trn_ds = Dataset(root)
    val_ds = Dataset(root, train=False)

ctx = mx.gpu()
batch_size = 1
num_workers = 14
num_epochs = 5

# Network
net = model_zoo.get_model('faster_rcnn_resnet50_v1b_voc', pretrained_base=True, ctx=ctx)
net.initialize(force_reinit=True)

if not VOC:
    net.reset_class(classes=trn_ds.classes)
# Loaders
short, max_size = 600, 1000
train_transform = presets.rcnn.FasterRCNNDefaultTrainTransform(short, max_size, net)
batchify_fn = Tuple(*[Append() for _ in range(5)])
train_loader = DataLoader(trn_ds.transform(train_transform), batch_size, shuffle=True, batchify_fn=batchify_fn, last_batch='rollover', num_workers=num_workers)

val_transform = presets.rcnn.FasterRCNNDefaultValTransform(short, max_size)
batchify_fn = Tuple(*[Append() for _ in range(3)])
val_loader = DataLoader(val_ds.transform(val_transform), batch_size, shuffle=False, batchify_fn=batchify_fn, last_batch='keep', num_workers=num_workers)

# Losses
rpn_cls_loss = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
rpn_box_loss = gluon.loss.HuberLoss(rho=1/9.) 
rcnn_cls_loss = gluon.loss.SoftmaxCrossEntropyLoss()
rcnn_box_loss = gluon.loss.HuberLoss()

eval_metric = VOCMApMetric(iou_thresh=0.5, class_names=val_ds.classes)

# Train Loop
trainer = gluon.Trainer(
    net.collect_params(), 'sgd',
    {'learning_rate': 0.001, 'wd': 0.0005, 'momentum': 0.9})

for epoch in range(num_epochs):
    for ib, batch in tqdm(enumerate(train_loader)):
        with autograd.record():
            for data, label, rpn_cls_targets, rpn_box_targets, rpn_box_masks in zip(*batch):
                gt_label = label[:, :, 4:5]
                gt_box = label[:, :, :4]
                # network forward
                cls_preds, box_preds, roi, samples, matches, rpn_score, rpn_box, anchors = net(data, gt_box)
                # generate targets for rcnn
                cls_targets, box_targets, box_masks = net.target_generator(roi, samples, matches, gt_label, gt_box)

                # losses of rpn
                rpn_score = rpn_score.squeeze(axis=-1)
                num_rpn_pos = (rpn_cls_targets >= 0).sum()
                rpn_loss1 = rpn_cls_loss(rpn_score, rpn_cls_targets, rpn_cls_targets >= 0) * rpn_cls_targets.size / num_rpn_pos
                rpn_loss2 = rpn_box_loss(rpn_box, rpn_box_targets, rpn_box_masks) * rpn_box.size / num_rpn_pos

                # losses of rcnn
                num_rcnn_pos = (cls_targets >= 0).sum()
                rcnn_loss1 = rcnn_cls_loss(cls_preds, cls_targets, cls_targets >= 0) * cls_targets.size / cls_targets.shape[0] / num_rcnn_pos
                rcnn_loss2 = rcnn_box_loss(box_preds, box_targets, box_masks) * box_preds.size / box_preds.shape[0] / num_rcnn_pos

            # some standard gluon training steps:
            autograd.backward([rpn_loss1, rpn_loss2, rcnn_loss1, rcnn_loss2])
        trainer.step(batch_size)

Hi @czrcbl,

Using this code, all computation will be performed on the CPU. There are two things that need to be changed for this to run on the GPU:

  1. You need to make sure your model is correctly initialized on the GPU. You correctly create the model with a pretrained base on the GPU with:

net = model_zoo.get_model('faster_rcnn_resnet50_v1b_voc', pretrained_base=True, ctx=ctx)

On the next line though, you overwrite this and re-initialize all parameters on the CPU.

net.initialize(force_reinit=True)  # ctx is missing, so defaults to mx.cpu()

I don’t think you need this line. If you want to train your model from scratch, set pretrained_base=False. If you want to use a pretrained base, you don’t want to re-initialize the whole network with force_reinit=True; the non-base parameters will be randomly initialized anyway, I believe.
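
If you do keep the pretrained base, a minimal sketch of the pattern (roughly what GluonCV's own train_faster_rcnn.py script does, if I remember correctly) would be:

net = model_zoo.get_model('faster_rcnn_resnet50_v1b_voc', pretrained_base=True)
# initialize only the parameters the pretrained base did not cover
for param in net.collect_params().values():
    if param._data is None:
        param.initialize()
# then move the whole network onto the GPU
net.collect_params().reset_ctx(ctx)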

  2. You need to make sure your model inputs are on the GPU.
for ib, batch in tqdm(enumerate(train_loader)):

Arrays in batch will be on the CPU. You need to move them to the GPU with as_in_context. You might also want to do your variable unpacking here, rather than inside the autograd scope.

for ib, (data, label, rpn_cls_targets, rpn_box_targets, rpn_box_masks) in tqdm(enumerate(train_loader)):
    data = data.as_in_context(ctx)
    label = label.as_in_context(ctx)
    ...
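
One caveat: because your batchify_fn is a Tuple of Append, each element of batch is a list of per-sample NDArrays rather than one stacked array, so data above would actually be a list. A tiny helper along these lines (just a sketch, assuming a single context; move_batch is a made-up name) moves everything over while keeping that structure:

def move_batch(batch, ctx):
    # each element of batch is a list of NDArrays, one per appended sample
    return [[x.as_in_context(ctx) for x in part] for part in batch]

after which your existing zip(*batch) unpacking keeps working.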

With these two changes you should start seeing some activity on the GPU. I recommend you check out the train scripts in GluonCV rather than writing your own from scratch (unless you are learning, of course!). Good luck, Thom

Thank you for your help. I was facing problems with code in a notebook, so I tried to produce a minimal script to post here, but I ended up introducing some errors that were not in the original code; sorry for that.

I changed the script as you suggested and started to get activity on the GPU. But after a few epochs I get:
Check failed: (err) == (cudaSuccess) Name: mxnet_generic_kernel ErrStr:out of memory.

Would 8 GB of video RAM not be enough for Faster R-CNN?
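
One guess from my side: since MXNet executes operations asynchronously and my loop never reads a loss value back, maybe the frontend queues many iterations ahead of the GPU and temporaries pile up? A heavy-handed sketch of what I mean below (I suppose logging a loss with .asscalar() each iteration, like the official scripts do, would synchronize in the same way):

            autograd.backward([rpn_loss1, rpn_loss2, rcnn_loss1, rcnn_loss2])
        trainer.step(batch_size)
        # block until this iteration's work has actually finished before
        # queuing the next one (just a guess at the cause of the OOM)
        mx.nd.waitall()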

I was able to train on the CPU without memory errors, but when I try to validate the model with the function copied from the train_faster_rcnn example, the validation loop does not run. It stays stuck on the first iteration for minutes until I interrupt it with the keyboard. There is no error message.
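
To rule out the DataLoader workers (I am using num_workers = 14 with batch_size = 1), I suppose I could rebuild the validation loader from the script below without multiprocessing, something like:

# single-process loading, just to check for a worker deadlock
val_loader = DataLoader(val_ds.transform(val_transform), batch_size, shuffle=False,
                        batchify_fn=batchify_fn, last_batch='keep', num_workers=0)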

Here is the full code. I added the validate function and the split_and_load helper from the examples to properly load the batch onto the ctx. validate is called at the end of the script, which is where the execution gets stuck.

import resource
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1]))
import mxnet as mx
from mxnet import nd
from mxnet import autograd
from mxnet import gluon
from mxnet.gluon.data import DataLoader
from gluoncv import model_zoo
# from gluoncv.data import VOCDetection
import gluoncv as gcv
from gluoncv.data.transforms import presets
from gluoncv.data.batchify import Tuple, Append
from gluoncv.utils.metrics import VOCMApMetric
from tqdm import tqdm

VOC = True

if VOC:
    from gluoncv.data import VOCDetection
    trn_ds = VOCDetection(splits=[(2007, 'trainval'), (2012, 'trainval')])
    val_ds = VOCDetection(splits=[(2007, 'test')])
else:
    from traindet.utils import Dataset
    root = '/home/cezar/Desktop/FASTEN/fotos_objetos'
    trn_ds = Dataset(root)
    val_ds = Dataset(root, train=False)

ctx = [mx.cpu()]

batch_size = 1
num_workers = 14
num_epochs = 3

# Network
net = model_zoo.get_model('faster_rcnn_resnet50_v1b_voc', pretrained=True, ctx=ctx)

if not VOC:
    net.reset_class(classes=trn_ds.classes)
# Loaders
short, max_size = 600, 1000
train_transform = presets.rcnn.FasterRCNNDefaultTrainTransform(short, max_size, net)
batchify_fn = Tuple(*[Append() for _ in range(5)])
train_loader = DataLoader(trn_ds.transform(train_transform), batch_size, shuffle=True, batchify_fn=batchify_fn, last_batch='rollover', num_workers=num_workers)

val_transform = presets.rcnn.FasterRCNNDefaultValTransform(short, max_size)
batchify_fn = Tuple(*[Append() for _ in range(3)])
val_loader = DataLoader(val_ds.transform(val_transform), batch_size, shuffle=False, batchify_fn=batchify_fn, last_batch='keep', num_workers=num_workers)

# Losses
rpn_cls_loss = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
rpn_box_loss = gluon.loss.HuberLoss(rho=1/9.) 
rcnn_cls_loss = gluon.loss.SoftmaxCrossEntropyLoss()
rcnn_box_loss = gluon.loss.HuberLoss()

eval_metric = VOCMApMetric(iou_thresh=0.5, class_names=val_ds.classes)

def split_and_load(batch, ctx_list):
    """Split data to 1 batch each device."""
    new_batch = []
    for data in batch:
        new_data = [x.as_in_context(ctx) for x, ctx in zip(data, ctx_list)]
        new_batch.append(new_data)
    return new_batch


def validate(net, val_data, ctx, eval_metric):
    """Test on validation dataset."""
    clipper = gcv.nn.bbox.BBoxClipToImage()
    eval_metric.reset()
    for batch in val_data:
        batch = split_and_load(batch, ctx_list=ctx)
        det_bboxes = []
        det_ids = []
        det_scores = []
        gt_bboxes = []
        gt_ids = []
        gt_difficults = []
        for x, y, im_scale in zip(*batch):
            # get prediction results
            ids, scores, bboxes = net(x)
            det_ids.append(ids)
            det_scores.append(scores)
            # clip to image size
            det_bboxes.append(clipper(bboxes, x))
            # rescale to original resolution
            im_scale = im_scale.reshape((-1)).asscalar()
            det_bboxes[-1] *= im_scale
            # split ground truths
            gt_ids.append(y.slice_axis(axis=-1, begin=4, end=5))
            gt_bboxes.append(y.slice_axis(axis=-1, begin=0, end=4))
            gt_bboxes[-1] *= im_scale
            gt_difficults.append(y.slice_axis(axis=-1, begin=5, end=6) if y.shape[-1] > 5 else None)

        # update metric
        for det_bbox, det_id, det_score, gt_bbox, gt_id, gt_diff in zip(det_bboxes, det_ids,
                                                                        det_scores, gt_bboxes,
                                                                        gt_ids, gt_difficults):
            eval_metric.update(det_bbox, det_id, det_score, gt_bbox, gt_id, gt_diff)
    return eval_metric.get()


# Train Loop
trainer = gluon.Trainer(
    net.collect_params(), 'sgd',
    {'learning_rate': 0.001, 'wd': 0.0005, 'momentum': 0.9})

for epoch in range(num_epochs):
    for ib, batch in tqdm(enumerate(train_loader)):
        batch = split_and_load(batch, ctx_list=ctx)
        with autograd.record():
            for data, label, rpn_cls_targets, rpn_box_targets, rpn_box_masks in zip(*batch):
                gt_label = label[:, :, 4:5]
                gt_box = label[:, :, :4]
                # network forward
                cls_preds, box_preds, roi, samples, matches, rpn_score, rpn_box, anchors = net(data, gt_box)
                # generate targets for rcnn
                cls_targets, box_targets, box_masks = net.target_generator(roi, samples, matches, gt_label, gt_box)

                # losses of rpn
                rpn_score = rpn_score.squeeze(axis=-1)
                num_rpn_pos = (rpn_cls_targets >= 0).sum()
                rpn_loss1 = rpn_cls_loss(rpn_score, rpn_cls_targets, rpn_cls_targets >= 0) * rpn_cls_targets.size / num_rpn_pos
                rpn_loss2 = rpn_box_loss(rpn_box, rpn_box_targets, rpn_box_masks) * rpn_box.size / num_rpn_pos

                # losses of rcnn
                num_rcnn_pos = (cls_targets >= 0).sum()
                rcnn_loss1 = rcnn_cls_loss(cls_preds, cls_targets, cls_targets >= 0) * cls_targets.size / cls_targets.shape[0] / num_rcnn_pos
                rcnn_loss2 = rcnn_box_loss(box_preds, box_targets, box_masks) * box_preds.size / box_preds.shape[0] / num_rcnn_pos

            # some standard gluon training steps:
            autograd.backward([rpn_loss1, rpn_loss2, rcnn_loss1, rcnn_loss2])
        trainer.step(batch_size)

map_name, mean_ap = validate(net, val_loader, ctx, eval_metric)
val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
print(val_msg)