Hi,
I am facing some problems with the object detection models from GluonCV.
At first I successfully applied SSD, Faster R-CNN and YOLOv3 to my own dataset. After that, I updated my dataset with more images and more classes (it went from 3 to 6) and the YOLOv3 and Faster R-CNN models stopped to train.
This is what happens with the Faster R-CNN: I run my minimal training loop that I adapted from the tutorials and the code runs ok, no errors or warnings. The problem is, while I am training there is no activity on my gpu, it looks like nothing is happening during training and the model that I obtain after this process has very low mAP, like it was not trained.
Basically my code is running with no errors, but nothing is happening on my hardware. The same happens when I change the context to cpu.
Does anyone have any idea of what might be happening?
I am using Ubuntu 18.04, with cuda 9.2 (Tested on 10.1, same behavior), mxnet version 1.41, Ryzen 7 2700 and a RTX2070.
Also, the SSD model trains and performs as expected, the problem lies only with Faster R-CNN and YOLOv3.
Here a minimal example that reproduces the issue on my machine.
import resource
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (2 * 4096, rlimit[1]))
import mxnet as mx
from mxnet import nd
from mxnet import autograd
from mxnet import gluon
from mxnet.gluon.data import DataLoader
from gluoncv import model_zoo
import gluoncv as gcv
from gluoncv.data.transforms import presets
from gluoncv.data.batchify import Tuple, Append
from gluoncv.utils.metrics import VOCMApMetric
from tqdm import tqdm
VOC = True
if VOC:
from gluoncv.data import VOCDetection
trn_ds = VOCDetection(splits=[(2007, 'trainval'), (2012, 'trainval')])
val_ds = VOCDetection(splits=[(2007, 'test')])
else:
from traindet.utils import Dataset
root = '/path/to/dataset'
trn_ds = Dataset(root)
val_ds = Dataset(root, train=False)
ctx = mx.gpu()
batch_size = 1
num_workers = 14
num_epochs = 5
# Network
net = model_zoo.get_model('faster_rcnn_resnet50_v1b_voc', pretrained_base=True, ctx=ctx)
net.initialize(force_reinit=True)
if not VOC:
net.reset_class(classes=trn_ds.classes)
# Loaders
short, max_size = 600, 1000
train_transform = presets.rcnn.FasterRCNNDefaultTrainTransform(short, max_size, net)
batchify_fn = Tuple(*[Append() for _ in range(5)])
train_loader = DataLoader(trn_ds.transform(train_transform), batch_size, shuffle=True, batchify_fn=batchify_fn, last_batch='rollover', num_workers=num_workers)
val_transform = presets.rcnn.FasterRCNNDefaultValTransform(short, max_size)
batchify_fn = Tuple(*[Append() for _ in range(3)])
val_loader = DataLoader(val_ds.transform(val_transform), batch_size, shuffle=False, batchify_fn=batchify_fn, last_batch='keep', num_workers=num_workers)
# Losses
rpn_cls_loss = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
rpn_box_loss = gluon.loss.HuberLoss(rho=1/9.)
rcnn_cls_loss = gluon.loss.SoftmaxCrossEntropyLoss()
rcnn_box_loss = gluon.loss.HuberLoss()
eval_metric = VOCMApMetric(iou_thresh=0.5, class_names=val_ds.classes)
# Train Loop
trainer = gluon.Trainer(
net.collect_params(), 'sgd',
{'learning_rate': 0.001, 'wd': 0.0005, 'momentum': 0.9})
for epoch in range(num_epochs):
for ib, batch in tqdm(enumerate(train_loader)):
with autograd.record():
for data, label, rpn_cls_targets, rpn_box_targets, rpn_box_masks in zip(*batch):
gt_label = label[:, :, 4:5]
gt_box = label[:, :, :4]
# network forward
cls_preds, box_preds, roi, samples, matches, rpn_score, rpn_box, anchors = net(data, gt_box)
# generate targets for rcnn
cls_targets, box_targets, box_masks = net.target_generator(roi, samples, matches, gt_label, gt_box)
# losses of rpn
rpn_score = rpn_score.squeeze(axis=-1)
num_rpn_pos = (rpn_cls_targets >= 0).sum()
rpn_loss1 = rpn_cls_loss(rpn_score, rpn_cls_targets, rpn_cls_targets >= 0) * rpn_cls_targets.size / num_rpn_pos
rpn_loss2 = rpn_box_loss(rpn_box, rpn_box_targets, rpn_box_masks) * rpn_box.size / num_rpn_pos
# losses of rcnn
num_rcnn_pos = (cls_targets >= 0).sum()
rcnn_loss1 = rcnn_cls_loss(cls_preds, cls_targets, cls_targets >= 0) * cls_targets.size / cls_targets.shape[0] / num_rcnn_pos
rcnn_loss2 = rcnn_box_loss(box_preds, box_targets, box_masks) * box_preds.size / box_preds.shape[0] / num_rcnn_pos
# some standard gluon training steps:
autograd.backward([rpn_loss1, rpn_loss2, rcnn_loss1, rcnn_loss2])
trainer.step(batch_size)