Error with asnumpy During EvalMetric Update

Hey everyone, I’m currently trying to implement a paper in MXNet. The paper’s idea is to segment blood vessels in the retina (paper), and its reference implementation was written in PyTorch. I found an implementation of the base model that the paper uses and tried to adapt the code to match the paper. However, I am running into the following error in the update function of EvalMetric:

<ipython-input-20-f3c422a173c1> in <module>()
    240         trainer.step(batch_size)
    241         for m in metrics:
--> 242             m.update(labels=llist[0], preds=preds)
    243 
    244 

<ipython-input-20-f3c422a173c1> in update(self, labels, preds)
    172 
    173         for l ,p in zip(labels, preds):
--> 174             lnp = l.asnumpy()
    175             pnp = p.asnumpy()
    176             pl = np.argmax(pnp, axis=0)

/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/ndarray.py in asnumpy(self)
   1978             self.handle,
   1979             data.ctypes.data_as(ctypes.c_void_p),
--> 1980             ctypes.c_size_t(data.size)))
   1981         return data
   1982 

/usr/local/lib/python3.6/dist-packages/mxnet/base.py in check_call(ret)
    250     """
    251     if ret != 0:
--> 252         raise MXNetError(py_str(_LIB.MXGetLastError()))
    253 
    254 

MXNetError: [02:52:48] src/operator/./crop-inl.h:126: Check failed: data_shape[2] >= out_shape[2] (56 vs. 65) data_shape'height should be larger than that of out_shape

From the call stack I saw that the failure happens in a check_call inside asnumpy, so I commented that call out, but then I got another error:

<ipython-input-13-f3c422a173c1> in <module>()
240         trainer.step(batch_size)
241         for m in metrics:
--> 242             m.update(labels=llist[0], preds=preds)
243 
244 

<ipython-input-13-f3c422a173c1> in update(self, labels, preds)
172 
173         for l ,p in zip(labels, preds):
--> 174             lnp = l.asnumpy()
175             pnp = p.asnumpy()
176             pl = np.argmax(pnp, axis=0)

/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/ndarray.py in asnumpy(self)
   1978             # self.handle,
   1979             # data.ctypes.data_as(ctypes.c_void_p),
--> 1980             # ctypes.c_size_t(data.size)))
   1981         return data
   1982 

/usr/local/lib/python3.6/dist-packages/mxnet/base.py in check_call(ret)
250     """
251     if ret != 0:
--> 252         raise MXNetError(py_str(_LIB.MXGetLastError()))
253 
254 

MXNetError: [02:15:09] src/nnvm/legacy_op_util.cc:134: Check failed: fwd_init_ 

What’s weird is that when I comment the check_call out on my laptop, it works, but when I do the same on Google Colab I get the error above.
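(I suspect commenting out check_call only hides the real problem: MXNet executes operators asynchronously, so asnumpy() is just the first synchronization point where a failure from an earlier operator surfaces. A minimal sketch to localize the error, using the names from the script below and forcing synchronization right after the forward pass:)

with ag.record():
    preds = net(dlist[0])
# force all pending asynchronous work to finish here; if an operator
# inside the network is the real culprit, the MXNetError should now be
# raised on this line instead of later inside asnumpy()
mx.nd.waitall()

Here is the full code (very messy, I know):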

import os

import mxnet as mx
import mxnet.ndarray as nd
import mxnet.gluon as gluon
import mxnet.gluon.nn as nn
import mxnet.autograd as ag

from mxnet.gluon.data import Dataset, DataLoader
from mxnet.gluon.loss import Loss, _apply_weighting, _reshape_like
from mxnet import image

import matplotlib.pyplot as plt
import numpy as np
import numpy.random as random
import cv2

import time


class MyDataSet(Dataset):
    def __init__(self, root, split, transform=None, use_mask=True):
        self.root = os.path.join(root, split)
        self.transform = transform

        self.img_paths = []
        self.mask_paths = []
        self.lbl_paths = []
        
        self._img = os.path.join(root, split, 'image', '{}.png')
        self._use_mask = use_mask
        if self._use_mask:
            self._mask = os.path.join(root, split, 'mask', '{}.png')
        self._lbl = os.path.join(root, split, 'label', '{}.png')
        
        for fn in os.listdir(os.path.join(root, split, 'image')):
            if len(fn) > 3 and fn[-4:] == '.png':
                self.img_paths.append(fn[:-4])
        for fn in os.listdir(os.path.join(root, split, 'mask')):
            if len(fn) > 3 and fn[-4:] == '.png':
                self.mask_paths.append(fn[:-4])
        for fn in os.listdir(os.path.join(root, split, 'label')):
            if len(fn) > 3 and fn[-4:] == '.png':
                self.lbl_paths.append(fn[:-4])
        
    def __len__(self):
        return len(self.img_paths)
    
    def __getitem__(self, idx):
        img_path = self._img.format(self.img_paths[idx])
        if self._use_mask:
            mask_path = self._mask.format(self.mask_paths[idx])
        lbl_path = self._lbl.format(self.lbl_paths[idx])

        img = cv2.imread(img_path)
        img_g = img[:,:,1]
        clahe = cv2.createCLAHE()
        img_gc = clahe.apply(img_g)
        lbl = cv2.imread(lbl_path, cv2.IMREAD_GRAYSCALE)

        all_count = np.prod(lbl.shape)
        fg_count = np.count_nonzero(lbl)
        bg_count = all_count - fg_count

        alpha = 1. / fg_count
        beta = 1. / bg_count
        # normalize so the two class weights sum to 1, using the same
        # denominator for both (otherwise beta would be normalized with
        # the already-updated alpha)
        norm = alpha + beta
        alpha = alpha / norm
        beta = beta / norm
        
        if self._use_mask:
            mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
            lbl = np.bitwise_and(mask, lbl)
            img_gc = np.bitwise_and(mask, img_gc)

        lbl = lbl / 255
        img_gc = img_gc / 255

        img_gc = img_gc[np.newaxis,...]
        img_gc = mx.nd.array(img_gc)
        lbl = mx.nd.array(lbl)

        weight = lbl * alpha + (1 - lbl) * beta
        
        return img_gc, lbl, weight


my_train = MyDataSet('/content/drive/My Drive/DRIVE', 'train')
my_valid = MyDataSet('/content/drive/My Drive/DRIVE', 'valid')

train_loader = DataLoader(my_train, batch_size=4, shuffle=True, last_batch='rollover')
valid_loader = DataLoader(my_valid, batch_size=4, shuffle=True, last_batch='rollover')

ctx = [mx.gpu(0)]

net = UNet()
net.hybridize()
net.collect_params().initialize(ctx=ctx)

num_epochs = 1
num_steps = len(my_train) // 8

trainer = gluon.Trainer(net.collect_params(), 'adam', {
    'learning_rate': 0.001,
    'wd': 0.0005,
    'lr_scheduler': mx.lr_scheduler.PolyScheduler(num_steps * num_epochs, 0.001,  2, 0.00001)
})

criterion = gluon.loss.SoftmaxCELoss(axis=1)

class SegMetric(mx.metric.EvalMetric):
    """CalculSegMetricate metrics for Seg training """
    def __init__(self, eps=1e-8, use_mask=False):
        super(SegMetric, self).__init__('Seg')
        self.eps = eps
        self.num = 2
        self.ac = 0
        self.ce = 0
        self.name = ['Accuracy_background','Accuracy_foreground']

        self.use_mask = use_mask

        self.reset()

    def reset(self):
        """
        override reset behavior
        """
        if getattr(self, 'num', None) is None:
            self.num_inst = 0
            self.sum_metric = 0.0
        else:
            self.num_inst = [0] * self.num
            self.sum_metric = [0.0] * self.num

    def update(self, labels, preds):
        """
        Implementation of updating metrics
        """
        # get generated multi label from network
        
        for l, p in zip(labels, preds):
            lnp = l.asnumpy()
            pnp = p.asnumpy()
            pl = np.argmax(pnp, axis=0)


            if self.use_mask:
                m = lnp != 255
                m255 = 255 - m*255
                pl = np.bitwise_or(pl, m255)
            
            bg_gt = lnp==0
            fg_gt = lnp==1
            
            bg = bg_gt * (pl == 0) #np.bitwise_and(bg_gt, pl==0)
            fg = fg_gt * (pl == 1) #np.bitwise_and(fg_gt, pl==1)
            
            self.sum_metric[0] += bg.sum()
            self.sum_metric[1] += fg.sum()
            # print(fg.sum())
            
            self.num_inst[0] += bg_gt.sum()
            self.num_inst[1] += fg_gt.sum()

        
    def get(self):

        if self.num is None:
            if self.num_inst == 0:
                return (self.name, float('nan'))
            else:
                return (self.name, self.sum_metric / float(self.num_inst))
        else:
            names = ['%s' % (self.name[i]) for i in range(self.num)]
            values = []
            for x, y in zip(self.sum_metric, self.num_inst):
                if y != 0:
                    values.append(x / y)
                else:
                    values.append(float('nan'))
            return (names, values)

metrics = [SegMetric(use_mask=True)]

for epoch in range(num_epochs):
    t0 = time.time()
    total_loss = 0
    for m in metrics:
        m.reset()
    for data, label, weight in train_loader:
        batch_size = data.shape[0]
        dlist = gluon.utils.split_and_load(data, ctx)
        llist = gluon.utils.split_and_load(label, ctx)
        wlist = gluon.utils.split_and_load(weight, ctx)
        with ag.record():
            preds = net(dlist[0])
            losses = []
            chan, chan2, preds_y, preds_x = preds.shape
            chan, label_y, label_x = label.shape
            xoff = (label_x - preds_x) // 2
            yoff = (label_y - preds_y) // 2
            llist[0] = llist[0][:,yoff:-yoff,xoff:-xoff]
            wlist[0] = wlist[0][:,yoff:-yoff,xoff:-xoff]
            # one loss for the single context split
            losses = [criterion(preds, llist[0], wlist[0])]
            ag.backward(losses)
        total_loss += sum([l.sum() for l in losses])
        trainer.step(batch_size)
        for m in metrics:
            m.update(labels=llist[0], preds=preds)

    
    for m in metrics:
        name, value = m.get()

    t1 = time.time()
    print(epoch, t1-t0, total_loss, name, value)
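One fragile spot in the loop above: a slice like yoff:-yoff selects nothing when the offset is 0 (lbl[:, 0:-0] is empty). A safer center-crop helper, a sketch assuming labels are NDArrays of shape (N, H, W) (center_crop_labels is my own name, not an MXNet function):

def center_crop_labels(lbl, target_h, target_w):
    """Center-crop a batch of (N, H, W) labels to (N, target_h, target_w)."""
    _, h, w = lbl.shape
    yoff = (h - target_h) // 2
    xoff = (w - target_w) // 2
    # slice with explicit end indices so a zero offset keeps the full axis
    return lbl[:, yoff:yoff + target_h, xoff:xoff + target_w]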

And here is the model:

import mxnet as mx
import mxnet.gluon.nn as nn


def ConvBlock(channels):
    # two unpadded 3x3 convolutions: each block shrinks H and W by 4
    out = nn.HybridSequential()
    out.add(
        nn.Conv2D(channels, 3),
        nn.BatchNorm(),
        nn.Activation('relu'),
        nn.Conv2D(channels, 3),
        nn.BatchNorm(),
        nn.Activation('relu')
    )
    return out

class up_block(nn.HybridBlock):
    def __init__(self, channels, **kwargs):
        super(up_block, self).__init__(**kwargs)
        self.upsampler = nn.Conv2DTranspose(channels=channels // 2, kernel_size=2, strides=2, weight_initializer=mx.init.Bilinear())
        self.upsampler.collect_params().setattr('grad_req', 'null')  # keep the bilinear weights fixed

        self.conv1 = ConvBlock(channels)

    def hybrid_forward(self, F, x, s):
        x = self.upsampler(x)
        
        # center-crop the upsampled x to the spatial size of the skip
        # connection s; this is the op behind the crop-inl.h error above
        x = F.Crop(*[x, s], center_crop=True)
        x = F.concat(s, x, dim=1)

        out = self.conv1(x)
        return out

class UNet(nn.HybridBlock):
    def __init__(self, first_channels=64, **kwargs):
        super(UNet, self).__init__(**kwargs)
        with self.name_scope():
            self.d0 = ConvBlock(first_channels)
            
            self.d1 = nn.HybridSequential()
            self.d1.add(nn.MaxPool2D(2,2), ConvBlock(first_channels*2))
            
            self.d2 = nn.HybridSequential()
            self.d2.add(nn.MaxPool2D(2,2), ConvBlock(first_channels*2**2))
            
            self.d3 = nn.HybridSequential()
            self.d3.add(nn.MaxPool2D(2,2), ConvBlock(first_channels*2**3))
            
            self.d4 = nn.HybridSequential()
            self.d4.add(nn.MaxPool2D(2,2), ConvBlock(first_channels*2**4))
            
            self.u3 = up_block(first_channels*2**3)
            self.u2 = up_block(first_channels*2**2)
            self.u1 = up_block(first_channels*2)
            self.u0 = up_block(first_channels)
            
            self.conv = nn.Conv2D(2,1)

    def hybrid_forward(self, F, x):
        x0 = self.d0(x)
        x1 = self.d1(x0)
        x2 = self.d2(x1)
        x3 = self.d3(x2)
        x4 = self.d4(x3)

        y3 = self.u3(x4,x3)
        y2 = self.u2(y3,x2)
        y1 = self.u1(y2,x1)
        y0 = self.u0(y1,x0)
        
        out = self.conv(y0)
        
        return out

I am using the DRIVE dataset as input to the network; its images are 565×584 pixels.
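Tracing the spatial sizes through the model above seems to explain the crop numbers: the unpadded 3×3 convolutions shrink each side by 2, and the 2×2/stride-2 pooling floors odd sizes, so the upsampled path ends up smaller than the skip connection it is cropped against. A quick sanity check using only the layer arithmetic of the model:

def convblock(s):       # two unpadded 3x3 convs
    return s - 4

def maxpool(s):         # 2x2 max pool, stride 2
    return (s - 2) // 2 + 1

x = convblock(584)      # d0 on the DRIVE image height
skips = [x]
for _ in range(4):      # d1 .. d4
    x = convblock(maxpool(x))
    skips.append(x)
up = skips[4] * 2       # u3: 2x2 stride-2 transposed conv on x4
print(up, 'vs', skips[3])   # prints: 56 vs 65

Those are exactly the numbers in the error message (56 vs. 65), so the input size simply isn’t compatible with this valid-convolution U-Net; padding the convolutions or cropping/resizing the inputs to a compatible size should avoid the failed check.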

Update:

Here’s a more cleaned up code:

import os

import mxnet as mx
import mxnet.ndarray as nd
import mxnet.gluon as gluon
import mxnet.gluon.nn as nn
import mxnet.autograd as ag

from mxnet.gluon.data import Dataset, DataLoader
from mxnet.gluon.loss import Loss, _apply_weighting, _reshape_like
from mxnet import image

import matplotlib.pyplot as plt
import numpy as np
import numpy.random as random
import cv2

import time
from model_unet3 import UNet

class MyDataSet(Dataset):
    def __init__(self, root, split, transform=None, use_mask=True):
        self.root = os.path.join(root, split)
        self.transform = transform

        self.img_paths = []
        self.mask_paths = []
        self.lbl_paths = []
        
        self._img = os.path.join(root, split, 'image', '{}.png')
        self._use_mask = use_mask
        if self._use_mask:
            self._mask = os.path.join(root, split, 'mask', '{}.png')
        self._lbl = os.path.join(root, split, 'label', '{}.png')
        
        img_list = os.listdir(os.path.join(root, split, 'image'))
        img_list.sort()
        lbl_list = os.listdir(os.path.join(root, split, 'label'))
        lbl_list.sort()
        if self._use_mask:
            mask_list = os.listdir(os.path.join(root, split, 'mask'))
            mask_list.sort()

        for fn in img_list:
            if len(fn) > 3 and fn[-4:] == '.png':
                self.img_paths.append(fn[:-4])
        if self._use_mask:
            for fn in mask_list:
                if len(fn) > 3 and fn[-4:] == '.png':
                    self.mask_paths.append(fn[:-4])
        for fn in lbl_list:
            if len(fn) > 3 and fn[-4:] == '.png':
                self.lbl_paths.append(fn[:-4])
        
    def __len__(self):
        return len(self.img_paths)
    
    def __getitem__(self, idx):
        img_path = self._img.format(self.img_paths[idx])
        if self._use_mask:
            mask_path = self._mask.format(self.mask_paths[idx])
        lbl_path = self._lbl.format(self.lbl_paths[idx])

        img = cv2.imread(img_path)
        img_g = img[:,:,1]
        clahe = cv2.createCLAHE()
        img_gc = clahe.apply(img_g)
        lbl = cv2.imread(lbl_path, cv2.IMREAD_GRAYSCALE)
        
        if self._use_mask:
            mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
            lbl = np.bitwise_and(mask, lbl)
            img_gc = np.bitwise_and(mask, img_gc)

        lbl = lbl / 255
        img_gc = img_gc / 255
        lbl = lbl[4:-4,4:-4]
        img_gc = img_gc[np.newaxis,...]
        lbl = lbl.astype('float32')
        img_gc = img_gc.astype('float32')
        
        return img_gc, lbl

class SegMetric(mx.metric.EvalMetric):

    def __init__(self, use_mask=False):
        super(SegMetric, self).__init__('Seg')
        self.num = 2
        self.name = ['Accuracy_background','Accuracy_foreground']
        self.use_mask = use_mask
        self.reset()

    def reset(self):

        if getattr(self, 'num', None) is None:
            self.num_inst = 0
            self.sum_metric = 0.0
        else:
            self.num_inst = [0] * self.num
            self.sum_metric = [0.0] * self.num

    def update(self, labels, preds):
        
        for l, p in zip(labels, preds):
            l = l.asnumpy()
            p = p.asnumpy()
            pl = np.argmax(p, axis=0)
            bg_gt = l==0
            fg_gt = l==1
            
            bg = bg_gt * (pl == 0)
            fg = fg_gt * (pl == 1)
            
            self.sum_metric[0] += bg.sum()
            self.sum_metric[1] += fg.sum()
            
            self.num_inst[0] += bg_gt.sum()
            self.num_inst[1] += fg_gt.sum()

        
    def get(self):

        if self.num is None:
            if self.num_inst == 0:
                return (self.name, float('nan'))
            else:
                return (self.name, self.sum_metric / float(self.num_inst))
        else:
            names = ['%s'%(self.name[i]) for i in range(self.num)]
            values = []
            for x, y in zip(self.sum_metric, self.num_inst):
                if y != 0:
                    values.append(x / y)
                else:
                    values.append(float('nan'))
            return [names, values]

my_train = MyDataSet('/Users/pinky/Documents/hsb/nn-dl/datasets/DRIVE', 'train')
my_valid = MyDataSet('/Users/pinky/Documents/hsb/nn-dl/datasets/DRIVE', 'valid')

train_loader = DataLoader(my_train, batch_size=4, shuffle=True, last_batch='rollover')
valid_loader = DataLoader(my_valid, batch_size=4, shuffle=True, last_batch='rollover')

net = UNet()
net.initialize(mx.init.Xavier())

num_epochs = 50

trainer = gluon.Trainer(net.collect_params(), 'adam', optimizer_params={'learning_rate': 0.001,'wd': 0.0005})

criterion = gluon.loss.SoftmaxCELoss(axis=1, sparse_label=False)
metrics = [SegMetric(use_mask=True)]

for epoch in range(num_epochs):
    t0 = time.time()
    for data, label in train_loader:
        batch_size = data.shape[0]
        with ag.record():
            preds = net(data)
            loss = criterion(preds, label)
        loss.backward()
        for m in metrics:
            m.update(labels=label, preds=preds)
        trainer.step(batch_size)
    for m in metrics:
        name, value = m.get()
    t1 = time.time()
    print(epoch, t1-t0, name, value)

for data, label in valid_loader:
    preds = net(data)
    for m in metrics:
        m.update(labels=label, preds=preds)
for m in metrics:
    name, value = m.get()
print('Valid:', name, value)

It’s still not working on Colab, but I found a problem when running it on my laptop: after net(data) is called, the label gets zeroed out, which makes the loss and the metrics meaningless. I have no idea why this is happening, as I’m not getting any errors or warnings.
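Two things I want to rule out here. First, whether the label array is really being mutated; a minimal check inside the training loop above, taking a snapshot before the forward pass:

label_before = label.copy()       # snapshot before the forward pass
preds = net(data)
mx.nd.waitall()                   # make sure the forward has actually run
diff = (label - label_before).abs().sum().asscalar()
print('label changed by:', diff)  # 0.0 if nothing mutates the label

Second, if I read gluon.loss.SoftmaxCELoss right, with sparse_label=False it expects the label in the same shape as the prediction (per-class probabilities), while my label is (N, H, W) class indices. I’d either keep sparse_label=True or one-hot encode the label first, something like:

# (N, H, W) class indices -> (N, 2, H, W) one-hot probabilities
label_1h = nd.one_hot(label.astype('int32'), depth=2).transpose((0, 3, 1, 2))
loss = criterion(preds, label_1h)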

I tried to reproduce your problem, but the code does not run for me. Did you try using the image segmentation metric from GluonCV? You can find the metric definition here: https://gluon-cv.mxnet.io/_modules/gluoncv/utils/metrics/segmentation.html#SegmentationMetric
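Roughly, it is used like this (a sketch from memory, assuming two classes; please check the linked source for the exact API):

from gluoncv.utils.metrics import SegmentationMetric

metric = SegmentationMetric(nclass=2)
metric.update(labels=label, preds=preds)  # label: (N, H, W), preds: (N, C, H, W)
pixacc, miou = metric.get()               # pixel accuracy and mean IoU
print(pixacc, miou)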

I get the same error, only now it comes from inside GluonCV. Here’s the trace:

Traceback (most recent call last):
  File "/Users/pinky/.vscode/extensions/ms-python.python-2019.5.18875/pythonFiles/ptvsd_launcher.py", line 43, in <module>
    main(ptvsdArgs)
  File "/Users/pinky/.vscode/extensions/ms-python.python-2019.5.18875/pythonFiles/lib/python/ptvsd/__main__.py", line 434, in main
    run()
  File "/Users/pinky/.vscode/extensions/ms-python.python-2019.5.18875/pythonFiles/lib/python/ptvsd/__main__.py", line 312, in run_file
    runpy.run_path(target, run_name='__main__')
  File "/Users/pinky/anaconda3/lib/python3.7/runpy.py", line 263, in run_path
    pkg_name=pkg_name, script_name=fname)
  File "/Users/pinky/anaconda3/lib/python3.7/runpy.py", line 96, in _run_module_code
    mod_name, mod_spec, pkg_name, script_name)
  File "/Users/pinky/anaconda3/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/pinky/Documents/hsb/nn-dl/U_Net_master/train3.py", line 165, in <module>
    metric.update(labels=label, preds=preds)
  File "/Users/pinky/anaconda3/lib/python3.7/site-packages/gluoncv/utils/metrics/segmentation.py", line 42, in update
    evaluate_worker(self, labels, preds)
  File "/Users/pinky/anaconda3/lib/python3.7/site-packages/gluoncv/utils/metrics/segmentation.py", line 32, in evaluate_worker
    pred, label)
  File "/Users/pinky/anaconda3/lib/python3.7/site-packages/gluoncv/utils/metrics/segmentation.py", line 77, in batch_pix_accuracy
    predict = np.argmax(output.asnumpy().astype('int64'), 1) + 1
  File "/Users/pinky/anaconda3/lib/python3.7/site-packages/mxnet/ndarray/ndarray.py", line 1980, in asnumpy
    ctypes.c_size_t(data.size)))
  File "/Users/pinky/anaconda3/lib/python3.7/site-packages/mxnet/base.py", line 252, in check_call
    raise MXNetError(py_str(_LIB.MXGetLastError()))
mxnet.base.MXNetError: [06:38:23] src/operator/./crop-inl.h:126: Check failed: data_shape[2] >= out_shape[2] (56 vs. 65) data_shape'height should be larger than that of out_shape

May I know why the code isn’t running?