Possible Memory Leak

Hi,

I am using a fixed batch size for my keras-mxnet U-Net, yet GPU memory keeps increasing until I run out. Here is the relevant code:

def BatchActivate(x):
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    return x

def convolution_block(x, filters, size, strides=(1,1), padding='same', activation=True):
    x = Conv2D(filters, size, strides=strides, padding=padding)(x)
    if activation:
        x = BatchActivate(x)
    return x

def residual_block(blockInput, num_filters=16, batch_activate=False):
    x = BatchActivate(blockInput)
    x = convolution_block(x, num_filters, (3,3))
    x = convolution_block(x, num_filters, (3,3), activation=False)
    x = Add()([x, blockInput])
    if batch_activate:
        x = BatchActivate(x)
    return x

# Build model
def build_model(input_layer, start_neurons, DropoutRatio = 0.5):
    # 101 -> 50
    conv1 = Conv2D(start_neurons * 1, (3, 3), activation=None, padding="same")(input_layer)
    conv1 = residual_block(conv1,start_neurons * 1)
    conv1 = residual_block(conv1,start_neurons * 1, True)
    print('conv1', conv1)
    pool1 = MaxPooling2D((2, 2))(conv1)
    pool1 = Dropout(DropoutRatio/2)(pool1)
    print('pool1', pool1)
    
    # 50 -> 25
    conv2 = Conv2D(start_neurons * 2, (3, 3), activation=None, padding="same")(pool1)
    conv2 = residual_block(conv2,start_neurons * 2)
    conv2 = residual_block(conv2,start_neurons * 2, True)
    print('conv2', conv2)
    pool2 = MaxPooling2D((2, 2))(conv2)
    pool2 = Dropout(DropoutRatio)(pool2)
    print('pool2', pool2)

    # 25 -> 12
    conv3 = Conv2D(start_neurons * 4, (3, 3), activation=None, padding="same")(pool2)
    conv3 = residual_block(conv3,start_neurons * 4)
    conv3 = residual_block(conv3,start_neurons * 4, True)
    print('conv3', conv3)
    pool3 = MaxPooling2D((2, 2))(conv3)
    pool3 = Dropout(DropoutRatio)(pool3)
    print('pool3', pool3)

    # 12 -> 6
    conv4 = Conv2D(start_neurons * 8, (3, 3), activation=None, padding="same")(pool3)
    conv4 = residual_block(conv4,start_neurons * 8)
    conv4 = residual_block(conv4,start_neurons * 8, True)
    print('conv4', conv4)
    pool4 = MaxPooling2D((2, 2))(conv4)
    pool4 = Dropout(DropoutRatio)(pool4)
    print('pool4', pool4)
    
    # Middle
    convm = Conv2D(start_neurons * 16, (3, 3), activation=None, padding="same")(pool4)
    convm = residual_block(convm,start_neurons * 16)
    convm = residual_block(convm,start_neurons * 16, True)
    print('convm', convm)
    
    # 6 -> 12
    deconv4 = Conv2DTranspose(start_neurons * 8, (3, 3), strides=(2, 2), padding="same")(convm)
    uconv4 = concatenate([deconv4, conv4])
    uconv4 = Dropout(DropoutRatio)(uconv4)
    print('deconv4', deconv4)
    print('uconv4', uconv4)
    
    uconv4 = Conv2D(start_neurons * 8, (3, 3), activation=None, padding="same")(uconv4)
    uconv4 = residual_block(uconv4,start_neurons * 8)
    uconv4 = residual_block(uconv4,start_neurons * 8, True)
    print('uconv4', uconv4)
    
    # 12 -> 25
    #deconv3 = Conv2DTranspose(start_neurons * 4, (3, 3), strides=(2, 2), padding="same")(uconv4)
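    # "valid" padding with a 3x3 kernel and stride 2 maps 12 -> (12-1)*2+3 = 25,
    # matching conv3; "same" padding would give 24 and the concatenate would fail.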
    deconv3 = Conv2DTranspose(start_neurons * 4, (3, 3), strides=(2, 2), padding="valid")(uconv4)
    uconv3 = concatenate([deconv3, conv3])    
    uconv3 = Dropout(DropoutRatio)(uconv3)
    print('deconv3', deconv3)
    print('uconv3', uconv3)
    
    uconv3 = Conv2D(start_neurons * 4, (3, 3), activation=None, padding="same")(uconv3)
    uconv3 = residual_block(uconv3,start_neurons * 4)
    uconv3 = residual_block(uconv3,start_neurons * 4, True)
    print('uconv3', uconv3)

    # 25 -> 50
    deconv2 = Conv2DTranspose(start_neurons * 2, (3, 3), strides=(2, 2), padding="same")(uconv3)
    uconv2 = concatenate([deconv2, conv2])
    print('deconv2', deconv2)
    print('uconv2', uconv2)
        
    uconv2 = Dropout(DropoutRatio)(uconv2)
    uconv2 = Conv2D(start_neurons * 2, (3, 3), activation=None, padding="same")(uconv2)
    uconv2 = residual_block(uconv2,start_neurons * 2)
    uconv2 = residual_block(uconv2,start_neurons * 2, True)
    print('uconv2', uconv2)
    
    # 50 -> 101
    #deconv1 = Conv2DTranspose(start_neurons * 1, (3, 3), strides=(2, 2), padding="same")(uconv2)
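    # Same trick as above: "valid" padding maps 50 -> (50-1)*2+3 = 101, matching conv1.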
    deconv1 = Conv2DTranspose(start_neurons * 1, (3, 3), strides=(2, 2), padding="valid")(uconv2)
    uconv1 = concatenate([deconv1, conv1])
    print('deconv1', deconv1)
    print('uconv1', uconv1)
    
    uconv1 = Dropout(DropoutRatio)(uconv1)
    uconv1 = Conv2D(start_neurons * 1, (3, 3), activation=None, padding="same")(uconv1)
    uconv1 = residual_block(uconv1,start_neurons * 1)
    uconv1 = residual_block(uconv1,start_neurons * 1, True)
    print('uconv1', uconv1)

    #uconv1 = Dropout(DropoutRatio/2)(uconv1)
    #output_layer = Conv2D(1, (1,1), padding="same", activation="sigmoid")(uconv1)
    output_layer_noActi = Conv2D(1, (1,1), padding="same", activation=None)(uconv1)
    output_layer =  Activation('sigmoid')(output_layer_noActi)
    
    return output_layer

def my_iou_metric_mx(label, pred):
    return K.get_iou_vector_mx(label, pred > 0.5)

def my_iou_metric_mx_0(label, pred):
    return K.get_iou_vector_mx(label, pred > 0)

#Data augmentation
x_train = np.append(x_train, [np.fliplr(x) for x in x_train], axis=0)
y_train = np.append(y_train, [np.fliplr(x) for x in y_train], axis=0)
print(x_train.shape)
print(y_valid.shape)

# model
input_layer = Input((img_size_target, img_size_target, 1))
output_layer = build_model(input_layer, 16,0.5)

model1 = Model(input_layer, output_layer)

c = optimizers.Adam(lr=0.01)
model1.compile(loss="binary_crossentropy", optimizer=c, metrics=[my_iou_metric_mx])

model_checkpoint = ModelCheckpoint(save_model_name,monitor='my_iou_metric_mx', 
                                   mode = 'max', save_best_only=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='my_iou_metric_mx', 
                              mode = 'max',factor=0.5, patience=5, min_lr=0.0001, verbose=1)

epochs = 50
batch_size = 16 #32 #128 #32
history = model1.fit(x_train, y_train,
                    validation_data=[x_valid, y_valid], 
                    epochs=epochs,
                    batch_size=batch_size,
                    callbacks=[model_checkpoint,reduce_lr], 
                    verbose=2)

which outputs:

Train on 6400 samples, validate on 800 samples
Epoch 1/50
/home/cory/incubator-mxnet/python/mxnet/module/bucketing_module.py:408: UserWarning: Optimizer created manually outside Module but rescale_grad is not normalized to 1.0/batch_size/num_workers (1.0 vs. 0.0625). Is this intended?
  force_init=force_init)
 - 2573s - loss: 0.4227 - my_iou_metric_mx: 0.3474 - val_loss: 3.2197 - val_my_iou_metric_mx: 0.3900

Epoch 00001: my_iou_metric_mx improved from -inf to 0.34744, saving model to Unet_resnet_v5.model
Epoch 2/50
 - 2577s - loss: 0.3169 - my_iou_metric_mx: 0.4870 - val_loss: 0.3166 - val_my_iou_metric_mx: 0.5243

Epoch 00002: my_iou_metric_mx improved from 0.34744 to 0.48705, saving model to Unet_resnet_v5.model
Epoch 3/50
 - 2579s - loss: 0.2803 - my_iou_metric_mx: 0.5382 - val_loss: 0.7266 - val_my_iou_metric_mx: 0.3900

Epoch 00003: my_iou_metric_mx improved from 0.48705 to 0.53817, saving model to Unet_resnet_v5.model
Epoch 4/50
 - 2577s - loss: 0.2660 - my_iou_metric_mx: 0.5613 - val_loss: 0.5543 - val_my_iou_metric_mx: 0.3865

Epoch 00004: my_iou_metric_mx improved from 0.53817 to 0.56134, saving model to Unet_resnet_v5.model
Epoch 5/50
 - 2575s - loss: 0.2541 - my_iou_metric_mx: 0.5754 - val_loss: 0.2699 - val_my_iou_metric_mx: 0.5891

Epoch 00005: my_iou_metric_mx improved from 0.56134 to 0.57538, saving model to Unet_resnet_v5.model
Epoch 6/50
 - 2584s - loss: 0.2410 - my_iou_metric_mx: 0.5763 - val_loss: 0.2421 - val_my_iou_metric_mx: 0.5548

Epoch 00006: my_iou_metric_mx improved from 0.57538 to 0.57631, saving model to Unet_resnet_v5.model
Epoch 7/50
 - 2624s - loss: 0.2294 - my_iou_metric_mx: 0.5831 - val_loss: 0.2470 - val_my_iou_metric_mx: 0.6124

Epoch 00007: my_iou_metric_mx improved from 0.57631 to 0.58308, saving model to Unet_resnet_v5.model
Epoch 8/50
 - 2613s - loss: 0.2275 - my_iou_metric_mx: 0.5865 - val_loss: 0.5051 - val_my_iou_metric_mx: 0.5329

Epoch 00008: my_iou_metric_mx improved from 0.58308 to 0.58645, saving model to Unet_resnet_v5.model
Epoch 9/50
 - 2611s - loss: 0.2152 - my_iou_metric_mx: 0.5994 - val_loss: 0.2608 - val_my_iou_metric_mx: 0.5950

Epoch 00009: my_iou_metric_mx improved from 0.58645 to 0.59938, saving model to Unet_resnet_v5.model
Epoch 10/50
 - 2613s - loss: 0.2141 - my_iou_metric_mx: 0.6039 - val_loss: 0.2263 - val_my_iou_metric_mx: 0.6195

Epoch 00010: my_iou_metric_mx improved from 0.59938 to 0.60387, saving model to Unet_resnet_v5.model
Epoch 11/50
---------------------------------------------------------------------------
MXNetError                                Traceback (most recent call last)
<ipython-input-17-509e1d31b127> in <module>()
     12                     batch_size=batch_size,
     13                     callbacks=[model_checkpoint,reduce_lr],
---> 14                     verbose=2)

/home/cory/keras-apache-mxnet/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
   1051                                         initial_epoch=initial_epoch,
   1052                                         steps_per_epoch=steps_per_epoch,
-> 1053                                         validation_steps=validation_steps)
   1054 
   1055     def evaluate(self, x=None, y=None,

/home/cory/keras-apache-mxnet/keras/engine/training_arrays.py in fit_loop(model, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)
    197                     ins_batch[i] = ins_batch[i].toarray()
    198 
--> 199                 outs = f(ins_batch)
    200                 outs = to_list(outs)
    201                 for l, o in zip(out_labels, outs):

/home/cory/keras-apache-mxnet/keras/backend/mxnet_backend.py in train_function(inputs)
   5454                 self._weights_dirty = True
   5455                 outs = self._module.get_outputs()[:self._ntrain]
-> 5456                 return [x.asnumpy().mean() for x in outs]
   5457 
   5458             self.train_function = train_function

/home/cory/keras-apache-mxnet/keras/backend/mxnet_backend.py in <listcomp>(.0)
   5454                 self._weights_dirty = True
   5455                 outs = self._module.get_outputs()[:self._ntrain]
-> 5456                 return [x.asnumpy().mean() for x in outs]
   5457 
   5458             self.train_function = train_function

/home/cory/incubator-mxnet/python/mxnet/ndarray/ndarray.py in asnumpy(self)
   1978             self.handle,
   1979             data.ctypes.data_as(ctypes.c_void_p),
-> 1980             ctypes.c_size_t(data.size)))
   1981         return data
   1982 

/home/cory/incubator-mxnet/python/mxnet/base.py in check_call(ret)
    251     """
    252     if ret != 0:
--> 253         raise MXNetError(py_str(_LIB.MXGetLastError()))
    254 
    255 

MXNetError: [15:34:24] src/storage/./pooled_storage_manager.h:119: cudaMalloc failed: out of memory

Stack trace returned 10 entries:
[bt] (0) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(dmlc::StackTrace[abi:cxx11]()+0x5b) [0x7fb4a2ee029b]
[bt] (1) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x28) [0x7fb4a2ee0e08]
[bt] (2) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::storage::GPUPooledStorageManager::Alloc(mxnet::Storage::Handle*)+0x159) [0x7fb4a5e5f9f9]
[bt] (3) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::StorageImpl::Alloc(mxnet::Storage::Handle*)+0x5d) [0x7fb4a5e61a1d]
[bt] (4) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::NDArray::CheckAndAlloc() const+0x238) [0x7fb4a2fa3d68]
[bt] (5) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(+0x36f67df) [0x7fb4a58cf7df]
[bt] (6) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) const+0x1d5) [0x7fb4a58eb995]
[bt] (7) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(std::_Function_handler<void (mxnet::RunContext), mxnet::engine::ThreadedEngine::BulkAppend(std::function<void (mxnet::RunContext)>, mxnet::Context, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&)::{lambda(mxnet::RunContext)#1}>::_M_invoke(std::_Any_data const&, mxnet::RunContext&&)+0x68) [0x7fb4a5e4c9b8]
[bt] (8) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(std::_Function_handler<void (mxnet::RunContext), mxnet::engine::ThreadedEngine::BulkAppend(std::function<void (mxnet::RunContext)>, mxnet::Context, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&)::{lambda(mxnet::RunContext)#1}>::_M_invoke(std::_Any_data const&, mxnet::RunContext&&)+0x47) [0x7fb4a5e4c997]
[bt] (9) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(std::_Function_handler<void (mxnet::RunContext), mxnet::engine::ThreadedEngine::BulkAppend(std::function<void (mxnet::RunContext)>, mxnet::Context, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&)::{lambda(mxnet::RunContext)#1}>::_M_invoke(std::_Any_data const&, mxnet::RunContext&&)+0x47) [0x7fb4a5e4c997]

Based on related issues and discussions I have seen, I think the most probable root cause is missing synchronization inside a loop, such as in my custom metric:

@keras_mxnet_symbol
def get_iou_vector_mx(A, B):
    """
    Symbolic version of:
        t = K.greater(A, 0)
        p = K.greater(B, 0)
        intersection = logical_and(t, p)
        union = logical_or(t, p)
        iou = (np.sum(intersection > 0) + 1e-10) / (np.sum(union > 0) + 1e-10)
    """
    def step(data, _):
        zero = mx.sym.zeros((1))
        t = mx.sym.broadcast_greater(data[0], zero)
        p = mx.sym.broadcast_greater(data[1], zero)
        intersection = mx.sym.broadcast_logical_and(t, p)
        union = mx.sym.broadcast_logical_or(t, p)
        iou = (mx.sym.sum(mx.sym.broadcast_greater(intersection, zero)) + mx.sym.full((1), 1e-10)) / \
              (mx.sym.sum(mx.sym.broadcast_greater(union, zero)) + mx.sym.full((1), 1e-10))
        thresholds = mx.sym.arange(0.5, 1, 0.05)
        return mx.sym.mean(mx.sym.broadcast_greater(iou, thresholds)), _

    data = [A.symbol, B.symbol]
    output, _ = mx.sym.contrib.foreach(step, data, [])
    return KerasSymbol(mx.sym.mean(output))

However, synchronization only seems to be an issue with the imperative (NDArray) API, so I'm not sure whether the "memory leak" comes from something I've done or from MXNet internals.
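One thing I am trying, on the assumption that the growth is deferred deallocation by the async engine rather than a true leak, is to force a full synchronization at the end of every epoch (a sketch; mx.nd.waitall() blocks until all pending operations finish):

import mxnet as mx
from keras.callbacks import Callback

class WaitAllCallback(Callback):
    """Block until all pending async MXNet work has completed, giving the
    engine a chance to release temporary GPU buffers."""
    def on_epoch_end(self, epoch, logs=None):
        mx.nd.waitall()

WaitAllCallback() would then be passed in the callbacks list alongside model_checkpoint and reduce_lr.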

For now, removing the validation data and the model checkpoint eliminates the memory leak:

history = model1.fit(x_train, y_train,
                     epochs=epochs,
                     batch_size=batch_size,
                     callbacks=[reduce_lr],
                     verbose=2)

I still need to investigate how to re-enable them; that work is being tracked separately.
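If validation and checkpointing are still needed in the meantime, one option (a sketch only; I have not verified that it avoids the leak) is to drive the epoch loop manually, so evaluation and saving happen outside fit(). Note that stateful callbacks such as ReduceLROnPlateau reset between fit() calls, so learning-rate scheduling would also have to be done by hand here:

best_iou = -float('inf')
for epoch in range(epochs):
    # One epoch of training without any validation or checkpoint callbacks.
    model1.fit(x_train, y_train,
               epochs=1,
               batch_size=batch_size,
               verbose=2)
    # Manual validation pass; returns [loss, my_iou_metric_mx].
    val_loss, val_iou = model1.evaluate(x_valid, y_valid,
                                        batch_size=batch_size,
                                        verbose=0)
    print('epoch %d: val_loss %.4f, val_iou %.4f' % (epoch + 1, val_loss, val_iou))
    # Manual "save_best_only" checkpointing.
    if val_iou > best_iou:
        best_iou = val_iou
        model1.save(save_model_name)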

First time MXNet user here.

I am trying to deploy a Seq2Seq model using Sockeye, which is built on MXNet. At inference time the model is fairly quick, except when it sees a piece of text longer than any it has seen before (presumably because a larger bucket has to be allocated the first time that length appears).

I worked around this by warming up the model at startup with texts spanning the range of lengths I expect at inference time, from short to long. That did the job, but now model initialization takes too long for my use case.
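For reference, the warm-up looks roughly like this; translator and its translate() method are placeholders for the actual Sockeye inference handle, not real Sockeye API names:

# Hypothetical sketch: `translator.translate` stands in for whatever
# inference call is used. Each dummy input forces allocation for its
# length bucket up front instead of during live traffic.
warmup_lengths = [8, 16, 32, 64, 128, 256]
for n in warmup_lengths:
    dummy_text = ' '.join(['x'] * n)  # n-token dummy input
    translator.translate(dummy_text)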

I am wondering how I can initialize the model once with a custom set of texts, serialize the resulting cache, and ship it alongside the model to deployment. Then I would only need to (a) load the architecture, (b) load the weights, and (c) load the cache.

Has anybody tried anything like this? Do you have an opinion on whether it would work (i.e., reduce the model initialization time)? Or any other ideas to accomplish the same?

I am open to suggestions.

If you have tried something like this, or have come across cases where it has been tried, can you point me to them?