Hi,
I am training a U-Net with keras-mxnet using a fixed batch size, but GPU memory keeps increasing until the GPU runs out. Here is the relevant code:
def BatchActivate(x):
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    return x

def convolution_block(x, filters, size, strides=(1, 1), padding='same', activation=True):
    x = Conv2D(filters, size, strides=strides, padding=padding)(x)
    if activation:
        x = BatchActivate(x)
    return x

def residual_block(blockInput, num_filters=16, batch_activate=False):
    x = BatchActivate(blockInput)
    x = convolution_block(x, num_filters, (3, 3))
    x = convolution_block(x, num_filters, (3, 3), activation=False)
    x = Add()([x, blockInput])
    if batch_activate:
        x = BatchActivate(x)
    return x
# Build model
def build_model(input_layer, start_neurons, DropoutRatio=0.5):
    # 101 -> 50
    conv1 = Conv2D(start_neurons * 1, (3, 3), activation=None, padding="same")(input_layer)
    conv1 = residual_block(conv1, start_neurons * 1)
    conv1 = residual_block(conv1, start_neurons * 1, True)
    print('conv1', conv1)
    pool1 = MaxPooling2D((2, 2))(conv1)
    pool1 = Dropout(DropoutRatio / 2)(pool1)
    print('pool1', pool1)

    # 50 -> 25
    conv2 = Conv2D(start_neurons * 2, (3, 3), activation=None, padding="same")(pool1)
    conv2 = residual_block(conv2, start_neurons * 2)
    conv2 = residual_block(conv2, start_neurons * 2, True)
    print('conv2', conv2)
    pool2 = MaxPooling2D((2, 2))(conv2)
    pool2 = Dropout(DropoutRatio)(pool2)
    print('pool2', pool2)

    # 25 -> 12
    conv3 = Conv2D(start_neurons * 4, (3, 3), activation=None, padding="same")(pool2)
    conv3 = residual_block(conv3, start_neurons * 4)
    conv3 = residual_block(conv3, start_neurons * 4, True)
    print('conv3', conv3)
    pool3 = MaxPooling2D((2, 2))(conv3)
    pool3 = Dropout(DropoutRatio)(pool3)
    print('pool3', pool3)

    # 12 -> 6
    conv4 = Conv2D(start_neurons * 8, (3, 3), activation=None, padding="same")(pool3)
    conv4 = residual_block(conv4, start_neurons * 8)
    conv4 = residual_block(conv4, start_neurons * 8, True)
    print('conv4', conv4)
    pool4 = MaxPooling2D((2, 2))(conv4)
    pool4 = Dropout(DropoutRatio)(pool4)
    print('pool4', pool4)

    # Middle
    convm = Conv2D(start_neurons * 16, (3, 3), activation=None, padding="same")(pool4)
    convm = residual_block(convm, start_neurons * 16)
    convm = residual_block(convm, start_neurons * 16, True)
    print('convm', convm)

    # 6 -> 12
    deconv4 = Conv2DTranspose(start_neurons * 8, (3, 3), strides=(2, 2), padding="same")(convm)
    uconv4 = concatenate([deconv4, conv4])
    uconv4 = Dropout(DropoutRatio)(uconv4)
    print('deconv4', deconv4)
    print('uconv4', uconv4)
    uconv4 = Conv2D(start_neurons * 8, (3, 3), activation=None, padding="same")(uconv4)
    uconv4 = residual_block(uconv4, start_neurons * 8)
    uconv4 = residual_block(uconv4, start_neurons * 8, True)
    print('uconv4', uconv4)

    # 12 -> 25
    # deconv3 = Conv2DTranspose(start_neurons * 4, (3, 3), strides=(2, 2), padding="same")(uconv4)
    deconv3 = Conv2DTranspose(start_neurons * 4, (3, 3), strides=(2, 2), padding="valid")(uconv4)
    uconv3 = concatenate([deconv3, conv3])
    uconv3 = Dropout(DropoutRatio)(uconv3)
    print('deconv3', deconv3)
    print('uconv3', uconv3)
    uconv3 = Conv2D(start_neurons * 4, (3, 3), activation=None, padding="same")(uconv3)
    uconv3 = residual_block(uconv3, start_neurons * 4)
    uconv3 = residual_block(uconv3, start_neurons * 4, True)
    print('uconv3', uconv3)

    # 25 -> 50
    deconv2 = Conv2DTranspose(start_neurons * 2, (3, 3), strides=(2, 2), padding="same")(uconv3)
    uconv2 = concatenate([deconv2, conv2])
    print('deconv2', deconv2)
    print('uconv2', uconv2)
    uconv2 = Dropout(DropoutRatio)(uconv2)
    uconv2 = Conv2D(start_neurons * 2, (3, 3), activation=None, padding="same")(uconv2)
    uconv2 = residual_block(uconv2, start_neurons * 2)
    uconv2 = residual_block(uconv2, start_neurons * 2, True)
    print('uconv2', uconv2)

    # 50 -> 101
    # deconv1 = Conv2DTranspose(start_neurons * 1, (3, 3), strides=(2, 2), padding="same")(uconv2)
    deconv1 = Conv2DTranspose(start_neurons * 1, (3, 3), strides=(2, 2), padding="valid")(uconv2)
    uconv1 = concatenate([deconv1, conv1])
    print('deconv1', deconv1)
    print('uconv1', uconv1)
    uconv1 = Dropout(DropoutRatio)(uconv1)
    uconv1 = Conv2D(start_neurons * 1, (3, 3), activation=None, padding="same")(uconv1)
    uconv1 = residual_block(uconv1, start_neurons * 1)
    uconv1 = residual_block(uconv1, start_neurons * 1, True)
    print('uconv1', uconv1)

    # uconv1 = Dropout(DropoutRatio/2)(uconv1)
    # output_layer = Conv2D(1, (1,1), padding="same", activation="sigmoid")(uconv1)
    output_layer_noActi = Conv2D(1, (1, 1), padding="same", activation=None)(uconv1)
    output_layer = Activation('sigmoid')(output_layer_noActi)
    return output_layer
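A note on the mixed paddings above: with stride 2 and a 3x3 kernel, Conv2DTranspose with "same" padding doubles the spatial size, while "valid" produces (in - 1) * 2 + 3. That is why the decoder alternates between the two to invert the odd 101 -> 50 -> 25 -> 12 -> 6 pyramid. A quick sanity check:

# Conv2DTranspose output length for stride 2, kernel 3:
#   "same":  out = in * 2
#   "valid": out = (in - 1) * 2 + 3
def deconv_len(n, padding):
    return n * 2 if padding == "same" else (n - 1) * 2 + 3

assert deconv_len(6, "same") == 12     # convm  -> deconv4
assert deconv_len(12, "valid") == 25   # uconv4 -> deconv3
assert deconv_len(25, "same") == 50    # uconv3 -> deconv2
assert deconv_len(50, "valid") == 101  # uconv2 -> deconv1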
# get_iou_vector_mx is the custom symbolic backend function shown at the end of this post
def my_iou_metric_mx(label, pred):
    return K.get_iou_vector_mx(label, pred > 0.5)

def my_iou_metric_mx_0(label, pred):
    return K.get_iou_vector_mx(label, pred > 0)
# Data augmentation: double the training set with horizontal flips
x_train = np.append(x_train, [np.fliplr(x) for x in x_train], axis=0)
y_train = np.append(y_train, [np.fliplr(x) for x in y_train], axis=0)
print(x_train.shape)
print(y_valid.shape)
# Model
input_layer = Input((img_size_target, img_size_target, 1))
output_layer = build_model(input_layer, 16, 0.5)
model1 = Model(input_layer, output_layer)

c = optimizers.Adam(lr=0.01)
model1.compile(loss="binary_crossentropy", optimizer=c, metrics=[my_iou_metric_mx])

model_checkpoint = ModelCheckpoint(save_model_name, monitor='my_iou_metric_mx',
                                   mode='max', save_best_only=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='my_iou_metric_mx',
                              mode='max', factor=0.5, patience=5, min_lr=0.0001, verbose=1)

epochs = 50
batch_size = 16  #32 #128 #32
history = model1.fit(x_train, y_train,
                     validation_data=(x_valid, y_valid),
                     epochs=epochs,
                     batch_size=batch_size,
                     callbacks=[model_checkpoint, reduce_lr],
                     verbose=2)
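To quantify how fast memory grows, I could log GPU usage after every epoch with a small callback. This is only a diagnostic sketch; it assumes nvidia-smi is on the PATH and a single GPU:

import subprocess
from keras.callbacks import Callback

class GPUMemoryLogger(Callback):
    """Print the GPU memory in use after each epoch (diagnostic only)."""
    def on_epoch_end(self, epoch, logs=None):
        # Query used memory in MiB from nvidia-smi's CSV output.
        used = subprocess.check_output(
            ['nvidia-smi', '--query-gpu=memory.used',
             '--format=csv,noheader,nounits']).decode().strip()
        print('epoch %d: GPU memory used (MiB): %s' % (epoch + 1, used))

Adding it to callbacks=[...] would show whether usage climbs smoothly every batch or jumps at epoch boundaries.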
Training produces the following output:
Train on 6400 samples, validate on 800 samples
Epoch 1/50
/home/cory/incubator-mxnet/python/mxnet/module/bucketing_module.py:408: UserWarning: Optimizer created manually outside Module but rescale_grad is not normalized to 1.0/batch_size/num_workers (1.0 vs. 0.0625). Is this intended?
force_init=force_init)
- 2573s - loss: 0.4227 - my_iou_metric_mx: 0.3474 - val_loss: 3.2197 - val_my_iou_metric_mx: 0.3900
Epoch 00001: my_iou_metric_mx improved from -inf to 0.34744, saving model to Unet_resnet_v5.model
Epoch 2/50
- 2577s - loss: 0.3169 - my_iou_metric_mx: 0.4870 - val_loss: 0.3166 - val_my_iou_metric_mx: 0.5243
Epoch 00002: my_iou_metric_mx improved from 0.34744 to 0.48705, saving model to Unet_resnet_v5.model
Epoch 3/50
- 2579s - loss: 0.2803 - my_iou_metric_mx: 0.5382 - val_loss: 0.7266 - val_my_iou_metric_mx: 0.3900
Epoch 00003: my_iou_metric_mx improved from 0.48705 to 0.53817, saving model to Unet_resnet_v5.model
Epoch 4/50
- 2577s - loss: 0.2660 - my_iou_metric_mx: 0.5613 - val_loss: 0.5543 - val_my_iou_metric_mx: 0.3865
Epoch 00004: my_iou_metric_mx improved from 0.53817 to 0.56134, saving model to Unet_resnet_v5.model
Epoch 5/50
- 2575s - loss: 0.2541 - my_iou_metric_mx: 0.5754 - val_loss: 0.2699 - val_my_iou_metric_mx: 0.5891
Epoch 00005: my_iou_metric_mx improved from 0.56134 to 0.57538, saving model to Unet_resnet_v5.model
Epoch 6/50
- 2584s - loss: 0.2410 - my_iou_metric_mx: 0.5763 - val_loss: 0.2421 - val_my_iou_metric_mx: 0.5548
Epoch 00006: my_iou_metric_mx improved from 0.57538 to 0.57631, saving model to Unet_resnet_v5.model
Epoch 7/50
- 2624s - loss: 0.2294 - my_iou_metric_mx: 0.5831 - val_loss: 0.2470 - val_my_iou_metric_mx: 0.6124
Epoch 00007: my_iou_metric_mx improved from 0.57631 to 0.58308, saving model to Unet_resnet_v5.model
Epoch 8/50
- 2613s - loss: 0.2275 - my_iou_metric_mx: 0.5865 - val_loss: 0.5051 - val_my_iou_metric_mx: 0.5329
Epoch 00008: my_iou_metric_mx improved from 0.58308 to 0.58645, saving model to Unet_resnet_v5.model
Epoch 9/50
- 2611s - loss: 0.2152 - my_iou_metric_mx: 0.5994 - val_loss: 0.2608 - val_my_iou_metric_mx: 0.5950
Epoch 00009: my_iou_metric_mx improved from 0.58645 to 0.59938, saving model to Unet_resnet_v5.model
Epoch 10/50
- 2613s - loss: 0.2141 - my_iou_metric_mx: 0.6039 - val_loss: 0.2263 - val_my_iou_metric_mx: 0.6195
Epoch 00010: my_iou_metric_mx improved from 0.59938 to 0.60387, saving model to Unet_resnet_v5.model
Epoch 11/50
---------------------------------------------------------------------------
MXNetError Traceback (most recent call last)
<ipython-input-17-509e1d31b127> in <module>()
12 batch_size=batch_size,
13 callbacks=[model_checkpoint,reduce_lr],
---> 14 verbose=2)
/home/cory/keras-apache-mxnet/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
1051 initial_epoch=initial_epoch,
1052 steps_per_epoch=steps_per_epoch,
-> 1053 validation_steps=validation_steps)
1054
1055 def evaluate(self, x=None, y=None,
/home/cory/keras-apache-mxnet/keras/engine/training_arrays.py in fit_loop(model, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)
197 ins_batch[i] = ins_batch[i].toarray()
198
--> 199 outs = f(ins_batch)
200 outs = to_list(outs)
201 for l, o in zip(out_labels, outs):
/home/cory/keras-apache-mxnet/keras/backend/mxnet_backend.py in train_function(inputs)
5454 self._weights_dirty = True
5455 outs = self._module.get_outputs()[:self._ntrain]
-> 5456 return [x.asnumpy().mean() for x in outs]
5457
5458 self.train_function = train_function
/home/cory/keras-apache-mxnet/keras/backend/mxnet_backend.py in <listcomp>(.0)
5454 self._weights_dirty = True
5455 outs = self._module.get_outputs()[:self._ntrain]
-> 5456 return [x.asnumpy().mean() for x in outs]
5457
5458 self.train_function = train_function
/home/cory/incubator-mxnet/python/mxnet/ndarray/ndarray.py in asnumpy(self)
1978 self.handle,
1979 data.ctypes.data_as(ctypes.c_void_p),
-> 1980 ctypes.c_size_t(data.size)))
1981 return data
1982
/home/cory/incubator-mxnet/python/mxnet/base.py in check_call(ret)
251 """
252 if ret != 0:
--> 253 raise MXNetError(py_str(_LIB.MXGetLastError()))
254
255
MXNetError: [15:34:24] src/storage/./pooled_storage_manager.h:119: cudaMalloc failed: out of memory
Stack trace returned 10 entries:
[bt] (0) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(dmlc::StackTrace[abi:cxx11]()+0x5b) [0x7fb4a2ee029b]
[bt] (1) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x28) [0x7fb4a2ee0e08]
[bt] (2) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::storage::GPUPooledStorageManager::Alloc(mxnet::Storage::Handle*)+0x159) [0x7fb4a5e5f9f9]
[bt] (3) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::StorageImpl::Alloc(mxnet::Storage::Handle*)+0x5d) [0x7fb4a5e61a1d]
[bt] (4) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::NDArray::CheckAndAlloc() const+0x238) [0x7fb4a2fa3d68]
[bt] (5) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(+0x36f67df) [0x7fb4a58cf7df]
[bt] (6) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) const+0x1d5) [0x7fb4a58eb995]
[bt] (7) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(std::_Function_handler<void (mxnet::RunContext), mxnet::engine::ThreadedEngine::BulkAppend(std::function<void (mxnet::RunContext)>, mxnet::Context, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&)::{lambda(mxnet::RunContext)#1}>::_M_invoke(std::_Any_data const&, mxnet::RunContext&&)+0x68) [0x7fb4a5e4c9b8]
[bt] (8) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(std::_Function_handler<void (mxnet::RunContext), mxnet::engine::ThreadedEngine::BulkAppend(std::function<void (mxnet::RunContext)>, mxnet::Context, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&)::{lambda(mxnet::RunContext)#1}>::_M_invoke(std::_Any_data const&, mxnet::RunContext&&)+0x47) [0x7fb4a5e4c997]
[bt] (9) /home/cory/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(std::_Function_handler<void (mxnet::RunContext), mxnet::engine::ThreadedEngine::BulkAppend(std::function<void (mxnet::RunContext)>, mxnet::Context, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&)::{lambda(mxnet::RunContext)#1}>::_M_invoke(std::_Any_data const&, mxnet::RunContext&&)+0x47) [0x7fb4a5e4c997]
I have seen related reports of similar behavior. Based on these, I think the most probable root cause is a failure to synchronize inside a loop, such as in:
@keras_mxnet_symbol
def get_iou_vector_mx(A, B):
    """
    Symbolic equivalent of:
        t = K.greater(A, 0)
        p = K.greater(B, 0)
        intersection = logical_and(t, p)
        union = logical_or(t, p)
        iou = (np.sum(intersection > 0) + 1e-10) / (np.sum(union > 0) + 1e-10)
    """
    def step(data, _):
        zero = mx.sym.zeros((1,))
        t = mx.sym.broadcast_greater(data[0], zero)
        p = mx.sym.broadcast_greater(data[1], zero)
        intersection = mx.sym.broadcast_logical_and(t, p)
        union = mx.sym.broadcast_logical_or(t, p)
        iou = (mx.sym.sum(mx.sym.broadcast_greater(intersection, zero)) + mx.sym.full((1,), 1e-10)) / \
              (mx.sym.sum(mx.sym.broadcast_greater(union, zero)) + mx.sym.full((1,), 1e-10))
        # Fraction of thresholds in 0.5, 0.55, ..., 0.95 that the IoU clears
        thresholds = mx.sym.arange(0.5, 1, 0.05)
        return mx.sym.mean(mx.sym.broadcast_greater(iou, thresholds)), _

    data = [A.symbol, B.symbol]
    output, _ = mx.sym.contrib.foreach(step, data, [])
    return KerasSymbol(mx.sym.mean(output))
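A cheap experiment to test the synchronization theory (a sketch, purely diagnostic; it will slow training) would be to force the engine to drain after every batch:

import mxnet as mx
from keras.callbacks import Callback

class SyncEveryBatch(Callback):
    """Block until all queued MXNet engine work has finished."""
    def on_batch_end(self, batch, logs=None):
        # waitall() drains the async engine, so buffers held by
        # queued-but-unexecuted operations cannot pile up across batches.
        mx.nd.waitall()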
However, explicit synchronization only seems to be an issue with the imperative (NDArray) API, not the symbolic graph. In other words, I'm not sure whether this “memory leak” comes from something I've done or from MXNet internals.
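Either way, to rule out my own code, I could drop the custom metric from metrics= entirely and compute IoU out-of-graph after each epoch. A minimal numpy sketch of the docstring's intent (names and the exact threshold averaging are my assumptions, untested):

import numpy as np

def iou_numpy(label, pred, threshold=0.5):
    # Out-of-graph version of the metric above, for an A/B comparison:
    # if memory stops growing without the symbolic metric, the foreach
    # loop is the likely culprit.
    t = label > 0
    p = pred > threshold
    intersection = np.logical_and(t, p).sum()
    union = np.logical_or(t, p).sum()
    iou = (intersection + 1e-10) / (union + 1e-10)
    # Fraction of thresholds in 0.5, 0.55, ..., 0.95 that the IoU clears
    thresholds = np.arange(0.5, 1.0, 0.05)
    return np.mean(iou > thresholds)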