Save_parameters cudaMalloc failed

ubuntu + cuda9.2 + mxnet1.3.0

My model is a Gluon SSD-like model (VGG16 base + 6 CNN layers), trained on the VOC2012 dataset with batch size 32.
Loading all the data onto the GPU and training through an epoch always works,
but save_parameters() fails with “cudaMalloc failed: out of memory”.
I put a loop before the call to save_parameters() and used nvidia-smi to check the GPU: it still has 3 GB of memory free!

Could you provide a minimal reproducible example, please? I will try to debug this case for you. Also, how much GPU memory do you have in total?

Thanks,

My model is defined as follows:

ssd_net.py

def gen_sequential( num,block ):
	'''
	Build `num` empty nn.HybridSequential containers attached to `block`.

	Every container is registered on `block` through register_child(), in
	creation order, and the containers are returned as a list in that same
	order.
	'''
	containers = [ nn.HybridSequential() for _ in range(num) ]

	#the original author flags this explicit registration as important;
	#presumably blocks kept in a plain list are not registered automatically - verify
	for child in containers:
		block.register_child( child )

	return containers

class MiniSSD( HybridBlock ):
	'''
	SSD-like detection network made of six feature stages.

	Stage 0 reuses layers 0-20 of the pretrained VGG16 `features` block.
	Stages 1-5 are Conv2D + BatchNorm + ReLU + pooling stacks configured by
	`conv_params`. Every stage has a class-prediction head (`cls_layers`) and
	a box-location head (`loc_layers`), each made of BatchNorm + 3x3 Conv2D.
	'''
	def __init__(self,  ctx, classes=20, **kwargs):
		'''
		ctx      : context the pretrained VGG16 weights are loaded on
		classes  : number of object classes; num_class is classes+1
		**kwargs : forwarded to HybridBlock.__init__
		'''
		super(MiniSSD, self).__init__(**kwargs)

		#assume the input image is 3*310*310
		#kept as a float so the step ratios below are true divisions on Python 2 as well
		image_size = 310.0

		self.num_layers = 6
		#object classes + 1 (21 for the 20 VOC classes)
		self.num_class = classes+1

		with self.name_scope():
			#conv2d layers to generate feature map
			self.conv_layers = gen_sequential( self.num_layers,self )
			self.conv_params = {
				#index 0 is a placeholder: the first stage is copied from VGG16 and ignores these parameters
				"channels" 		: [-1,1024,512,256,256,256],
				"kernel_size" 	: [-1,3,3,3,3,3],
				"strides"		: [-1,1,1,1,1,1],
				"padding"       : [-1,1,2,1,2,1],
				"pool_type"      : [-1,"maxpool","maxpool","maxpool","maxpool","avgpool"],
				"pool_size"       : [-1,2,2,2,2,2]
			}

			#conv2d layers to inference object class
			self.cls_layers = gen_sequential( self.num_layers,self )

			#conv2d layers to inference object location
			self.loc_layers = gen_sequential( self.num_layers,self )

			#prior boxes generated from feature map
			self.prior_boxes = []

			#stage 0: load the pretrained vgg16 model on ctx (flagged as important by the original author)
			#and copy its feature layers 0-20 into the first conv container
			base_model = vision.get_model("vgg16",pretrained=True,ctx=ctx)
			for layer in base_model.features[0:21]:
				self.conv_layers[0].add(layer)

			#stages 1-5: conv2d + batchnorm + relu + maxpool/avgpool
			for i in range(1,self.num_layers):
				stage = self.conv_layers[i]
				stage.add(
					nn.Conv2D(
						channels=self.conv_params['channels'][i],
						kernel_size=self.conv_params['kernel_size'][i],
						strides=self.conv_params['strides'][i],
						padding=self.conv_params['padding'][i]
					)
				)
				stage.add( nn.BatchNorm() )
				stage.add( nn.Activation("relu") )
				if self.conv_params['pool_type'][i] == "maxpool" :
					stage.add( nn.MaxPool2D( self.conv_params['pool_size'][i] ) )
				else:
					stage.add( nn.AvgPool2D( self.conv_params['pool_size'][i] ) )

		#prior boxes per feature-map cell for every stage;
		#equals len(sizes)+len(ratios)-1 of the matching prior_box_params entries below
		self.boxes_per_cell = [4,6,6,6,6,4]

		#construct class predict layers: batchnorm + conv2d
		for i in range(self.num_layers):
			self.cls_layers[i].add( nn.BatchNorm() )
			self.cls_layers[i].add( nn.Conv2D(channels=self.num_class*self.boxes_per_cell[i], kernel_size=3, strides=1, padding=1) )

		#construct location predict layers: batchnorm + conv2d
		#in every layer,the output is :
		#4*channels*feature_map_width*feature_map_height == 4*default_boxes_on_map(generated by MultiBoxPrior)
		for i in range(self.num_layers):
			self.loc_layers[i].add( nn.BatchNorm() )
			self.loc_layers[i].add( nn.Conv2D(channels=4*self.boxes_per_cell[i], kernel_size=3, strides=1, padding=1) )

		#prior parameters for generate anchor boxes when forward ( in train/predict )
		#ratios and steps use float operands: plain 1/3 or 8/310 would be 0 under Python 2 integer division
		self.prior_box_params = {
			"sizes" : [ [0.2,0.3],[0.3,0.4],[0.4,0.5],[0.5,0.6],[0.6,0.7],[0.8,0.9] ],
			"ratios" : [ [0.5,1,2],[1,2,0.5,3,1.0/3],[1,2,0.5,3,1.0/3],[1,2,0.5,3,1.0/3],[1,2,0.5,3,1.0/3],[1,2,0.5] ],
			"clip" :[ False,False,False,False,False,False ],
			"steps" : [ [px/image_size,px/image_size] for px in (8,16,31,62,103,310) ]
		}

def hybrid_forward(self, F, x, *args, **kwargs):
		'''
		Run the feature stages and collect class, location and prior-box outputs.

		NOTE(review): this takes `self` and reads attributes assigned in
		MiniSSD.__init__, so it is presumably a MiniSSD method whose `def`
		line lost its indentation when pasted.

		F is the operator namespace handed in by the caller; x is the input
		batch. The batch size is read from x.shape, so x must expose a shape
		(NDArray input, network not hybridized).

		Returns the list [cls_pred, loc_pred, default_box]:
		cls_pred    : per-stage class outputs flattened, concatenated on axis 1,
		              reshaped with shape=(0,-1,num_class), transposed with axes=(0,2,1)
		loc_pred    : per-stage location outputs flattened and concatenated on axis 1
		default_box : per-stage prior boxes concatenated on axis 1, reshaped with shape=(0,-1,4)
		'''
		cls_pred_list = []
		loc_pred_list = []
		default_box_list = []

		batch_size = x.shape[0]

		for i in range(self.num_layers):
			#get feature map
			x = self.conv_layers[i](x)

			#get class data: move channels last, then flatten per sample
			cls_pred_list.append( F.flatten( F.transpose( self.cls_layers[i](x), axes=(0,2,3,1) ) ) )

			#get location data: same layout change as the class data
			loc_pred_list.append( F.flatten( F.transpose( self.loc_layers[i](x), axes=(0,2,3,1) ) ) )

			#MultiBoxPrior derives the boxes from the feature map height/width only and
			#returns them with a leading dimension of 1, so it is called once per stage
			#instead of once per sample on a sliced tensor
			anchors = F.contrib.MultiBoxPrior(
				x,
				sizes=self.prior_box_params["sizes"][i],
				ratios=self.prior_box_params["ratios"][i],
				#clip=self.prior_box_params["clip"][i],
				steps=self.prior_box_params["steps"][i],
				offsets=[0.5,0.5]
			)

			#repeat along the batch axis so default_box keeps one copy per sample
			default_box_list.append( F.tile( anchors, reps=(batch_size,1,1) ) )

		#merge the per-stage outputs
		cls_pred = F.concat( *cls_pred_list,dim=1 )
		cls_pred = F.reshape( cls_pred, shape=(0,-1,self.num_class) )
		cls_pred = F.transpose( cls_pred, axes=(0,2,1) )

		loc_pred = F.concat( *loc_pred_list,dim=1 )

		default_box = F.concat( *default_box_list, dim=1 )
		default_box = F.reshape( default_box, shape=(0,-1,4) )

		return [cls_pred,loc_pred,default_box]

def initialize(self, init=None, ctx=None, verbose=False, force_reinit=False):
		'''
		Initialize only the parameters that hold no data yet.

		Parameters that already hold data (the pretrained vgg16 layers) keep
		their values and are only moved to `ctx`. All other parameters are
		initialized on `ctx`.

		init         : initializer for parameters that declare none of their own;
		               None keeps the Parameter.initialize default
		ctx          : target context, passed to reset_ctx() / Parameter.initialize()
		verbose      : accepted for signature compatibility, not used
		force_reinit : accepted for signature compatibility, not used; parameters
		               without data are always initialized with force_reinit=True
		'''
		for param in self.collect_params().values():
			if param._data is not None:
				#already has values: keep them, only change the context
				param.reset_ctx(ctx)
			elif init is None:
				param.initialize(ctx=ctx, force_reinit=True)
			else:
				#pass the initializer as default_init, not init: Parameter.initialize
				#documents init as overriding the parameter's own initializer, while
				#default_init is only used when the parameter declares none
				param.initialize(default_init=init, ctx=ctx, force_reinit=True)
-------------------------------------------------------------------------------------------
ssd_train.py:
#Training script: trains MiniSSD on an image list with ImageDetIter and
#saves the parameters once training has finished.
import os

ctx = mx.gpu()

#other list files that were tried:
#"./trainimage-bak.lst", "./VOC-train-all.lst", "./VOC-mini.lst", "./trainimage.lst"
train_lst_file = "./VOC-train-1000.lst"

model_prefix = "minissd"
#directory the parameter files are written to
param_dir = "./param/"

batch_size = 32

train_iter = mx.image.ImageDetIter(
        batch_size = batch_size,
        data_shape = (3,310,310),
        path_imglist  = train_lst_file,
        shuffle = True)

num_samples = 1000

#optional profiling:
#profiler.set_config(profile_all=True,
#                    filename='chrome_tracing_profile.json',
#                    continuous_dump=True,
#                    aggregate_stats=True)
#profiler.set_state('run')

net = MiniSSD( ctx=ctx )
#net.initialize( init=mx.initializer.Xavier(magnitude=2),ctx=ctx )
net.initialize( init=mx.initializer.Normal(),ctx=ctx )
#net.initialize( ctx=ctx )


HuberLoss = mx.gluon.loss.HuberLoss( rho=1 )
SoftmaxCrossEntropyLoss = mx.gluon.loss.SoftmaxCrossEntropyLoss(  axis=-1, sparse_label=True, from_logits=False, weight=None, batch_axis=0 )

#weight of the class loss relative to the location loss
loss_weight = 1.0

trainer = mx.gluon.Trainer(net.collect_params(),
              'sgd', {'learning_rate': 0.001, 'wd': 5e-4})

num_epochs = 1
print_batches = 1
period = 1

#net.hybridize()

#create the output directory before training, so that a missing directory
#cannot make save_parameters() fail after all the training work is done
if not os.path.isdir(param_dir):
	os.makedirs(param_dir)

for epoch in range(num_epochs):

	train_iter.reset()

	train_loss, n = 0.0, 0.0

	tic = time.time()
	for i,batch in enumerate( train_iter ):

		batch_data = batch.data[0].as_in_context(ctx)
		batch_label = batch.label[0].as_in_context(ctx)

		with autograd.record():

			cls_pred,loc_pred,default_box = net( batch_data )

			#outputs are, in order: location target, location mask, class target
			loc_target,loc_target_mask,cls_target = nd.contrib.MultiBoxTarget(
					default_box[0:1,:,:], #Anchors are shared across batches
					batch_label,
					cls_pred,
					overlap_threshold = 0.5,
					ignore_label=-1,
					negative_mining_ratio=3,
					minimum_negative_samples = 0,
					negative_mining_thresh=.5,
					variances=(0.1, 0.1, 0.2, 0.2)
				)

			#the mask is applied to both prediction and target before the location loss
			loc_loss = HuberLoss( loc_target_mask*loc_pred, loc_target_mask*loc_target )
			cls_loss = SoftmaxCrossEntropyLoss( nd.transpose(cls_pred,axes=(0,2,1)), cls_target )
			loss = loc_loss + cls_loss*loss_weight

		loss.backward()
		trainer.step(batch_size)

		#sum on the device and fetch one scalar instead of one asscalar() call per sample
		train_loss += loss.sum().asscalar()

		n += batch_size

		if (i+1) % print_batches == 0:
			print(
                "Epoch [%d]. Batch [%d]. Loss [%f]. Time %.1f sec" %
                (epoch, n, train_loss/n, time.time() - tic))

	#per-epoch checkpoint (disabled):
	#profiler.set_state('stop')
	#if (epoch + 1) % period == 0:
	#	net.save_parameters("./param/" + model_prefix + "-{}.params".format(epoch + 1))
	#	print("Saved checkpoint to {}-{}.params".format(model_prefix, epoch + 1))


net.save_parameters(param_dir + model_prefix + "-last.params")

My 1080 Ti has 11 GB of memory.

Thanks!

Additional information:
I prepared a new, smaller lst file that keeps only 5 object classes (about 1447 images)
and changed num_classes from 20 (the VOC default) to 5.
Checking with nvidia-smi, the memory usage during the training epoch is much less than the usage with the standard 20 classes, about 7-8 GB,
but when save_parameters() is called the problem remains.

I played with your example, and I think that the problem is not in the save_parameters() method. I believe that some kind of error happens during training, and it surfaces in a strange way while the parameters are being saved.

I have updated your code and used a manually created split of 64 records (I took Main/trainval.txt and deleted everything except for the first 64 records). I used your training procedure, but with gluoncv data preprocessing. I had to reduce the batch size to 1 so that it would not complain about data mismatches. The code below has no problem saving the parameters, as long as the “param” directory exists, and my memory consumption does not go above 3.5 GB, leaving plenty of room for a larger batch size:

# Training script variant: same training procedure as the original script, but
# the data comes from gluoncv's VOCDetection + SSDDefaultTrainTransform.
import os
import time

import mxnet as mx
from gluoncv import data
from gluoncv.data.transforms.presets.ssd import SSDDefaultTrainTransform
from mxnet import autograd, nd
from mxnet.gluon.data import DataLoader

from ssd_net import MiniSSD

ctx = mx.gpu()

# other list files that were tried:
# "./trainimage-bak.lst", "./VOC-train-all.lst", "./VOC-mini.lst", "./trainimage.lst"
train_lst_file = "./VOC-train-1000.lst"

model_prefix = "minissd"
# directory the parameter files are written to
param_dir = "./param/"

batch_size = 1
height = 310
width = 310

net = MiniSSD(ctx=ctx)
# net.initialize( init=mx.initializer.Xavier(magnitude=2),ctx=ctx )
net.initialize(init=mx.initializer.Normal(), ctx=ctx)
# net.initialize( ctx=ctx )

# sample is a manually created split - you won't have it
# 1. go to ~/.mxnet/datasets/voc/VOC2007/ImageSets/Main directory
# 2. copy trainval.txt and remove everything except for the first 64 records
voc_dataset = data.VOCDetection(splits=((2007, 'sample'),))
transformed_dataset = voc_dataset.transform(
    SSDDefaultTrainTransform(height, width))
train_iter = DataLoader(transformed_dataset, batch_size, shuffle=False, last_batch='discard')
# previous data source:
# train_iter = mx.image.ImageDetIter(
#     batch_size=batch_size,
#     data_shape=(3, 310, 310),
#     path_imglist=train_lst_file,
#     shuffle=True)

num_samples = 64

# optional profiling:
# profiler.set_config(profile_all=True,
#                     filename='chrome_tracing_profile.json',
#                     continuous_dump=True,
#                     aggregate_stats=True)
# profiler.set_state('run')

HuberLoss = mx.gluon.loss.HuberLoss(rho=1)
SoftmaxCrossEntropyLoss = mx.gluon.loss.SoftmaxCrossEntropyLoss(axis=-1, sparse_label=True,
                                                                from_logits=False, weight=None,
                                                                batch_axis=0)

# weight of the class loss relative to the location loss
loss_weight = 1.0

trainer = mx.gluon.Trainer(net.collect_params(),
                           'sgd', {'learning_rate': 0.001, 'wd': 5e-4})

num_epochs = 1
print_batches = 1
period = 1

# net.hybridize()

# create the output directory before training, so that a missing directory
# cannot make save_parameters() fail after all the training work is done
if not os.path.isdir(param_dir):
    os.makedirs(param_dir)

for epoch in range(num_epochs):
    train_loss, n = 0.0, 0.0

    tic = time.time()
    for i, (batch_data, batch_label) in enumerate(train_iter):

        batch_data = batch_data.as_in_context(ctx)
        batch_label = batch_label.as_in_context(ctx)

        with autograd.record():
            cls_pred, loc_pred, default_box = net(batch_data)

            # outputs are, in order: location target, location mask, class target
            loc_target, loc_target_mask, cls_target = nd.contrib.MultiBoxTarget(
                default_box[0:1, :, :],  # Anchors are shared across batches
                batch_label,
                cls_pred,
                overlap_threshold=0.5,
                ignore_label=-1,
                negative_mining_ratio=3,
                minimum_negative_samples=0,
                negative_mining_thresh=.5,
                variances=(0.1, 0.1, 0.2, 0.2)
            )

            # the mask is applied to both prediction and target before the location loss
            loc_loss = HuberLoss(loc_target_mask * loc_pred, loc_target_mask * loc_target)
            cls_loss = SoftmaxCrossEntropyLoss(nd.transpose(cls_pred, axes=(0, 2, 1)), cls_target)
            loss = loc_loss + cls_loss * loss_weight

        loss.backward()
        trainer.step(batch_size)

        # sum on the device and fetch one scalar instead of one asscalar() call per sample
        train_loss += loss.sum().asscalar()

        n += batch_size

        if (i + 1) % print_batches == 0:
            print(
                "Epoch [%d]. Batch [%d]. Loss [%f]. Time %.1f sec" %
                (epoch, n, train_loss / n, time.time() - tic))

    # per-epoch checkpoint (disabled):
    # profiler.set_state('stop')
    # if (epoch + 1) % period == 0:
    #     net.save_parameters("./param/" + model_prefix + "-{}.params".format(epoch + 1))
    #     print("Saved checkpoint to {}-{}.params".format(model_prefix, epoch + 1))

net.save_parameters(param_dir + model_prefix + "-last.params")

I recommend going through this tutorial to see how training is done using gluoncv. Try to use their code to save the parameters; if that works, then there is some sort of error in your training code.