No learning happens on the Titanic dataset using CSVIter

I am trying to follow this example: http://gluon.mxnet.io/chapter02_supervised-learning/logistic-regression-gluon.html

I am using the Titanic dataset and have converted the categorical fields to numeric values. The code is below:

from __future__ import print_function
import mxnet as mx
from mxnet import nd, autograd, gluon
import matplotlib.pyplot as plt
import logging
mx.random.seed(1)

data_ctx = mx.cpu()
model_ctx = mx.cpu()

def logistic(z):
    return 1. / (1. + nd.exp(-z))

def log_loss(output, y):
    yhat = logistic(output)
    return -nd.nansum(y * nd.log(yhat) + (1 - y) * nd.log(1 - yhat))

batch_size = 10
num_fea = 5
train_data = mx.io.CSVIter(data_csv="./titanic_train.csv", data_shape=(num_fea,),
                           label_csv="./titanic_train_lb.csv", label_shape=(1,),
                           batch_size=batch_size, round_batch=False)

net = gluon.nn.Dense(1)
net.collect_params().initialize(mx.init.Normal(sigma=0.01), ctx=model_ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01})

epochs = 30
loss_sequence = []

for e in range(epochs):
    cumulative_loss = 0
    for i, batch in enumerate(train_data):
        data = batch.data[0].as_in_context(model_ctx)
        label = batch.label[0].as_in_context(model_ctx)
        with autograd.record():
            output = net(data)
            loss = log_loss(output, label)
        loss.backward()
        trainer.step(batch_size)
        cumulative_loss += nd.sum(loss).asscalar()
    print("Epoch %s, loss: %s" % (e, cumulative_loss ))
    loss_sequence.append(cumulative_loss)

The output shows that no training happens after the first epoch:

Epoch 0, loss: 6015.84739304
Epoch 1, loss: 0
Epoch 2, loss: 0
Epoch 3, loss: 0
Epoch 4, loss: 0
Epoch 5, loss: 0
Epoch 6, loss: 0
Epoch 7, loss: 0
Epoch 8, loss: 0
Epoch 9, loss: 0
Epoch 10, loss: 0
Epoch 11, loss: 0

My data samples:
titanic_train.csv (the age column is normalized to (0, 1]):
3,1,0.275,1,0
1,0,0.475,1,0
3,0,0.325,0,0
1,0,0.4375,1,0
3,1,0.4375,0,0
3,1,0.35,0,0
1,1,0.675,0,0
3,1,0.025,3,1
3,0,0.3375,0,2
2,0,0.175,1,0

titanic_train_lb.csv:
0
1
1
1
0
0
0
0
1
1
1
1
0
0

What am I doing wrong? A bug has also been opened:

Hi,

I don't know the particular dataset you are using (and have no time to dig into it), but here are a couple of thoughts that may help:

  1. You have a very shallow model with just one linear layer (net = gluon.nn.Dense(1)); there is very little such a model can learn. Try increasing the depth of your network.
  2. It seems that you end up with a lot of nans/infs, which is why nd.nansum prints zero. In other words, I suspect an overflow in the nd.exp call. I would look carefully at all of the output values of the network; in your case, print output and yhat on screen, and this will give you a clear picture of where the numbers go wrong.
  3. Try to use the built-in functions, which are robust to overflow/underflow; what you need is gluon.loss.SigmoidBCELoss (see the sketch below).

Hope this helps.
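For points 1 and 3, a minimal sketch of what that could look like (the hidden size of 16 is an arbitrary choice; the rest of the training loop stays the same):

# Point 1: a slightly deeper network; the last layer still emits one raw logit.
net = gluon.nn.HybridSequential()
net.add(gluon.nn.Dense(16, activation='relu'),
        gluon.nn.Dense(1))
net.collect_params().initialize(mx.init.Normal(sigma=0.01), ctx=model_ctx)

# Point 3: the built-in loss applies the sigmoid internally (from_sigmoid=False
# is the default) in a numerically stable form, avoiding overflow in nd.exp.
loss_fn = gluon.loss.SigmoidBCELoss(from_sigmoid=False)

with autograd.record():
    output = net(data)             # raw logits, shape (batch_size, 1)
    loss = loss_fn(output, label)  # per-sample loss, shape (batch_size,)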

@feevos - Thanks!

I think this issue relates to how I use a DataIterator vs. NDArrays. For example, on the same dataset a model that loads the data as NDArrays works just fine and converges:

def process_data(file, num_features):
    with open(file) as f:
        raw_data = f.read()
    
    lines = raw_data.splitlines()
    num_examples = len(lines)
    print("File: %s, Samples: %s" %(file,num_examples))
    X = nd.zeros((num_examples, num_features), ctx=data_ctx)
    Y = nd.zeros((num_examples, 1), ctx=data_ctx)
    for i, line in enumerate(lines):
        tokens = line.split(",")
        Y[i] = int(tokens[0]) # label is first column
        for j, token in enumerate(tokens[1:]):
            X[i, j] = float(token)
            
        if (i%10000 == 0):
            print("Step: %i, Loaded 10000 lines..." % i)
    return X, Y

And the training code (this is adapted from the MXNet tutorials):

Xtrain, Ytrain = process_data("./data/train.csv", num_inputs)
Xtest, Ytest = process_data("./data/test.csv", num_inputs)
batch_size = 32
train_data = gluon.data.DataLoader(gluon.data.ArrayDataset(Xtrain, Ytrain), batch_size=batch_size, shuffle=True)
test_data = gluon.data.DataLoader(gluon.data.ArrayDataset(Xtest, Ytest), batch_size=batch_size, shuffle=False)
net = gluon.nn.Dense(num_outputs)
net.collect_params().initialize(mx.init.Normal(sigma=.1), ctx=model_ctx)
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
def evaluate_accuracy(data_iterator, net):
    acc = mx.metric.Accuracy()
    for i, (data, label) in enumerate(data_iterator):
        data = data.as_in_context(model_ctx).reshape((-1,num_inputs))
        label = label.as_in_context(model_ctx)
        output = net(data)
        predictions = nd.argmax(output, axis=1)
        acc.update(preds=predictions, labels=label)
    return acc.get()[1]
lr = 0.05
epochs = 200
moving_loss = 0.
num_examples = 312797
loss_sequence = []
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
for e in range(epochs):
    cumulative_loss = 0
#    for i, batch in enumerate(train_data):
#        data = batch.data[0].as_in_context(model_ctx) #.reshape((-1,num_inputs))
#        label = batch.label[0].as_in_context(model_ctx)
    for i, (data, label) in enumerate(train_data):
        data = data.as_in_context(model_ctx).reshape((-1,num_inputs))
        label = label.as_in_context(model_ctx)
        with autograd.record():
            output = net(data)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        trainer.step(batch_size)
        cumulative_loss += nd.sum(loss).asscalar()

    test_accuracy = evaluate_accuracy(test_data, net)
    train_accuracy = evaluate_accuracy(train_data, net)
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" % (e, round(cumulative_loss/num_examples,6), round(train_accuracy,4), round(test_accuracy,4)))
    loss_sequence.append(cumulative_loss)

It works! Only when the data are loaded via CSVIter and fed through the commented-out section of the training loop does it converge to “nan”. Am I using the data iterators incorrectly?
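One difference between the two loaders that is worth checking: mx.io.CSVIter is an mx.io.DataIter, and a DataIter is not rewound automatically when you loop over it a second time; it has to be reset explicitly with reset() before each pass, otherwise every epoch after the first yields no batches at all (which would match the zero losses in the first post). gluon.data.DataLoader, by contrast, starts from the beginning on every for loop. A minimal sketch of the CSVIter loop with the reset added:

for e in range(epochs):
    cumulative_loss = 0
    train_data.reset()  # rewind the DataIter before each pass over the data
    for i, batch in enumerate(train_data):
        data = batch.data[0].as_in_context(model_ctx)
        label = batch.label[0].as_in_context(model_ctx)
        with autograd.record():
            output = net(data)
            loss = log_loss(output, label)
        loss.backward()
        trainer.step(batch_size)
        cumulative_loss += nd.sum(loss).asscalar()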