RNN explanation and input data format

I am working on a simple RNN with Gluon that predicts a sine-like curve / function.
My problem is that I don't know what is wrong with my code or, to put it more precisely, how I should define my input data structure. In my opinion the input should be BATCH x WINDOW x FEATURES, but the documentation says SEQUENCE x BATCH x FEATURES, and I don't know how that is supposed to work.

Also, there is a lack of (simple) RNN tutorials compared to CNNs etc. Maybe someone is willing to write some very short RNN / LSTM tutorials. That would be great!

My code:

def dataset():
    """Build sliding-window inputs and sine targets for the RNN.

    Reads the module-level ``smpl_length``, ``seq_length`` and ``steps`` to
    create an evenly spaced axis, cuts it into overlapping windows of
    ``seq_length`` points, and pairs every window (except the last) with the
    sine of the first point of the following window.

    Returns:
        A tuple ``(inputs, targets)`` of MXNet NDArrays. ``inputs`` is
        reshaped to ``(num_windows, -1, 1)`` and ``targets`` to ``(-1, 1)``.

    NOTE(review): the inputs hold the raw axis values (not their sine), and
    the target is the sine of the point one step after the window's start
    rather than the point following the window's end -- confirm this is the
    intended prediction target.
    """
    axis = np.arange(0, (smpl_length + seq_length) * steps, steps)

    # One window per start index that still leaves room past its end.
    windows = np.array([
        axis[start:start + seq_length]
        for start in range(len(axis))
        if start + seq_length < len(axis)
    ])

    # Target for window k is derived from the first point of window k + 1,
    # so the final window has no target and is dropped.
    targets = np.sin(windows[1:, 0])
    inputs = windows[:-1]

    features = nd.array(inputs).reshape(len(inputs), -1, 1)
    return features, nd.array(targets).reshape(-1, 1)

X, y = dataset()

# Chronological split: the first `train_length` samples train the model, the
# remainder is held out. Shuffling is disabled to keep the temporal order.
train_dataset = gluon.data.dataset.ArrayDataset(X[:train_length], y[:train_length])
train_dataloader = gluon.data.DataLoader(train_dataset, batch_size, shuffle=False)

test_dataset = gluon.data.dataset.ArrayDataset(X[train_length:], y[train_length:])
test_dataloader = gluon.data.DataLoader(test_dataset, batch_size, shuffle=False)
ctx = mx.cpu()

net = gluon.nn.Sequential()
with net.name_scope():
    # layout='NTC' means (batch, time, channels), which is how dataset()
    # shapes X and how the DataLoader batches it (along the first axis).
    # The default 'TNC' layout would require transposing every batch.
    net.add(gluon.rnn.RNN(5, 1, layout='NTC'))
    # The RNN emits a 5-unit hidden vector for every time step, but each
    # label is a single value of shape (batch, 1). Dense flattens the
    # per-step outputs and projects them to one prediction per sample so
    # the loss can compare output and label.
    net.add(gluon.nn.Dense(1))

net.initialize(mx.init.Xavier(), ctx=ctx)

l2 = gluon.loss.L2Loss()
trainer = gluon.Trainer(net.collect_params(), 'adam')

def evaluate_mse(data_iterator, net):
    """Return the mean per-batch loss of ``net`` over ``data_iterator``.

    Each batch is moved to the module-level ``ctx``, passed through ``net``
    and scored with the module-level ``l2`` loss. The result is the plain
    average of the batch means (every batch counts equally, regardless of
    its size); it is ``0.`` when the iterator yields nothing.

    NOTE(review): ``l2`` is a gluon ``L2Loss``, which reportedly includes a
    1/2 factor, so the value may be half the conventional MSE -- confirm.

    Args:
        data_iterator: iterable yielding ``(data, label)`` batches.
        net: callable model applied to each data batch.

    Returns:
        The running average of the per-batch mean losses.
    """
    running_mean = 0.

    for batch_idx, (data, label) in enumerate(data_iterator):
        inputs = data.as_in_context(ctx)
        targets = label.as_in_context(ctx)
        batch_loss = nd.mean(l2(net(inputs), targets)).asscalar()
        # Incremental mean: fold this batch into the average so far.
        running_mean = (running_mean * batch_idx + batch_loss) / (batch_idx + 1)

    return running_mean
# NOTE(review): loss_history is never appended to in this snippet --
# presumably it is filled or consumed elsewhere; confirm.
loss_history = []
smoothing_constant = .01

for e in range(epochs):
    for i, (data, label) in enumerate(train_dataloader):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)

        # Record the forward pass so gradients can be derived from it.
        with autograd.record():
            output = net(data)
            loss = l2(output, label)

        # Backpropagate and apply the optimizer update. Without these two
        # calls the parameters never change and the loss stays flat.
        loss.backward()
        # Normalize by the actual batch size so a smaller final batch is
        # scaled correctly.
        trainer.step(data.shape[0])

        current_loss = nd.mean(loss).asscalar()
        # Exponential moving average of the batch loss, seeded with the very
        # first batch of the first epoch.
        moving_loss = (current_loss if ((i == 0) and (e == 0)) else (1 - smoothing_constant) * moving_loss + smoothing_constant * current_loss)

    train_mse = evaluate_mse(train_dataloader, net)
    test_mse = evaluate_mse(test_dataloader, net)
    print('Epoch %s. Train MSE: %s, Test MSE: %s' % (e, train_mse, test_mse))