Difficulties with recurrent network

Hi,
I’m new in the topic of neural network and even newer in recurrent network.
I tried tried to implement a very basic recurrent network, just to see if it can learn a noisy sinus.

Input data:
A, T, epsilon, l =10, 10, 1, 3000
data = Anp.sin(2np.pi*np.arange(l)/T) + np.random.normal(0, epsilon, l)
train_features = data[:2000]

Recurrent network model:

class RnnTest(nn.HybridBlock):
    def __init__(self, size_hidden, **kwargs):
        super().__init__(**kwargs)
        with self.name_scope():
            self.size_hidden = size_hidden
            self.rnn = rnn.RNN(self.size_hidden)
            self.dense = nn.Dense(1)
    
    def hybrid_forward(self, F, x, state):
        out, state = self.rnn(x, state)
        return self.dense(out.reshape(-1, self.size_hidden)), state
    
    def begin_state(self, *args, **kwargs):
        return self.rnn.begin_state(*args, **kwargs)

I used the L2Loss from gluon for the training:

size_hidden=16
num_epochs, lr, clip, batch_size, num_steps = 500, 0.01, 10, 32, 10
model = RnnTest(size_hidden)
model.initialize(init.Normal(sigma=0.1), force_reinit=True)
model.hybridize()
adam_optimizer = mx.optimizer.Adam(clip_gradient=clip, learning_rate=lr, wd=0)
trainer = gluon.Trainer(model.collect_params(), optimizer=adam_optimizer)

From that, I iterate over the data (the complete code is just below).
But in the end, the loss oscillates around a constant value, corresponding to the mean of the data (or zero, it is the same number as I used sinusoidal data).
I spent a lot of time playing with the hyperparameters, the optimizer (I first used sgd).

Here the complete code (I used some convenient functions described in the book d2l.ai):

import mxnet as mx
from mxnet import gluon, init, np, npx, autograd
npx.set_np()
from mxnet.gluon import nn, rnn
from d2l import mxnet as d2l
import random

A, T, epsilon, l =10, 10, 1, 3000
data = A*np.sin(2*np.pi*np.arange(l)/T) + np.random.normal(0, epsilon, l)
train_features = data[:2000]

def create_random_iter(data, batch_size, num_steps):
    offset = random.randint(0, num_steps)
    data = data[offset:]
    num_examples = ((len(data) - 1) // num_steps)
    example_indices = list(range(0, num_examples * num_steps, num_steps))
    random.shuffle(example_indices)

    num_batches = num_examples // batch_size
    for i in range(0, batch_size * num_batches, batch_size):
        batch_indices = example_indices[i:(i+batch_size)]
        X = np.stack([data[j: j+num_steps] for j in batch_indices]).T
        Y = np.stack([data[j+1: j+1+num_steps] for j in batch_indices]).T
        yield X.reshape(X.shape[0], batch_size, 1), Y.reshape(Y.shape[0], batch_size, 1)

        
class RnnTest(nn.HybridBlock):
    def __init__(self, size_hidden, **kwargs):
        super().__init__(**kwargs)
        with self.name_scope():
            self.size_hidden = size_hidden
            self.rnn = rnn.RNN(self.size_hidden)
            self.dense = nn.Dense(1)

    def hybrid_forward(self, F, x, state):
        out, state = self.rnn(x, state)
        return self.dense(out.reshape(-1, self.size_hidden)), state

    def begin_state(self, *args, **kwargs):
        return self.rnn.begin_state(*args, **kwargs)

    
def train_epoch(model, train_features, loss, trainer, batch_size=8, num_steps=30, use_random_iter=True):
    train_iter = create_random_iter(data, batch_size=batch_size, num_steps=num_steps)
    metric = d2l.Accumulator(2)
    for X, Y in train_iter:
        state = model.begin_state(batch_size=batch_size)
        y = Y.T.reshape(-1, 1)
        with autograd.record():
            pred, state = model(X, state)
            l = loss(pred, y)
        l.backward()
        trainer.step(batch_size=batch_size)
        metric.add(float(l.mean()) * Y.shape[0], Y.shape[0])
    return metric[0]/metric[1]

def train(model, train_features, loss, trainer, batch_size=4, num_steps=10):
    animator = d2l.Animator(xlabel='epoch', ylabel='loss', legend=['train'], xlim=[1, num_epochs])
    for i in range(num_epochs):
        err=train_epoch(model, train_features, loss, trainer, batch_size=batch_size, num_steps=num_steps)
        if i % 10 == 0:
            print(err)
            animator.add(i+1, [err])
    print('final loss:', err)

size_hidden=16
num_epochs, lr, clip, batch_size, num_steps = 100, 0.01, 10, 32, 10
model = RnnTest(size_hidden)
model.initialize(init.Normal(sigma=0.1), force_reinit=True)
model.hybridize()
adam_optimizer = mx.optimizer.Adam(clip_gradient=clip, learning_rate=lr, wd=0)
trainer = gluon.Trainer(model.collect_params(), optimizer=adam_optimizer)
loss = gluon.loss.L2Loss()


train(model, train_features, loss, trainer, batch_size=batch_size, num_steps=num_steps)