I have the following decoder, which is fed captions of varying lengths (captions within a batch are padded to the same length).
The problem is that on the first forward pass the linear layer locks in the input dimensions, which then don't match those of subsequent batches, throwing:
Error in operator dense1_fwd: Shape inconsistent, Provided = [9956,9728], inferred shape=(9956,9216)
What are possible solutions to overcome this issue, preferably without changing much in the network design?
from mxnet.gluon import HybridBlock
from mxnet.gluon.nn import Dense, Embedding
from mxnet.gluon.rnn import LSTM
from overrides import overrides


class DecoderRNN(HybridBlock):
    @overrides
    def hybrid_forward(self, F, features, captions, *args, **kwargs):
        # Embed the (padded) captions and prepend the image features as the first time step.
        embeddings = self.embed(captions)
        features_and_embeddings = F.concat(features.expand_dims(axis=1), embeddings, dim=1)
        # Run the whole sequence through the LSTM, then project each output to the vocabulary.
        output = self.lstm(features_and_embeddings)
        result = self.linear(output)
        return result

    def __init__(self, embed_size: int, hidden_size: int, vocab_size: int, num_layers: int):
        super(DecoderRNN, self).__init__()
        self.embed = Embedding(input_dim=vocab_size, output_dim=embed_size)
        self.lstm = LSTM(hidden_size, num_layers, layout="NTC")
        self.linear = Dense(vocab_size, flatten=True)

    @overrides
    def initialize(self, **kwargs):
        self.embed.initialize(**kwargs)
        self.lstm.initialize(**kwargs)
        self.linear.initialize(**kwargs)
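
To illustrate what I believe is happening (assuming hidden_size=512, which is only my guess from the numbers in the error: 19 * 512 = 9728 and 18 * 512 = 9216), here is a minimal repro of the locking behaviour with a standalone Dense layer; the batch size and sequence lengths are arbitrary:

from mxnet import nd
from mxnet.gluon.nn import Dense

dense = Dense(9956, flatten=True)   # vocab_size taken from the error message
dense.initialize()

first = nd.zeros((4, 19, 512))      # first batch, padded to 19 time steps
dense(first)                        # in_units inferred as 19 * 512 = 9728

second = nd.zeros((4, 18, 512))     # next batch, padded to 18 time steps
dense(second)                       # shape-inconsistency error: flattened width is now 18 * 512 = 9216

So with flatten=True the Dense layer flattens everything except the batch axis, and the inferred in_units ends up depending on the padded caption length of whichever batch happens to come first.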