Updating to the nightly fixed the error - thanks! But unfortunately I think I’m still doing something wrong, since hybridizing doesn’t seem to improve network performance as much as the examples suggest it should.

Here’s a minimal reproducible example:

```
import time
import mxnet as mx
from mxnet import nd, gluon
class GGNN(gluon.HybridBlock):
def __init__(self, hidden_size, **kwargs):
super().__init__(**kwargs)
self.hidden_size = hidden_size
with self.name_scope():
self.message_fxns = []
for t in range(10):
layer = gluon.nn.Dense(self.hidden_size, in_units=self.hidden_size)
self.register_child(layer)
self.message_fxns.append(layer)
self.hidden_gru = gluon.rnn.GRUCell(self.hidden_size, input_size=self.hidden_size)
def compute_messages(self, F, values, edges):
summed_msgs = []
for adj_mat, msg_fxn in zip(edges, self.message_fxns):
passed_msgs = msg_fxn(values)
summed_msgs.append(F.dot(adj_mat, passed_msgs))
values = F.sum(F.stack(*summed_msgs), axis=0)
return values
def update_values(self, F, values, messages):
values, _ = self.hidden_gru(messages, [values])
return values
def hybrid_forward(self, F, values, *args, **kwargs):
edges = args[0]
for t in range(8):
messages = self.compute_messages(F, values, edges)
values = self.update_values(F, values, messages)
return values
def time_model(model, ctx):
    """Time 10 forward passes of `model` with fresh random inputs on `ctx`.

    Returns wall-clock seconds, including input generation. `nd.waitall()`
    forces all asynchronously scheduled work to finish before the timer
    stops — without it the measurement would be meaningless.
    """
    num_nodes = 10000
    num_edge_types = 10
    tic = time.time()
    for b in range(10):
        # Use the model's own hidden size rather than relying on a
        # module-level global `hidden_size` defined only under __main__.
        values = nd.random.normal(shape=(num_nodes, model.hidden_size),
                                  ctx=ctx)
        edges = [nd.random.normal(shape=(num_nodes, num_nodes), ctx=ctx)
                 for _ in range(num_edge_types)]
        model(values, edges)
    nd.waitall()
    return time.time() - tic
if __name__ == '__main__':
    # Build the model, warm it up imperatively, then compare timings
    # before and after hybridize().
    hidden_size = 64
    ctx = mx.gpu(0)
    model = GGNN(hidden_size)
    model.collect_params().initialize(ctx=ctx)
    print('Without hybridize: {}'.format(time_model(model, ctx)))
    model.hybridize()
    print('With hybridize: {}'.format(time_model(model, ctx)))
```

This returns

"Without hybridize: 3.265143871307373

With hybridize: 3.2073848247528076"

for me.

Is it possible that the mxnet scheduler is somehow not performing the for loop in compute_messages in parallel?