As described in the docs, I used the same example for distributed training with AutoGluon, but it still fails with the errors below. Can you help?
code:
@ag.args(
    batch_size=64,
    lr=ag.Real(1e-4, 1e-1, log=True),
    momentum=0.9,
    wd=ag.Real(1e-4, 5e-4),
)
def train_fn(args, reporter):
    """Toy training function that reports a synthetic accuracy per epoch.

    Prints the task id and learning rate taken from ``args``, then for each
    of 10 epochs draws a random value and calls ``reporter`` with the epoch
    index and the resulting accuracy.

    Args:
        args: Object exposing at least ``task_id`` and ``lr``.
        reporter: Callable invoked as ``reporter(epoch=..., accuracy=...)``.
    """
    print('task_id: {}, lr: {}'.format(args.task_id, args.lr))
    for e in range(10):
        # Synthetic metric: 1 - 1.8 ** (-u), with u drawn uniformly from [e, 2e).
        top1_accuracy = 1 - np.power(1.8, -np.random.uniform(e, 2*e))
        reporter(epoch=e, accuracy=top1_accuracy)
# Addresses handed to the scheduler via dist_ip_addrs (masked in this post).
extra_node_ips = ['x.x.x.y', 'x.x.x.z', 'x.x.x.x', 'x.x.x.v']

# 'accuracy' and 'epoch' match the keyword names train_fn passes to reporter().
scheduler = ag.scheduler.FIFOScheduler(
    train_fn,
    resource={'num_cpus': 2, 'num_gpus': 0},
    num_trials=20,
    reward_attr='accuracy',
    time_attr='epoch',
    dist_ip_addrs=extra_node_ips)
print(scheduler)
(The IP addresses in the code above and in the logs below are masked.)
errors:
Starting Experiments
Num of Finished Tasks is 0
Num of Pending Tasks is 20
25%|██████████████████████████████▎ | 5/20 [00:00<00:00, 44.57it/s]task_id: 0, lr: 0.0031622777
Exception in thread Thread-37:
Traceback (most recent call last):
File “/apps/miniconda3/envs/automl/lib/python3.7/threading.py”, line 926, in _bootstrap_inner
self.run()
File “/apps/miniconda3/envs/automl/lib/python3.7/threading.py”, line 870, in run
self._target(*self._args, **self._kwargs)
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/autogluon/scheduler/fifo.py”, line 261, in _run_reporter
reward=last_result[self._reward_attr], **last_result)
KeyError: ‘accuracy’
Exception in thread Thread-36:
Traceback (most recent call last):
File “/apps/miniconda3/envs/automl/lib/python3.7/threading.py”, line 926, in _bootstrap_inner
self.run()
File “/apps/miniconda3/envs/automl/lib/python3.7/threading.py”, line 870, in run
self._target(*self._args, **self._kwargs)
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/autogluon/scheduler/fifo.py”, line 261, in _run_reporter
reward=last_result[self._reward_attr], **last_result)
KeyError: ‘accuracy’
Finished Task with config: {‘lr’: 0.0031622777, ‘wd’: 0.0003} and reward: 0.9997084979745358
[ worker x.x.x.x ] : task_id: 1, lr: 0.096898.v081482162
[ worker x.x.x.x ] : remote process exited with exit status -1
[ scheduler x.x.x.y:8703 ] : remote process exited with exit status -1
[ worker x.x.x.y ] : remote process exited with exit status -1
[ worker x.x.x.z ] : Traceback (most recent call last):
[ worker x.x.x.z ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/worker.py”, line 2340, in _maybe_deserialize_task
[ worker x.x.x.z ] : function, args, kwargs = _deserialize(*self.tasks[key])
[ worker x.x.x.z ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/worker.py”, line 3132, in _deserialize
[ worker x.x.x.z ] : args = pickle.loads(args)
[ worker x.x.x.z ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/protocol/pickle.py”, line 59, in loads
[ worker x.x.x.z ] : return pickle.loads(x)
[ worker x.x.x.z ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/queues.py”, line 266, in setstate
[ worker x.x.x.z ] : client = get_client(address)
[ worker x.x.x.z ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/worker.py”, line .v01, in get_client
[ worker x.x.x.z ] : return Client(address, timeout=timeout)
[ worker x.x.x.z ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/client.py”, line 721, in init
[ worker x.x.x.z ] : self.start(timeout=timeout)
[ worker x.x.x.z ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/client.py”, line 886, in start
[ worker x.x.x.z ] : sync(self.loop, self._start, **kwargs)
[ worker x.x.x.z ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/utils.py”, line 333, in sync
[ worker x.x.x.z ] : raise exc.with_traceback(tb)
[ worker x.x.x.z ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/utils.py”, line 317, in f
[ worker x.x.x.z ] : result[0] = yield future
[ worker x.x.x.z ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/tornado/gen.py”, line 735, in run
[ worker x.x.x.z ] : value = future.result()
[ worker x.x.x.z ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/client.py”, line 979, in _start
[ worker x.x.x.z ] : await self._ensure_connected(timeout=timeout)
[ worker x.x.x.z ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/client.py”, line 1036, in _ensure_connected
[ worker x.x.x.z ] : connection_args=self.connection_args,
[ worker x.x.x.z ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/comm/core.py”, line 217, in connect
[ worker x.x.x.z ] : quiet_exceptions=EnvironmentError,
[ worker x.x.x.z ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/comm/inproc.py”, line 286, in connect
[ worker x.x.x.z ] : listener = self.manager.get_listener_for(address)
[ worker x.x.x.z ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/comm/inproc.py”, line 52, in get_listener_for
[ worker x.x.x.z ] : self.validate_address(addr)
[ worker x.x.x.z ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/comm/inproc.py”, line 66, in validate_address
[ worker x.x.x.z ] : % (addr, self.ip, os.getpid())
[ worker x.x.x.z ] : ValueError: inproc address ‘x.x.x.u/646/1’ does not match host (‘x.x.x.z’) or pid (4429)
[ worker x.x.x.z ] : remote process exited with exit status -1
[ worker x.x.x…v ] : Traceback (most recent call last):
[ worker x.x.x…v ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/worker.py”, line 2340, in _maybe_deserialize_task
[ worker x.x.x…v ] : function, args, kwargs = _deserialize(*self.tasks[key])
[ worker x.x.x…v ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/worker.py”, line 3132, in _deserialize
[ worker x.x.x…v ] : args = pickle.loads(args)
[ worker x.x.x…v ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/protocol/pickle.py”, line 59, in loads
[ worker x.x.x…v ] : return pickle.loads(x)
[ worker x.x.x…v ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/queues.py”, line 266, in setstate
[ worker x.x.x…v ] : client = get_client(address)
[ worker x.x.x…v ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/worker.py”, line .v01, in get_client
[ worker x.x.x…v ] : return Client(address, timeout=timeout)
[ worker x.x.x…v ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/client.py”, line 721, in init
[ worker x.x.x…v ] : self.start(timeout=timeout)
[ worker x.x.x…v ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/client.py”, line 886, in start
[ worker x.x.x…v ] : sync(self.loop, self._start, **kwargs)
[ worker x.x.x…v ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/utils.py”, line 333, in sync
[ worker x.x.x…v ] : raise exc.with_traceback(tb)
[ worker x.x.x…v ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/utils.py”, line 317, in f
[ worker x.x.x…v ] : result[0] = yield future
[ worker x.x.x…v ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/tornado/gen.py”, line 735, in run
[ worker x.x.x…v ] : value = future.result()
[ worker x.x.x…v ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/client.py”, line 979, in _start
[ worker x.x.x…v ] : await self._ensure_connected(timeout=timeout)
[ worker x.x.x…v ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/client.py”, line 1036, in _ensure_connected
[ worker x.x.x…v ] : connection_args=self.connection_args,
[ worker x.x.x…v ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/comm/core.py”, line 217, in connect
[ worker x.x.x…v ] : quiet_exceptions=EnvironmentError,
[ worker x.x.x…v ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/comm/inproc.py”, line 286, in connect
[ worker x.x.x…v ] : listener = self.manager.get_listener_for(address)
[ worker x.x.x…v ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/comm/inproc.py”, line 52, in get_listener_for
[ worker x.x.x…v ] : self.validate_address(addr)
[ worker x.x.x…v ] : File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/comm/inproc.py”, line 66, in validate_address
[ worker x.x.x…v ] : % (addr, self.ip, os.getpid())
[ worker x.x.x…v ] : ValueError: inproc address ‘x.x.x.u/646/1’ does not match host (‘x.x.x.v’) or pid (14687)
Traceback (most recent call last):
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/comm/core.py”, line 221, in connect
_raise(error)
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/comm/core.py”, line 204, in _raise
raise IOError(msg)
OSError: Timed out trying to connect to ‘tcp://x.x.x.z:8704’ after 10 s: in <distributed.comm.tcp.TCPConnector object at 0x7f701c0d7a10>: ConnectionRefusedError: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File “autogluonStartSimpleDist.py”, line 39, in
scheduler.run()
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/autogluon/scheduler/fifo.py”, line 146, in run
self.schedule_next()
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/autogluon/scheduler/fifo.py”, line 177, in schedule_next
self.add_job(task, **extra_kwargs)
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/autogluon/scheduler/fifo.py”, line 218, in add_job
reporter = DistStatusReporter(remote=task.resources.node)
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/autogluon/scheduler/reporter.py”, line 1.v, in init
self._queue = Queue(client=remote)
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/queues.py”, line 184, in init
maxsize=maxsize,
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/utils.py”, line 333, in sync
raise exc.with_traceback(tb)
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/utils.py”, line 317, in f
result[0] = yield future
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/tornado/gen.py”, line 735, in run
value = future.result()
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/core.py”, line 735, in send_recv_from_rpc
comm = await self.pool.connect(self.addr)
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/core.py”, line 869, in connect
connection_args=self.connection_args,
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/comm/core.py”, line 2.v, in connect
_raise(error)
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/comm/core.py”, line 204, in _raise
raise IOError(msg)
OSError: Timed out trying to connect to ‘tcp://x.x.x.z:8704’ after 10 s: Timed out trying to connect to ‘tcp://x.x.x.z:8704’ after 10 s: in <distributed.comm.tcp.TCPConnector object at 0x7f701c0d7a10>: ConnectionRefusedError: [Errno 111] Connection refused
distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.utils - ERROR -
Traceback (most recent call last):
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/utils.py”, line 662, in log_errors
yield
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/client.py”, line 1290, in _close
await gen.with_timeout(timedelta(seconds=2), list(coroutines))
concurrent.futures._base.CancelledError
distributed.utils - ERROR -
Traceback (most recent call last):
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/utils.py”, line 662, in log_errors
yield
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/client.py”, line 1019, in _reconnect
await self._close()
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/client.py”, line 1290, in _close
await gen.with_timeout(timedelta(seconds=2), list(coroutines))
concurrent.futures._base.CancelledError
distributed.utils - ERROR -
Traceback (most recent call last):
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/utils.py”, line 662, in log_errors
yield
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/client.py”, line 1290, in _close
await gen.with_timeout(timedelta(seconds=2), list(coroutines))
concurrent.futures._base.CancelledError
distributed.utils - ERROR -
Traceback (most recent call last):
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/utils.py”, line 662, in log_errors
yield
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/client.py”, line 1019, in _reconnect
await self._close()
File “/apps/miniconda3/envs/automl/lib/python3.7/site-packages/distributed/client.py”, line 1290, in _close
await gen.with_timeout(timedelta(seconds=2), list(coroutines))
concurrent.futures._base.CancelledError
25%|██████████████████████████████▎ | 5/20 [00:20<00:00, 44.57it/s]