Log monitor reports FileNotFoundError

Ray version: ray 2.2.0, Python 3.10
Ray cluster: one CPU node as the head node and two GPU nodes as compute nodes.

A job was submitted in the usual way. While it was running a Pathways-style large-model training workload, the Ray log monitor failed with the following error:

The log monitor on node SH-IDC1-10-140-1-176 failed with the following error:
Traceback (most recent call last):
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 520, in <module>
    log_monitor.run()
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 436, in run
    anything_published = self.check_log_files_and_publish_updates()
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 339, in check_log_files_and_publish_updates
    file_info.reopen_if_necessary()
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 79, in reopen_if_necessary
    new_inode = os.stat(self.filename).st_ino
FileNotFoundError: [Errno 2] No such file or directory: '/mnt/cache/share_data/huangting.p/ray/session_2023-02-20_17-46-26_882787_1059999/logs/worker-65110474ef8dc70b12a4df98c6ab9c9a671758254b23109f56b79aa6-01000000-1061751.err'

The log monitor on node SH-IDC1-10-140-0-76 failed with the following error:
Traceback (most recent call last):
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 520, in <module>
    log_monitor.run()
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 436, in run
    anything_published = self.check_log_files_and_publish_updates()
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 339, in check_log_files_and_publish_updates
    file_info.reopen_if_necessary()
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 79, in reopen_if_necessary
    new_inode = os.stat(self.filename).st_ino
FileNotFoundError: [Errno 2] No such file or directory: '/mnt/cache/share_data/huangting.p/ray/session_2023-02-20_17-46-26_882787_1059999/logs/worker-65110474ef8dc70b12a4df98c6ab9c9a671758254b23109f56b79aa6-01000000-1061751.err'

Log channel is reconnecting. Logs produced while the connection was down can be found on the head node of the cluster in `ray_client_server_[port].out`
2023-02-20 21:06:21,034 WARNING dataclient.py:396 -- Encountered connection issues in the data channel. Attempting to reconnect.
2023-02-20 21:06:51,245 WARNING dataclient.py:403 -- Failed to reconnect the data channel

....

Traceback (most recent call last):
  File "/mnt/petrelfs/huangting.p/chenxun/mu2net_uniscale/main.py", line 501, in <module>
    main()
  File "/mnt/petrelfs/huangting.p/chenxun/mu2net_uniscale/main.py", line 417, in main
    task_iter(task, generation_size, pop, generation_id, mu2net.utils.loop_id, exp_config, args)
  File "/mnt/petrelfs/huangting.p/chenxun/mu2net_uniscale/main.py", line 156, in task_iter
    paths_return_info = train_loop(paths, task, ds_hparams, exp_config, train_infos)
  File "/mnt/petrelfs/huangting.p/chenxun/mu2net_uniscale/mu2net/train.py", line 567, in train_loop
    if single_train_path.status in [PathStatus.RUNNING, PathStatus.PENDING]:
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/uniscale/path/ddppath.py", line 407, in status
    self.__handle_status_from_pending_to_running()
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/uniscale/path/ddppath.py", line 354, in __handle_status_from_pending_to_running
    path_status = rpc._get(self._inner_actor.get_path_status.remote(self.uuid))
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/uniscale/_internal/rpc.py", line 271, in _get
    raise UniscaleInternalError(traceback.format_exc()) from None
uniscale.exceptions.UniscaleInternalError: Traceback (most recent call last):
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/uniscale/_internal/rpc.py", line 269, in _get
    return ray.get(object_refs, timeout=timeout)
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
    return getattr(ray, func.__name__)(*args, **kwargs)
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/util/client/api.py", line 42, in get
    return self.worker.get(vals, timeout=timeout)
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/util/client/worker.py", line 434, in get
    res = self._get(to_get, op_timeout)
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/util/client/worker.py", line 451, in _get
    req = ray_client_pb2.GetRequest(ids=[r.id for r in ref], timeout=timeout)
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/util/client/worker.py", line 451, in <listcomp>
    req = ray_client_pb2.GetRequest(ids=[r.id for r in ref], timeout=timeout)
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/util/client/common.py", line 136, in id
    return self.binary()
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/util/client/common.py", line 115, in binary
    self._wait_for_id()
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/util/client/common.py", line 192, in _wait_for_id
    self._set_id(self._id_future.result(timeout=timeout))
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/concurrent/futures/_base.py", line 446, in result
    return self.__get_result()
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/concurrent/futures/_base.py", line 391, in __get_result
    raise self._exception
ConnectionError: Failed during this or a previous request. Exception that broke the connection: Failed to reconnect within the reconnection grace period (30s)

The file indeed does not exist under /mnt/cache/share_data/huangting.p/ray/session_2023-02-20_17-46-26_882787_1059999/logs/, but it does exist under /mnt/cache/share_data/huangting.p/ray/session_2023-02-20_17-46-26_882787_1059999/logs/old/. After this failure the cluster is no longer usable; resubmitting a job fails with a Ray client connection timeout. How can this be resolved?
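For reference, the call that fails in the traceback is a plain os.stat() on the tracked log file. The minimal sketch below (hypothetical /tmp paths, not our real session directory) shows why a file that has been moved into logs/old/ breaks the log monitor's inode-based reopen check:

import os

# Hypothetical paths for illustration only; the real files are the ones
# named in the traceback above.
tracked = "/tmp/demo_logs/worker-abc-01000000-1234.err"
rotated = "/tmp/demo_logs/old/worker-abc-01000000-1234.err"

os.makedirs(os.path.dirname(rotated), exist_ok=True)
open(tracked, "w").close()

inode = os.stat(tracked).st_ino   # what the monitor remembers for the open file
os.replace(tracked, rotated)      # simulate the file being moved to logs/old/

try:
    # Mirrors reopen_if_necessary(): stat the original path to compare inodes.
    new_inode = os.stat(tracked).st_ino
except FileNotFoundError as e:
    # This is exactly the error the log monitor dies with once the file is gone.
    print("reopen check failed:", e)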


Check whether process 1061751 is still alive.

Also, the log monitor is not on the critical path, so in theory it should not affect job submission through the Ray client. You need to check the ray client server logs on the head node for anomalies.
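A minimal sketch of those two checks, assuming psutil is available (it is in the package list posted later in this thread); the PID and session directory simply echo the ones from the traceback:

import glob
import psutil

# PID taken from the missing log file name in the traceback above.
pid = 1061751
print(f"process {pid} alive:", psutil.pid_exists(pid))

# The ray client server logs live in the session's logs/ directory on the head node.
session_logs = "/mnt/cache/share_data/huangting.p/ray/session_2023-02-20_17-46-26_882787_1059999/logs"
for path in sorted(glob.glob(f"{session_logs}/ray_client_server*")):
    print(path)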

The cluster has already been torn down, so we cannot check whether process 1061751 was alive. I did check the ray client server logs as well as gcs_server.out. The logs of all core components stop at the failure time 2023-02-20T21:06:15 with no further output, including gcs_server.*, raylet.*, dashboard.*, monitor.*, and so on. According to the Slurm database, this timestamp matches the time the job failed. The logs with obvious errors around the failure time are as follows:

log_monitor.log:

2023-02-20 21:06:19,584|INFO log_monitor.py:247 -- Beginning to track file worker-7184a2fe3399ee0c45917de1781d419b7715fb6ee4923f2a960675fd-02000000-80841.out
2023-02-20 21:06:20,586|ERROR log_monitor.py:534 -- The log monitor on node SH-IDC1-10-140-1-176 failed with the following error:
Traceback (most recent call last):                                                 
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 520, in <module>
    log_monitor.run()                                                              
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 436, in run 
    anything_published = self.check_log_files_and_publish_updates()                
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 339, in check_log_files_and_publish_updates
    file_info.reopen_if_necessary()                                             
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 79, in reopen_if_necessary
    new_inode = os.stat(self.filename).st_ino                                      
FileNotFoundError: [Errno 2] No such file or directory: '/mnt/cache/share_data/huangting.p/ray/session_2023-02-20_17-46-26_882787_1059999/logs/worker-65110474ef8dc70b12a4df98c6ab9c9a671758254b23109f56b79aa6-01000000-1061751.err'
                                                                                   
2023-02-20 21:06:20,612|ERROR log_monitor.py:534 -- The log monitor on node SH-IDC1-10-140-0-76 failed with the following error:
Traceback (most recent call last):                                                 
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 520, in <module>
    log_monitor.run()                                                              
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 436, in run 
    anything_published = self.check_log_files_and_publish_updates()                
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 339, in check_log_files_and_publish_updates
    file_info.reopen_if_necessary()                                                
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 79, in reopen_if_necessary
    new_inode = os.stat(self.filename).st_ino                                      
FileNotFoundError: [Errno 2] No such file or directory: '/mnt/cache/share_data/huangting.p/ray/session_2023-02-20_17-46-26_882787_1059999/logs/worker-65110474ef8dc70b12a4df98c6ab9c9a671758254b23109f56b79aa6-01000000-1061751.err'

dashboard_agent.log:

2023-02-20 21:06:27,468|ERROR reporter_agent.py:938 -- Error publishing node physical stats.
Traceback (most recent call last):                                              
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/dashboard/modules/reporter/reporter_agent.py", line 915, in _perform_iteration
    formatted_status_string = await self._gcs_aio_client.internal_kv_get(       
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/gcs_utils.py", line 151, in wrapper                                                                 
    return await f(self, *args, **kwargs)                                                                      
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/gcs_utils.py", line 446, in internal_kv_get
    reply = await self._kv_stub.InternalKVGet(req, timeout=timeout)                                                                                                                                         
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/grpc/aio/_call.py", line 290, in __await__                                                                       
    raise _create_rpc_error(self._cython_call._initial_metadata,                
grpc.aio._call.AioRpcError: <AioRpcError of RPC that terminated with:                                          
|   status = StatusCode.UNAVAILABLE                                             
|   details = "failed to connect to all addresses; last error: UNKNOWN: ipv4:10.140.0.76:6379: Failed to connect to remote host: Connection refused"
|   debug_error_string = "UNKNOWN:failed to connect to all addresses; last error: UNKNOWN: ipv4:10.140.0.76:6379: Failed to connect to remote host: Connection refused {created_time:"2023-02-20T21:06:27.468451143+08:00", grpc_status:14}"
>                                                                               
2023-02-20 21:06:34,990|ERROR reporter_agent.py:938 -- Error publishing node physical stats.
Traceback (most recent call last):                                              
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/dashboard/modules/reporter/reporter_agent.py", line 915, in _perform_iteration
    formatted_status_string = await self._gcs_aio_client.internal_kv_get(       
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/gcs_utils.py", line 151, in wrapper
    return await f(self, *args, **kwargs)                                       
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/gcs_utils.py", line 446, in internal_kv_get
    reply = await self._kv_stub.InternalKVGet(req, timeout=timeout)             
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/grpc/aio/_call.py", line 290, in __await__
    raise _create_rpc_error(self._cython_call._initial_metadata,                
grpc.aio._call.AioRpcError: <AioRpcError of RPC that terminated with:           
|   status = StatusCode.UNAVAILABLE                                             
|   details = "failed to connect to all addresses; last error: UNKNOWN: ipv4:10.140.0.76:6379: Failed to connect to remote host: Connection refused"
|   debug_error_string = "UNKNOWN:failed to connect to all addresses; last error: UNKNOWN: ipv4:10.140.0.76:6379: Failed to connect to remote host: Connection refused {created_time:"2023-02-20T21:06:34.989999268+08:00", grpc_status:14}"
>                                                                               
2023-02-20 21:06:42,532|ERROR reporter_agent.py:938 -- Error publishing node physical stats.
Traceback (most recent call last):                                              
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/dashboard/modules/reporter/reporter_agent.py", line 915, in _perform_iteration
    formatted_status_string = await self._gcs_aio_client.internal_kv_get(       
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/gcs_utils.py", line 151, in wrapper
    return await f(self, *args, **kwargs)                                       
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/gcs_utils.py", line 446, in internal_kv_get
    reply = await self._kv_stub.InternalKVGet(req, timeout=timeout)             
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/grpc/aio/_call.py", line 290, in __await__                                                                       
    raise _create_rpc_error(self._cython_call._initial_metadata,                
grpc.aio._call.AioRpcError: <AioRpcError of RPC that terminated with:           
|   status = StatusCode.UNAVAILABLE                                             
|   details = "failed to connect to all addresses; last error: UNKNOWN: ipv4:10.140.0.76:6379: Failed to connect to remote host: Connection refused"
|   debug_error_string = "UNKNOWN:failed to connect to all addresses; last error: UNKNOWN: ipv4:10.140.0.76:6379: Failed to connect to remote host: Connection refused {created_time:"2023-02-20T21:06:42.53237816+08:00", grpc_status:14}"
2023-02-20 21:06:50,092|ERROR reporter_agent.py:938 -- Error publishing node physical stats.
Traceback (most recent call last):                                              
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/dashboard/modules/reporter/reporter_agent.py", line 915, in _perform_iteration
    formatted_status_string = await self._gcs_aio_client.internal_kv_get(       
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/gcs_utils.py", line 151, in wrapper
    return await f(self, *args, **kwargs)                                       
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/gcs_utils.py", line 446, in internal_kv_get
    reply = await self._kv_stub.InternalKVGet(req, timeout=timeout)             
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/grpc/aio/_call.py", line 290, in __await__
    raise _create_rpc_error(self._cython_call._initial_metadata,                
grpc.aio._call.AioRpcError: <AioRpcError of RPC that terminated with:           
|   status = StatusCode.UNAVAILABLE                                             
|   details = "failed to connect to all addresses; last error: UNKNOWN: ipv4:10.140.0.76:6379: Failed to connect to remote host: Connection refused"
|   debug_error_string = "UNKNOWN:failed to connect to all addresses; last error: UNKNOWN: ipv4:10.140.0.76:6379: Failed to connect to remote host: Connection refused {created_time:"2023-02-20T21:06:50.092078005+08:00", grpc_status:14}"
>                                                                               
2023-02-20 21:06:57,605|ERROR reporter_agent.py:938 -- Error publishing node physical stats.
Traceback (most recent call last):                                              
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/dashboard/modules/reporter/reporter_agent.py", line 915, in _perform_iteration
    formatted_status_string = await self._gcs_aio_client.internal_kv_get(       
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/gcs_utils.py", line 151, in wrapper
    return await f(self, *args, **kwargs)                                       
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/gcs_utils.py", line 446, in internal_kv_get
    reply = await self._kv_stub.InternalKVGet(req, timeout=timeout)             
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/grpc/aio/_call.py", line 290, in __await__
    raise _create_rpc_error(self._cython_call._initial_metadata,                
grpc.aio._call.AioRpcError: <AioRpcError of RPC that terminated with:           
|   status = StatusCode.UNAVAILABLE                                             
|   details = "failed to connect to all addresses; last error: UNKNOWN: ipv4:10.140.0.76:6379: Failed to connect to remote host: Connection refused"
|   debug_error_string = "UNKNOWN:failed to connect to all addresses; last error: UNKNOWN: ipv4:10.140.0.76:6379: Failed to connect to remote host: Connection refused {grpc_status:14, created_time:"2023-02-20T21:06:57.605693228+08:00"}"
>                                                                               
2023-02-20 21:07:05,119|ERROR reporter_agent.py:938 -- Error publishing node physical stats.
Traceback (most recent call last):                                              
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/dashboard/modules/reporter/reporter_agent.py", line 915, in _perform_iteration
    formatted_status_string = await self._gcs_aio_client.internal_kv_get(       
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/gcs_utils.py", line 151, in wrapper
    return await f(self, *args, **kwargs)                                       
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/gcs_utils.py", line 446, in internal_kv_get
    reply = await self._kv_stub.InternalKVGet(req, timeout=timeout)             
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/grpc/aio/_call.py", line 290, in __await__
    raise _create_rpc_error(self._cython_call._initial_metadata,                
grpc.aio._call.AioRpcError: <AioRpcError of RPC that terminated with:           
|   status = StatusCode.UNAVAILABLE                                             
|   details = "failed to connect to all addresses; last error: UNKNOWN: ipv4:10.140.0.76:6379: Failed to connect to remote host: Connection refused" 

// truncated; reply if more detail is needed.

ray_client_server_23001.err:

2023-02-20 21:06:20,587|WARNING worker.py:1851 -- The log monitor on node SH-IDC1-10-140-1-176 failed with the following error:
Traceback (most recent call last):                                              
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 520, in <module>
    log_monitor.run()                                                              
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 436, in run 
    anything_published = self.check_log_files_and_publish_updates()                
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 339, in check_log_files_and_publish_updates
    file_info.reopen_if_necessary()                                             
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 79, in reopen_if_necessary
    new_inode = os.stat(self.filename).st_ino                                      
FileNotFoundError: [Errno 2] No such file or directory: '/mnt/cache/share_data/huangting.p/ray/session_2023-02-20_17-46-26_882787_1059999/logs/worker-65110474ef8dc70b12a4df98c6ab9c9a671758254b23109f56b79aa6-01000000-1061751.err'
                                                                                   
2023-02-20 21:06:20,612|WARNING worker.py:1851 -- The log monitor on node SH-IDC1-10-140-0-76 failed with the following error:
Traceback (most recent call last):                                                 
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 520, in <module>
    log_monitor.run()                                                              
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 436, in run 
    anything_published = self.check_log_files_and_publish_updates()                
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 339, in check_log_files_and_publish_updates
    file_info.reopen_if_necessary()                                                
  File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/_private/log_monitor.py", line 79, in reopen_if_necessary
    new_inode = os.stat(self.filename).st_ino                                      
FileNotFoundError: [Errno 2] No such file or directory: '/mnt/cache/share_data/huangting.p/ray/session_2023-02-20_17-46-26_882787_1059999/logs/worker-65110474ef8dc70b12a4df98c6ab9c9a671758254b23109f56b79aa6-01000000-1061751.err'

raylet.2.err:

/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/dashboard/agent.py:51: DeprecationWarning: There is no current event loop
  aiogrpc.init_grpc_aio()
/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/dashboard/modules/reporter/reporter_agent.py:65: UserWarning: Importing gpustat failed, fix this to have full functionality of the dashboard. The original error was:

libtinfow.so.6: cannot open shared object file: No such file or directory
  warnings.warn(
[2023-02-20 21:07:26,085 C 39637 39637] (raylet) gcs_rpc_client.h:537:  Check failed: absl::ToInt64Seconds(absl::Now() - gcs_last_alive_time_) < ::RayConfig::instance().gcs_rpc_server_reconnect_timeout_s() Failed to connect to GCS within 60 seconds
*** StackTrace Information ***
/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/core/src/ray/raylet/raylet(+0x4fa17a) [0x556a3ea8d17a] ray::operator<<()
/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/core/src/ray/raylet/raylet(+0x4fbc52) [0x556a3ea8ec52] ray::SpdLogMessage::Flush()
/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/core/src/ray/raylet/raylet(+0x4fbf67) [0x556a3ea8ef67] ray::RayLog::~RayLog()
/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/core/src/ray/raylet/raylet(+0x386493) [0x556a3e919493] ray::rpc::GcsRpcClient::CheckChannelStatus()
/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/core/src/ray/raylet/raylet(+0x386903) [0x556a3e919903] boost::asio::detail::wait_handler<>::do_complete()
/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/core/src/ray/raylet/raylet(+0xa4a87b) [0x556a3efdd87b] boost::asio::detail::scheduler::do_run_one()
/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/core/src/ray/raylet/raylet(+0xa4c041) [0x556a3efdf041] boost::asio::detail::scheduler::run()
/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/core/src/ray/raylet/raylet(+0xa4c270) [0x556a3efdf270] boost::asio::io_context::run()
/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/core/src/ray/raylet/raylet(+0x15b1a9) [0x556a3e6ee1a9] main
/lib64/libc.so.6(__libc_start_main+0xf5) [0x7fe4ef4f2555] __libc_start_main
/mnt/petrelfs/huangting.p/.conda/envs/ray2.2.0-py3.10/lib/python3.10/site-packages/ray/core/src/ray/raylet/raylet(+0x19b1b7) [0x556a3e72e1b7]

Neither gcs_server.err nor gcs_server.out contains any error output.

Additional note on the log file worker-65110474ef8dc70b12a4df98c6ab9c9a671758254b23109f56b79aa6-01000000-1061751.err: judging from the corresponding python-core-worker-* log files, this actor belonged to the previous job and had already exited at 2023-02-20 17:52:44,135.
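The way we attributed that worker log to the earlier job was essentially a search of the per-worker core logs. A rough, illustrative sketch (the worker id and session directory are the ones from this thread; the keyword filter is just for convenience):

import glob

session_logs = "/mnt/cache/share_data/huangting.p/ray/session_2023-02-20_17-46-26_882787_1059999/logs"
worker_id = "65110474ef8dc70b12a4df98c6ab9c9a671758254b23109f56b79aa6"

# Scan the python-core-worker-* logs for this worker id and print the lines
# that mention its job or its exit, to see which job it belonged to and when it died.
for path in glob.glob(f"{session_logs}/python-core-worker-*{worker_id}*"):
    print(path)
    with open(path, errors="replace") as f:
        for line in f:
            if "job" in line.lower() or "exit" in line.lower():
                print("  ", line.rstrip())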

I think the key question is the raylet error: why did its connection to GCS time out?

The raylet `Check failed` means the raylet has already crashed, and that is what makes the Ray client connection time out.
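For what it's worth, the dashboard_agent errors above also show "Connection refused" to 10.140.0.76:6379, which is the GCS address, so a quick probe from a worker node can help distinguish a dead gcs_server process from a network problem. A minimal sketch using the address taken from those logs:

import socket

# GCS address from the dashboard_agent errors above.
gcs_host, gcs_port = "10.140.0.76", 6379

try:
    with socket.create_connection((gcs_host, gcs_port), timeout=5):
        print("GCS port is reachable")
except OSError as e:
    # "Connection refused" usually means the gcs_server process is gone;
    # a timeout points more toward a network or firewall issue.
    print("cannot reach GCS:", e)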

Thanks for the reply. We were tied up by another urgent project and did not follow up on the forum, sorry. We have since tried Ray 2.3.1. At first we hit a segmentation fault, which we resolved by following a GitHub issue, but we now have a new problem: training is still not stable, and the job fails after about 7 hours of training.
Environment: CentOS Linux release 7.6.1810, Linux 3.10.0-957.el7.x86_64
Ray 2.3.1, Python 3.10. Installed packages:

absl-py==1.4.0
aiohttp==3.8.4
aiohttp-cors==0.7.0
aiosignal==1.3.1
async-timeout==4.0.2
attrs==23.1.0
blessed==1.20.0
boto3==1.26.118
botocore==1.29.118
brotlipy==0.7.0
cachetools==5.3.0
certifi==2022.12.7
charset-normalizer==2.1.1
click==8.1.3
coloredlogs==15.0.1
colorful==0.5.5
conda==23.3.1
contextlib2==21.6.0
distlib==0.3.6
dm-tree==0.1.8
einops==0.6.1
environs==9.5.0
fairscale==0.4.13
filelock==3.12.0
flash-attn==1.0.3.post0
frozenlist==1.3.3
gitdb==4.0.10
GitPython==3.1.31
google-api-core==2.11.0
google-auth==2.17.3
googleapis-common-protos==1.59.0
gpustat==1.1
GPUtil==1.4.0
grpcio==1.51.1
h5py==3.8.0
huggingface-hub==0.13.4
humanfriendly==10.0
humanize==4.6.0
jmespath==1.0.1
jsonpointer==2.1
jsonschema==4.17.3
marshmallow==3.19.0
mkl-fft==1.3.1
mkl-service==2.4.0
ml-collections==0.1.1
msgpack==1.0.5
multidict==6.0.4
multiprocessing-logging==0.3.4
natsort==8.3.1
numpy==1.24.2
nvidia-ml-py==11.525.112
opencensus==0.11.2
opencensus-context==0.1.3
opencv-python==4.7.0.72
packaging==23.1
pandas==2.0.0
Pillow==9.2.0
platformdirs==3.2.0
prettytable==3.7.0
prometheus-client==0.16.0
protobuf==4.22.3
psutil==5.9.5
py-spy==0.3.14
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycosat==0.6.3
pydantic==1.10.7
pyrsistent==0.19.3
python-dateutil==2.8.2
python-dotenv==1.0.0
PyTurboJPEG==1.7.1
pytz==2023.3
PyYAML==6.0
ray==2.3.1
requests==2.28.2
rsa==4.9
rwlock==0.0.7
s3transfer==0.6.0
scipy==1.10.1
smart-open==6.3.0
smmap==5.0.0
tifffile==2023.4.12
timm==0.6.12
torch==1.12.1
torchaudio==0.12.1
torchvision==0.13.1
tzdata==2023.3
urllib3
virtualenv==20.22.0
wcwidth==0.2.6
yarl==1.8.2

Task log:

(_RunnerCls pid=250831, ip=10.140.1.155) Traceback (most recent call last):
(_RunnerCls pid=250831, ip=10.140.1.155)   File "/mnt/petrelfs/huangting.p/workspace/uniscale/uniscale/path/ddppath.py", line 228, in ddp_coordinator_routine
(_RunnerCls pid=250831, ip=10.140.1.155)     phase_2_result.append(obj_get(tmp))
(_RunnerCls pid=250831, ip=10.140.1.155)   File "/mnt/petrelfs/huangting.p/workspace/uniscale/uniscale/_internal/rpc.py", line 261, in _get
(_RunnerCls pid=250831, ip=10.140.1.155)     raise UniscaleInternalError(traceback.format_exc()) from None
(_RunnerCls pid=250831, ip=10.140.1.155) uniscale.exceptions.UniscaleInternalError: Traceback (most recent call last):
(_RunnerCls pid=250831, ip=10.140.1.155)   File "/mnt/petrelfs/huangting.p/workspace/uniscale/uniscale/_internal/rpc.py", line 259, in _get
(_RunnerCls pid=250831, ip=10.140.1.155)     return ray.get(object_refs, timeout=timeout)
(_RunnerCls pid=250831, ip=10.140.1.155)   File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.3.1-py3.10/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
(_RunnerCls pid=250831, ip=10.140.1.155)     return func(*args, **kwargs)
(_RunnerCls pid=250831, ip=10.140.1.155)   File "/mnt/petrelfs/huangting.p/.conda/envs/ray2.3.1-py3.10/lib/python3.10/site-packages/ray/_private/worker.py", line 2382, in get
(_RunnerCls pid=250831, ip=10.140.1.155)     raise value
(_RunnerCls pid=250831, ip=10.140.1.155) ray.exceptions.RayActorError: The actor died unexpectedly before finishing this task.
(_RunnerCls pid=250831, ip=10.140.1.155) |  class_name: _RunnerCls
(_RunnerCls pid=250831, ip=10.140.1.155) |  actor_id: 99c5762bdfcf5764ff37103003000000
(_RunnerCls pid=250831, ip=10.140.1.155) |  pid: 64557
(_RunnerCls pid=250831, ip=10.140.1.155) |  name: imagenet2012_loop_0_gen_2_model_0_train__0_49478640815826100530583609975395234515
(_RunnerCls pid=250831, ip=10.140.1.155) |  namespace: uniscale_user
(_RunnerCls pid=250831, ip=10.140.1.155) |  ip: 10.140.1.152
(_RunnerCls pid=250831, ip=10.140.1.155) The actor is dead because it was killed by `ray.kill`.
(_RunnerCls pid=250831, ip=10.140.1.155) -
(_RunnerCls pid=250831, ip=10.140.1.155) -
Traceback (most recent call last):                                              
  File "/mnt/petrelfs/huangting.p/workspace/mu2net_uniscale/main.py", line 501, in <module>
    main()                                                                      
  File "/mnt/petrelfs/huangting.p/workspace/mu2net_uniscale/main.py", line 417, in main
    task_iter(task, generation_size, pop, generation_id, mu2net.utils.loop_id, exp_config, args)
  File "/mnt/petrelfs/huangting.p/workspace/mu2net_uniscale/main.py", line 156, in task_iter
    paths_return_info = train_loop(paths, task, ds_hparams, exp_config, train_infos)
  File "/mnt/petrelfs/huangting.p/workspace/mu2net_uniscale/mu2net/train.py", line 596, in train_loop
    assert False, "none of the paths successfully returned something!"          
AssertionError: none of the paths successfully returned something!                                                                                                                                                                                                   
(_RunnerCls pid=250831, ip=10.140.1.155) Traceback (most recent call last):
(_RunnerCls pid=250831, ip=10.140.1.155)   File "python/ray/_raylet.pyx", line 1166, in ray._raylet.task_execution_handler
srun: error: SH-IDC1-10-140-0-133: task 0: Exited with exit code 1

Ray log:

From the logs, it appears the worker was killed by `ray.kill` because its "owner process" died, but searching for the owner process 030…ffff… turns up no other information. A few questions:

  1. What exactly is the concept of an "owner process"? Could you explain it in a bit more detail?
  2. Any recommendations for further investigation?
  3. Based on the GitHub issue mentioned above and our earlier experience, the versions of the core components seem to have a large impact on Ray's stability. Do you have any recommended versions for the core components?

The error above looks fairly clear: `assert False, "none of the paths successfully returned something!"` should be in your own business code, and that assert is what brought the worker down.

Hmm, I don't think so. That part of the code looks like this:

for xxx in xxxx:
    single_train_path = uniscale.TorchDDPPath(
        name=build_path_name(fmt_task_name(task.name), infos["loop_id"], infos["gen_id"], ind) + "_train_",
        worker_num=get_config_info(exp_config, mu2net.utils.loop_id, config_key="ddp_num_worker"),
        worker_num_per_node=min(get_config_info(exp_config, mu2net.utils.loop_id, config_key="ddp_num_worker"), 8),
        resource_per_work=resource_per_work,
        train_func=single_model_train_loop,
        train_kwargs=train_kwargs,
        dist_backend="nccl",
    )
    uniscale.submit_path(single_train_path)
# ......
while True:
    if len(submitted_paths) == 0:
        break
    time.sleep(20)
    alive_path_ids = list(submitted_paths.keys())
    for path_id in alive_path_ids:
        single_train_path = submitted_paths[path_id]
        if single_train_path.status in [PathStatus.RUNNING, PathStatus.PENDING]:
            if single_train_path.status == PathStatus.RUNNING:
                if path_beg_times.get(single_train_path, None) is None:
                    path_beg_times[single_train_path] = time.time()
                else:
                    time_used = time.time() - path_beg_times[single_train_path]
                    if time_used > path_time_out:
                        # Abort paths that have been running longer than the timeout.
                        logger.warning(f"path {path_id} used time: {time_used} > {path_time_out}, aborting...")
                        uniscale.delete_path(single_train_path)
                        submitted_paths.pop(path_id)
            continue
        elif single_train_path.status == PathStatus.COMPLETED:
            single_train_path_result = single_train_path.get_result()[0]
            submitted_paths.pop(path_id)
            uniscale.delete_path(single_train_path)
            results_kv[path_id] = single_train_path_result
        else:
            # Any other status is treated as a failure; retry up to max_try_num times.
            print(single_train_path.status)
            print(single_train_path.uuid)
            print(f"Error: train_loop failed of child_model {path_id}")
            logger.error(f"Error: train_loop failed of child_model {path_id}")
            submitted_paths.pop(path_id)
            uniscale.delete_path(single_train_path)
            if path_try_ids[path_id] < max_try_num:
                # resubmit path
                logger.info(f"start retry path {path_id}")
                submit_mu2net_path(path_id)
            else:
                logger.info(f"path {path_id} failed with try_id={path_try_ids[path_id]}")
if len(results_kv) == 0:
    assert False, "none of the paths successfully returned something!"
for path_id in range(len(mu2net_paths)):
    results.append(results_kv.get(path_id, None))
A Path is a wrapper around a Ray actor. The assert fires because none of the paths, i.e. none of the Ray actors, returned a valid result; the root cause is still that the Ray actors themselves failed.
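To make the next failure easier to diagnose, one option is to stop letting the internal rpc wrapper swallow the actor-death details and instead log the RayActorError message (it states whether the actor was killed by `ray.kill`, crashed, or lost its node) before re-raising. A minimal sketch with a hypothetical helper name:

import ray
from ray.exceptions import RayActorError

def safe_get(object_ref, timeout=None):
    # Hypothetical helper: surface actor-death details instead of wrapping them away.
    try:
        return ray.get(object_ref, timeout=timeout)
    except RayActorError as err:
        # The message includes the actor id and the cause Ray reports for its death.
        print(f"actor call failed: {err}")
        raise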