TwT-6's picture
Upload 2667 files
256a159 verified
03/06 17:10:13 - OpenCompass - INFO - Task [my_api/siqa]
/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/transformers/utils/generic.py:311: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
torch.utils._pytree._register_pytree_node(
/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/transformers/utils/generic.py:311: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
torch.utils._pytree._register_pytree_node(
Python 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0]
Type 'copyright', 'credits' or 'license' for more information
IPython 8.22.2 -- An enhanced Interactive Python. Type '?' for help.
In [1]: [2024-03-06 17:22:48,542] torch.distributed.elastic.agent.server.api: [WARNING] Received Signals.SIGINT death signal, shutting down workers
[2024-03-06 17:22:48,543] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 235854 closing signal SIGINT
[2024-03-06 17:22:48,642] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 235854 closing signal SIGTERM
Traceback (most recent call last):
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 727, in run
result = self._invoke_run(role)
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 868, in _invoke_run
time.sleep(monitor_interval)
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 62, in _terminate_process_handler
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
torch.distributed.elastic.multiprocessing.api.SignalException: Process 235781 got signal: 2
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 734, in run
self._shutdown(e.sigval)
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 311, in _shutdown
self._pcontext.close(death_sig)
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 318, in close
self._close(death_sig=death_sig, timeout=timeout)
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 706, in _close
handler.proc.wait(time_to_wait)
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/subprocess.py", line 1209, in wait
return self._wait(timeout=timeout)
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/subprocess.py", line 1953, in _wait
time.sleep(delay)
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 62, in _terminate_process_handler
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
torch.distributed.elastic.multiprocessing.api.SignalException: Process 235781 got signal: 2
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/export/home/tanwentao1/anaconda3/envs/opencompass/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
return f(*args, **kwargs)
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main
run(args)
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run
elastic_launch(
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 259, in launch_agent
result = agent.run()
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper
result = f(*args, **kwargs)
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 739, in run
self._shutdown()
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 311, in _shutdown
self._pcontext.close(death_sig)
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 318, in close
self._close(death_sig=death_sig, timeout=timeout)
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 706, in _close
handler.proc.wait(time_to_wait)
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/subprocess.py", line 1209, in wait
return self._wait(timeout=timeout)
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/subprocess.py", line 1953, in _wait
time.sleep(delay)
File "/export/home/tanwentao1/anaconda3/envs/opencompass/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 62, in _terminate_process_handler
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
torch.distributed.elastic.multiprocessing.api.SignalException: Process 235781 got signal: 2