Describe the bug
When building the latest code from source on Ubuntu 18.04.3 LTS and running the BERT sparse-pruning example in DeepSpeedExamples, the run crashes.
According to the log, this is because deepspeed/runtime/config_utils.py was recently changed to use the pydantic `BaseModel.fields` attribute, which does not exist in the latest pydantic version (2.0a2 in this environment).
To Reproduce
- cd DeepSpeed
- python setup.py install
- cd DeepSpeedExamples/compression/bert
- source ./bash_script/pruning_sparse.sh # note: the port number in the .sh file is wrong and must be changed manually first
Expected behavior
The example should run without crashing.
Screenshots
/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/pydantic-2.0a2-py3.8.egg/pydantic/_internal/_config.py:230: UserWarning: Valid config keys have changed in V2:
- 'allow_population_by_field_name' has been renamed to 'populate_by_name'
- 'validate_all' has been renamed to 'validate_default'
warnings.warn(message, UserWarning)
Traceback (most recent call last):
File "run_glue_no_trainer.py", line 49, in
from huggingface_transformer.modeling_bert import BertForSequenceClassification
File "/home/wenhuach/DeepSpeedExamples/compression/bert/huggingface_transformer/modeling_bert.py", line 51, in
from transformers.modeling_utils import (
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/transformers/modeling_utils.py", line 37, in
from .deepspeed import deepspeed_config, is_deepspeed_zero3_enabled
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/transformers/deepspeed.py", line 38, in
from accelerate.utils.deepspeed import HfDeepSpeedConfig as DeepSpeedConfig
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/accelerate/init.py", line 7, in
from .accelerator import Accelerator
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/accelerate/accelerator.py", line 29, in
from .checkpointing import load_accelerator_state, load_custom_state, save_accelerator_state, save_custom_state
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/accelerate/checkpointing.py", line 24, in
from .utils import (
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/accelerate/utils/init.py", line 124, in
from .other import (
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/accelerate/utils/other.py", line 27, in
from deepspeed import DeepSpeedEngine
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/init.py", line 16, in
from . import module_inject
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/module_inject/init.py", line 6, in
from .replace_module import replace_transformer_layer, revert_transformer_layer, ReplaceWithTensorSlicing, GroupQuantizer, generic_injection
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/module_inject/replace_module.py", line 732, in
from ..pipe import PipelineModule
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/pipe/init.py", line 6, in
from ..runtime.pipe import PipelineModule, LayerSpec, TiedLayerSpec
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/runtime/pipe/init.py", line 6, in
from .module import PipelineModule, LayerSpec, TiedLayerSpec
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/runtime/pipe/module.py", line 19, in
from ..activation_checkpointing import checkpointing
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 25, in
from deepspeed.runtime.config import DeepSpeedConfig
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/runtime/config.py", line 28, in
from .zero.config import get_zero_config, ZeroStageEnum
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/runtime/zero/init.py", line 6, in
from .partition_parameters import ZeroParamType
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/runtime/zero/partition_parameters.py", line 601, in
class Init(InsertPostInitMethodToModuleSubClasses):
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/runtime/zero/partition_parameters.py", line 603, in Init
param_persistence_threshold = get_config_default(DeepSpeedZeroConfig, "param_persistence_threshold")
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/runtime/config_utils.py", line 114, in get_config_default
assert field_name in config.fields, f"'{field_name}' is not a field in {config}"
AttributeError: type object 'DeepSpeedZeroConfig' has no attribute 'fields'
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2220051) of binary: /home/wenhuach/anaconda3/envs/torch/bin/python
Traceback (most recent call last):
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/torch/distributed/launch.py", line 195, in
main()
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/torch/distributed/launch.py", line 191, in main
launch(args)
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/torch/distributed/launch.py", line 176, in launch
run(args)
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
run_glue_no_trainer.py FAILED
System info (please complete the following information):
- OS: Ubuntu 18.04
- GPU: NV P40
- Python version: 3.8
Describe the bug
When building the latest code from source on Ubuntu 18.04.3 LTS and running the BERT sparse-pruning example in DeepSpeedExamples, the run crashes.
According to the log, this is because deepspeed/runtime/config_utils.py was recently changed to use the pydantic `BaseModel.fields` attribute, which does not exist in the latest pydantic version (2.0a2 in this environment).
To Reproduce
Expected behavior
The example should run without crashing.
Screenshots
/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/pydantic-2.0a2-py3.8.egg/pydantic/_internal/_config.py:230: UserWarning: Valid config keys have changed in V2:
warnings.warn(message, UserWarning)
Traceback (most recent call last):
File "run_glue_no_trainer.py", line 49, in
from huggingface_transformer.modeling_bert import BertForSequenceClassification
File "/home/wenhuach/DeepSpeedExamples/compression/bert/huggingface_transformer/modeling_bert.py", line 51, in
from transformers.modeling_utils import (
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/transformers/modeling_utils.py", line 37, in
from .deepspeed import deepspeed_config, is_deepspeed_zero3_enabled
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/transformers/deepspeed.py", line 38, in
from accelerate.utils.deepspeed import HfDeepSpeedConfig as DeepSpeedConfig
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/accelerate/init.py", line 7, in
from .accelerator import Accelerator
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/accelerate/accelerator.py", line 29, in
from .checkpointing import load_accelerator_state, load_custom_state, save_accelerator_state, save_custom_state
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/accelerate/checkpointing.py", line 24, in
from .utils import (
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/accelerate/utils/init.py", line 124, in
from .other import (
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/accelerate/utils/other.py", line 27, in
from deepspeed import DeepSpeedEngine
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/init.py", line 16, in
from . import module_inject
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/module_inject/init.py", line 6, in
from .replace_module import replace_transformer_layer, revert_transformer_layer, ReplaceWithTensorSlicing, GroupQuantizer, generic_injection
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/module_inject/replace_module.py", line 732, in
from ..pipe import PipelineModule
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/pipe/init.py", line 6, in
from ..runtime.pipe import PipelineModule, LayerSpec, TiedLayerSpec
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/runtime/pipe/init.py", line 6, in
from .module import PipelineModule, LayerSpec, TiedLayerSpec
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/runtime/pipe/module.py", line 19, in
from ..activation_checkpointing import checkpointing
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 25, in
from deepspeed.runtime.config import DeepSpeedConfig
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/runtime/config.py", line 28, in
from .zero.config import get_zero_config, ZeroStageEnum
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/runtime/zero/init.py", line 6, in
from .partition_parameters import ZeroParamType
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/runtime/zero/partition_parameters.py", line 601, in
class Init(InsertPostInitMethodToModuleSubClasses):
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/runtime/zero/partition_parameters.py", line 603, in Init
param_persistence_threshold = get_config_default(DeepSpeedZeroConfig, "param_persistence_threshold")
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/deepspeed-0.9.1+cc67f22f-py3.8.egg/deepspeed/runtime/config_utils.py", line 114, in get_config_default
assert field_name in config.fields, f"'{field_name}' is not a field in {config}"
AttributeError: type object 'DeepSpeedZeroConfig' has no attribute 'fields'
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2220051) of binary: /home/wenhuach/anaconda3/envs/torch/bin/python
Traceback (most recent call last):
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/torch/distributed/launch.py", line 195, in
main()
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/torch/distributed/launch.py", line 191, in main
launch(args)
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/torch/distributed/launch.py", line 176, in launch
run(args)
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/wenhuach/anaconda3/envs/torch/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
run_glue_no_trainer.py FAILED
System info (please complete the following information):