Skip to content
Merged

Fix ppu #6489

Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/Instruction/Command-line-parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -843,3 +843,4 @@ qwen2_5_omni除了包含qwen2_5_vl和qwen2_audio的模型特定参数外,还
- VLLM_USE_V1: 用于切换vLLM使用V0/V1版本。
- SWIFT_TIMEOUT: (ms-swift>=3.10) 若多模态数据集中存在图像URL,该参数用于控制获取图片的timeout,默认为20s。
- ROOT_IMAGE_DIR: (ms-swift>=3.8) 图像(多模态)资源的根目录。通过设置该参数,可以在数据集中使用相对于 `ROOT_IMAGE_DIR` 的相对路径。默认情况下,是相对于运行目录的相对路径。
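- SWIFT_SINGLE_DEVICE_MODE: (ms-swift>=3.10) 单设备模式,设置为`1`时启用。在此模式下,每个进程只能看到一个设备:程序会根据`LOCAL_RANK`从`CUDA_VISIBLE_DEVICES`中选出对应的设备,将`CUDA_VISIBLE_DEVICES`设置为该设备,并将`LOCAL_RANK`重置为0。目前用于兼容PPU设备。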
1 change: 1 addition & 0 deletions docs/source_en/Instruction/Command-line-parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -868,3 +868,4 @@ The meanings of the following parameters can be found in the example code [here]
- VLLM_USE_V1: Used to switch between V0 and V1 versions of vLLM.
- SWIFT_TIMEOUT: (ms-swift >= 3.10) If the multimodal dataset contains image URLs, this parameter controls the timeout for fetching images, defaulting to 20 seconds.
- ROOT_IMAGE_DIR: (ms-swift>=3.8) The root directory for image (multimodal) resources. By setting this parameter, relative paths in the dataset can be interpreted relative to `ROOT_IMAGE_DIR`. By default, paths are relative to the current working directory.
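- SWIFT_SINGLE_DEVICE_MODE: (ms-swift>=3.10) Single device mode; set to `1` to enable. In this mode, each process can see only one device: the entry in `CUDA_VISIBLE_DEVICES` selected by `LOCAL_RANK` becomes the only visible device, and `LOCAL_RANK` is reset to 0. Currently used for compatibility with PPU devices.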
4 changes: 3 additions & 1 deletion swift/cli/pt.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from swift.llm import pt_main

if __name__ == '__main__':
from swift.cli.utils import try_use_single_device_mode
try_use_single_device_mode()
from swift.llm import pt_main
pt_main()
4 changes: 3 additions & 1 deletion swift/cli/rlhf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from swift.llm import rlhf_main

if __name__ == '__main__':
from swift.cli.utils import try_use_single_device_mode
try_use_single_device_mode()
from swift.llm import rlhf_main
rlhf_main()
2 changes: 2 additions & 0 deletions swift/cli/sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ def try_init_unsloth():


if __name__ == '__main__':
from swift.cli.utils import try_use_single_device_mode
try_use_single_device_mode()
try_init_unsloth()
from swift.ray import try_init_ray
try_init_ray()
Expand Down
9 changes: 9 additions & 0 deletions swift/cli/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import os


def try_use_single_device_mode():
    """Restrict this process to one device when single-device mode is enabled.

    The mode is enabled by setting the environment variable
    ``SWIFT_SINGLE_DEVICE_MODE`` to ``'1'``. When enabled, the entry of
    ``CUDA_VISIBLE_DEVICES`` at index ``LOCAL_RANK`` becomes the only visible
    device and ``LOCAL_RANK`` is reset to ``'0'``.

    The function is a no-op when the mode is disabled or when ``LOCAL_RANK``
    is not set (there is no per-process rank to select a device with).

    Raises:
        ValueError: if ``LOCAL_RANK`` is not an integer.
        IndexError: if ``LOCAL_RANK`` is out of range for the devices listed
            in ``CUDA_VISIBLE_DEVICES``.
    """
    if os.environ.get('SWIFT_SINGLE_DEVICE_MODE', '0') != '1':
        return

    local_rank = os.environ.get('LOCAL_RANK')
    if local_rank is None:
        # No per-process rank in the environment (e.g. a plain single-process
        # launch): leave the environment untouched instead of raising KeyError.
        return
    rank = int(local_rank)

    visible = os.environ.get('CUDA_VISIBLE_DEVICES')
    if visible is None:
        # No restriction is configured, so the local rank is itself the
        # index of the device this process should use.
        device = str(rank)
    else:
        # Tolerate whitespace around the ids, e.g. "0, 1, 2".
        devices = [dev.strip() for dev in visible.split(',')]
        try:
            device = devices[rank]
        except IndexError:
            raise IndexError(
                f'LOCAL_RANK={rank} is out of range for CUDA_VISIBLE_DEVICES={visible!r}') from None

    os.environ['CUDA_VISIBLE_DEVICES'] = device
    # Only one device is visible now, so this process addresses it as rank 0.
    os.environ['LOCAL_RANK'] = '0'
11 changes: 7 additions & 4 deletions swift/utils/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,13 @@ def is_mp() -> bool:
from swift.utils import get_device_count
n_gpu = get_device_count()
local_world_size = get_dist_setting()[3]
assert n_gpu % local_world_size == 0, f'n_gpu: {n_gpu}, local_world_size: {local_world_size}'
if n_gpu // local_world_size >= 2:
return True
return False
if os.environ.get('SWIFT_SINGLE_DEVICE_MODE', '0') != '1':
assert n_gpu % local_world_size == 0, f'n_gpu: {n_gpu}, local_world_size: {local_world_size}'
if n_gpu // local_world_size >= 2:
return True
return False
else:
return False


def is_mp_ddp() -> bool:
Expand Down
Loading