Skip to content
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/test_cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
- run: |
export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
export TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
pip install "paddlepaddle-gpu==3.0.0" -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
pip install --find-links "https://www.paddlepaddle.org.cn/packages/nightly/cu126/paddlepaddle-gpu/" --index-url https://pypi.org/simple "paddlepaddle-gpu==3.3.0.dev20251204"
source/install/uv_with_retry.sh pip install --system -v -e .[gpu,test,lmp,cu12,torch,jax] mpi4py --reinstall-package deepmd-kit
env:
DP_VARIANT: cuda
Expand All @@ -61,6 +61,7 @@ jobs:
# See https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html
XLA_PYTHON_CLIENT_PREALLOCATE: false
XLA_PYTHON_CLIENT_ALLOCATOR: platform
FLAGS_use_stride_compute_kernel: 0
- name: Convert models
run: source/tests/infer/convert-models.sh
- run: |
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/test_python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
export TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
source/install/uv_with_retry.sh pip install --system -e .[test,jax] mpi4py --group pin_jax
source/install/uv_with_retry.sh pip install --system --pre "paddlepaddle==3.0.0" -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
source/install/uv_with_retry.sh pip install --system --find-links "https://www.paddlepaddle.org.cn/packages/nightly/cpu/paddlepaddle/" --index-url https://pypi.org/simple paddlepaddle==3.3.0.dev20251204
env:
# Please note that uv has some issues with finding an
# existing TensorFlow package. Currently, it uses
Expand Down Expand Up @@ -60,6 +60,7 @@ jobs:
- run: pytest --cov=deepmd source/tests --splits 6 --group ${{ matrix.group }} --store-durations --clean-durations --durations-path=.test_durations --splitting-algorithm least_duration
env:
NUM_WORKERS: 0
FLAGS_use_stride_compute_kernel: 0
- name: Test TF2 eager mode
run: pytest --cov=deepmd --cov-append source/tests/consistent/io/test_io.py source/jax2tf_tests
env:
Expand Down
4 changes: 2 additions & 2 deletions deepmd/pd/entrypoints/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def get_trainer(
# Initialize DDP
world_size = dist.get_world_size()
if world_size > 1:
assert paddle.version.nccl() != "0"
assert not paddle.core.is_compiled_with_nccl() or paddle.version.nccl() != "0"
fleet.init(is_collective=True)

def prepare_trainer_input_single(
Expand Down Expand Up @@ -214,7 +214,7 @@ def get_compute_device(self) -> str:

def get_ngpus(self) -> int:
"""Get the number of GPUs."""
return paddle.device.cuda.device_count()
return paddle.device.device_count()

def get_backend_info(self) -> dict:
"""Get backend information."""
Expand Down
4 changes: 2 additions & 2 deletions deepmd/pd/utils/auto_batch_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def is_gpu_available(self) -> bool:
bool
True if GPU is available
"""
return paddle.device.cuda.device_count() > 0
return paddle.device.device_count() > 0

def is_oom_error(self, e: Exception) -> bool:
"""Check if the exception is an OOM error.
Expand All @@ -51,6 +51,6 @@ def is_oom_error(self, e: Exception) -> bool:
# (the meaningless error message should be considered as a bug in cusolver)
if isinstance(e, MemoryError) and ("ResourceExhaustedError" in e.args[0]):
# Release all unoccupied cached memory
paddle.device.cuda.empty_cache()
paddle.device.empty_cache()
return True
return False
4 changes: 2 additions & 2 deletions deepmd/pd/utils/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@
# Make sure DDP uses correct device if applicable
LOCAL_RANK = int(os.environ.get("PADDLE_LOCAL_RANK", 0))

if os.environ.get("DEVICE") == "cpu" or paddle.device.cuda.device_count() <= 0:
if os.environ.get("DEVICE") == "cpu" or paddle.device.device_count() <= 0:
DEVICE = "cpu"
else:
DEVICE = f"gpu:{LOCAL_RANK}"
DEVICE = paddle.device.get_device()

paddle.device.set_device(DEVICE)

Expand Down
16 changes: 15 additions & 1 deletion deepmd/pd/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
annotations,
)

import warnings
from contextlib import (
contextmanager,
)
Expand Down Expand Up @@ -345,8 +346,21 @@ def get_generator(
generator = paddle.framework.core.default_cuda_generator(
int(DEVICE.split("gpu:")[1])
)
elif DEVICE == "xpu":
generator = paddle.framework.core.default_xpu_generator(0)
elif DEVICE.startswith("xpu:"):
generator = paddle.framework.core.default_xpu_generator(
int(DEVICE.split("xpu:")[1])
)
else:
raise ValueError("DEVICE should be cpu or gpu or gpu:x")
# return None for compatibility across different devices
warnings.warn(
f"DEVICE is {DEVICE}, which is not supported. Returning None.",
category=UserWarning,
stacklevel=2,
)
return None
# raise ValueError("DEVICE should be cpu or gpu or gpu:x or xpu or xpu:x")
generator.manual_seed(seed)
return generator
else:
Expand Down
3 changes: 2 additions & 1 deletion source/tests/pd/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@
@pytest.fixture(scope="package", autouse=True)
def clear_cuda_memory(request):
yield
paddle.device.cuda.empty_cache()
if paddle.device.get_device() != "cpu":
paddle.device.empty_cache()
12 changes: 12 additions & 0 deletions source/tests/pd/test_multitask.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
)

import numpy as np
import paddle

from deepmd.pd.entrypoints.main import (
get_trainer,
Expand Down Expand Up @@ -232,8 +233,13 @@ def setUp(self) -> None:
self.config["model"], self.shared_links = preprocess_shared_params(
self.config["model"]
)
self.FLAGS_use_stride_kernel = paddle.get_flags("FLAGS_use_stride_kernel")[
"FLAGS_use_stride_kernel"
]
paddle.set_flags({"FLAGS_use_stride_kernel": False})

def tearDown(self) -> None:
paddle.set_flags({"FLAGS_use_stride_kernel": self.FLAGS_use_stride_kernel})
MultiTaskTrainTest.tearDown(self)


Expand Down Expand Up @@ -271,9 +277,15 @@ def setUp(self) -> None:
self.config["model"], self.shared_links = preprocess_shared_params(
self.config["model"]
)
self.config["learning_rate"]["start_lr"] = 1e-5
self.share_fitting = True
self.FLAGS_use_stride_kernel = paddle.get_flags("FLAGS_use_stride_kernel")[
"FLAGS_use_stride_kernel"
]
paddle.set_flags({"FLAGS_use_stride_kernel": False})

def tearDown(self) -> None:
paddle.set_flags({"FLAGS_use_stride_kernel": self.FLAGS_use_stride_kernel})
MultiTaskTrainTest.tearDown(self)


Expand Down