deepmodeling · njzjz · Dec 9, 2025 · Nov 10, 2025 · Nov 10, 2025 · Nov 11, 2025
diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml
@@ -47,7 +47,7 @@ jobs:
     - run: |
         export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
         export TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
-        pip install "paddlepaddle-gpu==3.0.0" -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
+        pip install --find-links "https://www.paddlepaddle.org.cn/packages/nightly/cu126/paddlepaddle-gpu/" --index-url https://pypi.org/simple "paddlepaddle-gpu==3.3.0.dev20251204"
         source/install/uv_with_retry.sh pip install --system -v -e .[gpu,test,lmp,cu12,torch,jax] mpi4py --reinstall-package deepmd-kit
       env:
         DP_VARIANT: cuda
@@ -61,6 +61,7 @@ jobs:
         # See https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html
         XLA_PYTHON_CLIENT_PREALLOCATE: false
         XLA_PYTHON_CLIENT_ALLOCATOR: platform
+        FLAGS_use_stride_compute_kernel: 0
     - name: Convert models
       run: source/tests/infer/convert-models.sh
     - run: |

diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml
@@ -32,7 +32,7 @@ jobs:
         export TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
         export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
         source/install/uv_with_retry.sh pip install --system -e .[test,jax] mpi4py --group pin_jax
-        source/install/uv_with_retry.sh pip install --system --pre "paddlepaddle==3.0.0" -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
+        source/install/uv_with_retry.sh pip install --system --find-links "https://www.paddlepaddle.org.cn/packages/nightly/cpu/paddlepaddle/" --index-url https://pypi.org/simple paddlepaddle==3.3.0.dev20251204
       env:
         # Please note that uv has some issues with finding
         # existing TensorFlow package. Currently, it uses
@@ -60,6 +60,7 @@ jobs:
     - run: pytest --cov=deepmd source/tests  --splits 6 --group ${{ matrix.group }} --store-durations --clean-durations --durations-path=.test_durations --splitting-algorithm least_duration
       env:
         NUM_WORKERS: 0
+        FLAGS_use_stride_compute_kernel: 0
     - name: Test TF2 eager mode
       run: pytest --cov=deepmd --cov-append source/tests/consistent/io/test_io.py source/jax2tf_tests
       env:

diff --git a/deepmd/pd/entrypoints/main.py b/deepmd/pd/entrypoints/main.py
@@ -95,7 +95,7 @@ def get_trainer(
     # Initialize DDP
     world_size = dist.get_world_size()
     if world_size > 1:
-        assert paddle.version.nccl() != "0"
+        assert not paddle.core.is_compiled_with_nccl() or paddle.version.nccl() != "0"
         fleet.init(is_collective=True)
 
     def prepare_trainer_input_single(
@@ -214,7 +214,7 @@ def get_compute_device(self) -> str:
 
     def get_ngpus(self) -> int:
         """Get the number of GPUs."""
-        return paddle.device.cuda.device_count()
+        return paddle.device.device_count()
 
     def get_backend_info(self) -> dict:
         """Get backend information."""

diff --git a/deepmd/pd/utils/auto_batch_size.py b/deepmd/pd/utils/auto_batch_size.py
@@ -36,7 +36,7 @@ def is_gpu_available(self) -> bool:
         bool
             True if GPU is available
         """
-        return paddle.device.cuda.device_count() > 0
+        return paddle.device.device_count() > 0
 
     def is_oom_error(self, e: Exception) -> bool:
         """Check if the exception is an OOM error.
@@ -51,6 +51,6 @@ def is_oom_error(self, e: Exception) -> bool:
         # (the meaningless error message should be considered as a bug in cusolver)
         if isinstance(e, MemoryError) and ("ResourceExhaustedError" in e.args[0]):
             # Release all unoccupied cached memory
-            paddle.device.cuda.empty_cache()
+            paddle.device.empty_cache()
             return True
         return False
diff --git a/deepmd/pd/utils/env.py b/deepmd/pd/utils/env.py
@@ -29,10 +29,10 @@
 # Make sure DDP uses correct device if applicable
 LOCAL_RANK = int(os.environ.get("PADDLE_LOCAL_RANK", 0))
 
-if os.environ.get("DEVICE") == "cpu" or paddle.device.cuda.device_count() <= 0:
+if os.environ.get("DEVICE") == "cpu" or paddle.device.device_count() <= 0:
     DEVICE = "cpu"
 else:
-    DEVICE = f"gpu:{LOCAL_RANK}"
+    DEVICE = paddle.device.get_device()
 
 paddle.device.set_device(DEVICE)
 

diff --git a/deepmd/pd/utils/utils.py b/deepmd/pd/utils/utils.py
@@ -3,6 +3,7 @@
     annotations,
 )
 
+import warnings
 from contextlib import (
     contextmanager,
 )
@@ -345,8 +346,21 @@ def get_generator(
             generator = paddle.framework.core.default_cuda_generator(
                 int(DEVICE.split("gpu:")[1])
             )
+        elif DEVICE == "xpu":
+            generator = paddle.framework.core.default_xpu_generator(0)
+        elif DEVICE.startswith("xpu:"):
+            generator = paddle.framework.core.default_xpu_generator(
+                int(DEVICE.split("xpu:")[1])
+            )
         else:
-            raise ValueError("DEVICE should be cpu or gpu or gpu:x")
+            # Return None for compatibility with devices that have no default generator
+            warnings.warn(
+                f"DEVICE is {DEVICE}, which is not supported. Returning None.",
+                category=UserWarning,
+                stacklevel=2,
+            )
+            return None
+            # raise ValueError("DEVICE should be cpu or gpu or gpu:x or xpu or xpu:x")
         generator.manual_seed(seed)
         return generator
     else:

diff --git a/source/tests/pd/conftest.py b/source/tests/pd/conftest.py
@@ -6,4 +6,5 @@
 @pytest.fixture(scope="package", autouse=True)
 def clear_cuda_memory(request):
     yield
-    paddle.device.cuda.empty_cache()
+    if paddle.device.get_device() != "cpu":
+        paddle.device.empty_cache()
diff --git a/source/tests/pd/test_multitask.py b/source/tests/pd/test_multitask.py
@@ -11,6 +11,7 @@
 )
 
 import numpy as np
+import paddle
 
 from deepmd.pd.entrypoints.main import (
     get_trainer,
@@ -232,8 +233,13 @@ def setUp(self) -> None:
         self.config["model"], self.shared_links = preprocess_shared_params(
             self.config["model"]
         )
+        self.FLAGS_use_stride_kernel = paddle.get_flags("FLAGS_use_stride_kernel")[
+            "FLAGS_use_stride_kernel"
+        ]
+        paddle.set_flags({"FLAGS_use_stride_kernel": False})
 
     def tearDown(self) -> None:
+        paddle.set_flags({"FLAGS_use_stride_kernel": self.FLAGS_use_stride_kernel})
         MultiTaskTrainTest.tearDown(self)
 
 
@@ -271,9 +277,15 @@ def setUp(self) -> None:
         self.config["model"], self.shared_links = preprocess_shared_params(
             self.config["model"]
         )
+        self.config["learning_rate"]["start_lr"] = 1e-5
         self.share_fitting = True
+        self.FLAGS_use_stride_kernel = paddle.get_flags("FLAGS_use_stride_kernel")[
+            "FLAGS_use_stride_kernel"
+        ]
+        paddle.set_flags({"FLAGS_use_stride_kernel": False})
 
     def tearDown(self) -> None:
+        paddle.set_flags({"FLAGS_use_stride_kernel": self.FLAGS_use_stride_kernel})
         MultiTaskTrainTest.tearDown(self)