Merged
1 change: 1 addition & 0 deletions tests/ut/attention/test_mla_v1.py
@@ -363,6 +363,7 @@ def test_q_proj_and_k_up_proj(self):
         self.assertEqual(q_pe.shape[1], self.impl.num_heads)
         self.assertEqual(q_pe.shape[2], self.impl.qk_rope_head_dim)
 
+    @patch('vllm_ascend.utils._ENABLE_NZ', True)
     @patch('torch_npu.npu_format_cast')
     def test_process_weights_after_loading(self, mock_format_cast):
         layer = MagicMock(spec=LinearBase)
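
Note on the test-side changes in this PR: is_enable_nz no longer reads the env var on every call but a cached module-level _ENABLE_NZ, so the unit tests patch that global directly. A minimal sketch of the pattern (module path and flag name taken from the diff; the test body itself is illustrative):

    import unittest
    from unittest.mock import patch

    import vllm_ascend.utils as utils


    class TestNZFlag(unittest.TestCase):

        # patch() swaps the module attribute for the duration of the test
        # and restores the previous value afterwards, keeping tests isolated.
        @patch('vllm_ascend.utils._ENABLE_NZ', True)
        def test_nz_enabled(self):
            self.assertTrue(utils.is_enable_nz())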
4 changes: 4 additions & 0 deletions tests/ut/models/test_qwen2_5_vl.py
@@ -1,3 +1,5 @@
+from unittest.mock import patch
+
 import pytest
 import torch
 import torch.nn.functional as F
@@ -365,6 +367,7 @@ def test_pad_qkv_bias(self, mocker: MockerFixture):
         res = attention.pad_qkv_bias(torch.rand((300)))
         assert res.shape[0] == 384
 
+    @patch('vllm_ascend.utils._ENABLE_NZ', True)
     def test_pad_qkv_weight(self, mocker: MockerFixture):
         attention = self.init_vision_transformer(mocker)
         mocker.patch("torch.nn.Module.__setattr__")
@@ -377,6 +380,7 @@ def test_pad_qkv_weight(self, mocker: MockerFixture):
         res = attention.pad_qkv_weight(torch.rand((300, 300)))
         assert res.shape == (384, 300)
 
+    @patch('vllm_ascend.utils._ENABLE_NZ', True)
    def test_pad_proj_weight(self, mocker: MockerFixture):
         attention = self.init_vision_transformer(mocker)
         mocker.patch("torch.nn.Module.__setattr__")
2 changes: 2 additions & 0 deletions tests/ut/quantization/test_w4a8_dynamic.py
@@ -60,6 +60,7 @@ def test_get_pergroup_param(self):
         self.assertEqual(params["scale_bias"].dtype, torch.float32)
         self.assertEqual(params["scale_bias"].shape, (32, 16))
 
+    @patch('vllm_ascend.utils._ENABLE_NZ', True)
     @patch('torch_npu.npu_convert_weight_to_int4pack')
     @patch('torch.Tensor.npu')
     def test_process_weights_after_loading(self, mock_npu,
@@ -260,6 +261,7 @@ def build_layer(self,
                               requires_grad=False)
         return layer
 
+    @patch('vllm_ascend.utils._ENABLE_NZ', True)
     @patch('torch_npu.npu_format_cast')
     @patch('torch_npu.npu_quantize')
     @patch('torch.Tensor.npu')
18 changes: 12 additions & 6 deletions tests/ut/test_utils.py
@@ -40,12 +40,18 @@ def test_is_310p(self):
         self.assertFalse(utils.is_310p())
 
     def test_is_enable_nz(self):
-        with mock.patch("vllm_ascend.utils.envs_ascend.VLLM_ASCEND_ENABLE_NZ",
-                        1):
-            self.assertTrue(utils.is_enable_nz())
-        with mock.patch("vllm_ascend.utils.envs_ascend.VLLM_ASCEND_ENABLE_NZ",
-                        0):
-            self.assertFalse(utils.is_enable_nz())
+        # Case when _ENABLE_NZ is already set
+        utils._ENABLE_NZ = True
+        self.assertTrue(utils.is_enable_nz())
+
+        utils._ENABLE_NZ = False
+        self.assertFalse(utils.is_enable_nz())
+
+        # Case when _ENABLE_NZ is None and vllm_config is not provided
+        utils._ENABLE_NZ = None
+        with self.assertRaises(ValueError) as context:
+            utils.is_enable_nz()
+        self.assertIn("vllm_config must be provided", str(context.exception))
 
     def test_sleep_mode_enabled(self):
         utils._SLEEP_MODE_ENABLED = None
9 changes: 8 additions & 1 deletion tests/ut/worker/test_worker_v1.py
@@ -18,7 +18,13 @@ def setUp(self):
         self.model_config_mock = MagicMock(spec=ModelConfig)
         self.model_config_mock.dtype = torch.float16
         self.model_config_mock.trust_remote_code = False
-        self.model_config_mock.hf_config = None
+
+        self.hf_config_mock = MagicMock()
+        self.hf_config_mock.model_type = "test_model"
+        if hasattr(self.hf_config_mock, 'index_topk'):
+            delattr(self.hf_config_mock, 'index_topk')
+
+        self.model_config_mock.hf_config = self.hf_config_mock
 
         self.parallel_config_mock = MagicMock(spec=ParallelConfig)
 
@@ -244,6 +250,7 @@ def test_sleep_mode_disabled_raises_error(self, mock_sleep_mode_enabled):
 
         self.assertIn("Sleep mode is not enabled", str(cm.exception))
 
+    @patch('vllm_ascend.utils._ENABLE_NZ', False)
     @patch("vllm_ascend.worker.worker_v1.sleep_mode_enabled")
     @patch("vllm_ascend.worker.worker_v1.CaMemAllocator")
     def test_wake_up_mode_enabled(self, mock_allocator_class,
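
One subtlety in the setUp change above: a bare MagicMock fabricates attributes on first access, so hasattr() on it is always True; deleting index_topk up front is what makes hasattr checks in the code under test return False. A small sketch of that behavior:

    from unittest.mock import MagicMock

    m = MagicMock()
    assert hasattr(m, 'index_topk')       # True: the attribute is auto-created
    delattr(m, 'index_topk')
    assert not hasattr(m, 'index_topk')   # deletion is recorded; access now raises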
11 changes: 9 additions & 2 deletions vllm_ascend/utils.py
@@ -59,6 +59,7 @@
 _IS_MOE_MODEL = None
 _ENABLE_SP = None
 _HAS_LAYER_IDX = None
+_ENABLE_NZ = None
 
 
 def is_310p():
@@ -69,8 +70,14 @@ def is_310p():
     return _IS_310P
 
 
-def is_enable_nz():
-    return envs_ascend.VLLM_ASCEND_ENABLE_NZ
+def is_enable_nz(vllm_config: Optional[VllmConfig] = None) -> bool:
+    global _ENABLE_NZ
+    if _ENABLE_NZ is None:
+        if not vllm_config:
+            raise ValueError(
+                "vllm_config must be provided when _ENABLE_NZ is None")
+        _ENABLE_NZ = envs_ascend.VLLM_ASCEND_ENABLE_NZ and vllm_config.model_config.hf_config.model_type != "qwen3_next"
+    return _ENABLE_NZ
 
 
 def sleep_mode_enabled():
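
The new contract for is_enable_nz: the first call must pass a vllm_config, which resolves the flag once (the VLLM_ASCEND_ENABLE_NZ switch, forced off for qwen3_next models) and caches it in _ENABLE_NZ; subsequent calls may omit the config. A self-contained sketch of the same lazy-init pattern, with stand-in names (resolve_flag, env_nz) invented for illustration:

    from typing import Optional

    _CACHED_FLAG: Optional[bool] = None   # stand-in for _ENABLE_NZ


    def resolve_flag(config: Optional[dict] = None) -> bool:
        """First call computes and caches the flag; later calls read the cache."""
        global _CACHED_FLAG
        if _CACHED_FLAG is None:
            if not config:
                raise ValueError("config must be provided on the first call")
            # env switch AND-ed with a model-type check, as in the diff
            _CACHED_FLAG = bool(config.get("env_nz")) and config.get(
                "model_type") != "qwen3_next"
        return _CACHED_FLAG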
3 changes: 2 additions & 1 deletion vllm_ascend/worker/worker_v1.py
@@ -48,7 +48,7 @@
 from vllm_ascend.device_allocator.camem import CaMemAllocator
 from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
 from vllm_ascend.platform import NPUPlatform
-from vllm_ascend.utils import (init_ascend_soc_version,
+from vllm_ascend.utils import (init_ascend_soc_version, is_enable_nz,
                                register_ascend_customop, sleep_mode_enabled,
                                try_register_lib)
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
@@ -81,6 +81,7 @@ def __init__(
         # register patch for vllm
         from vllm_ascend.utils import adapt_patch
         adapt_patch()
+        is_enable_nz(vllm_config)
         # Register ops when worker init.
         from vllm_ascend import ops
         ops.register_dummy_fusion_op()
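
The worker-side call is what primes the cache: is_enable_nz(vllm_config) runs once in the worker's __init__, so later call sites can use the zero-argument form. An illustrative call order, assuming the diff's semantics (the npu_format_cast line is a plausible downstream use, not code from this PR):

    # at worker construction, where the config is in hand
    is_enable_nz(vllm_config)    # resolves and caches _ENABLE_NZ once

    # later, e.g. in weight post-processing, no config available or needed
    if is_enable_nz():
        weight.data = torch_npu.npu_format_cast(weight.data, ACL_FORMAT_FRACTAL_NZ)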