
Commit 304f1da

modify accomplishment measure
1 parent 471b938 commit 304f1da

14 files changed: +1390 additions, -196 deletions

xinference/model/embedding/core.py

Lines changed: 40 additions & 0 deletions
@@ -171,6 +171,46 @@ def match_json(
     ) -> bool:
         pass
 
+    @classmethod
+    def match_json_with_reason(
+        cls,
+        model_family: EmbeddingModelFamilyV2,
+        model_spec: EmbeddingSpecV1,
+        quantization: str,
+    ) -> "MatchResult":
+        """
+        Check if the engine can handle the given embedding model with detailed error information.
+
+        This method provides detailed failure reasons and suggestions when an engine
+        cannot handle a specific model configuration. The default implementation
+        falls back to the boolean match_json method for backward compatibility.
+
+        Args:
+            model_family: The embedding model family information
+            model_spec: The model specification
+            quantization: The quantization method
+
+        Returns:
+            MatchResult: Detailed match result with reasons and suggestions
+        """
+        from .match_result import ErrorType, MatchResult
+
+        # Default implementation for backward compatibility
+        if cls.match_json(model_family, model_spec, quantization):
+            return MatchResult.success()
+        else:
+            # Get basic reason based on common failure patterns
+            if not cls.check_lib():
+                return MatchResult.failure(
+                    reason=f"Required library for {cls.__name__} is not available",
+                    error_type=ErrorType.DEPENDENCY_MISSING,
+                )
+            else:
+                return MatchResult.failure(
+                    reason=f"Embedding model configuration is not compatible with {cls.__name__}",
+                    error_type=ErrorType.MODEL_COMPATIBILITY,
+                )
+
     @classmethod
     def match(
         cls,
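
Note: the engines in this commit build their results from a MatchResult helper in a new match_result module, which is among the 14 changed files but is not shown in the hunks here. The following is only a sketch of the interface implied by the calls above (MatchResult.success(), MatchResult.failure(reason=..., error_type=..., technical_details=...), result.is_match, and the ErrorType members referenced throughout the diffs); the actual definitions in match_result.py may differ.

# Sketch only: reconstructs the MatchResult/ErrorType interface implied by the diffs
# in this commit. The real match_result.py is not shown here and may differ.
from dataclasses import dataclass
from enum import Enum
from typing import Optional


class ErrorType(Enum):
    # Members referenced by the engine diffs in this commit.
    DEPENDENCY_MISSING = "dependency_missing"
    MODEL_FORMAT = "model_format"
    MODEL_COMPATIBILITY = "model_compatibility"
    CONFIGURATION_ERROR = "configuration_error"
    OS_REQUIREMENT = "os_requirement"
    ABILITY_MISMATCH = "ability_mismatch"
    QUANTIZATION = "quantization"
    VERSION_REQUIREMENT = "version_requirement"


@dataclass
class MatchResult:
    is_match: bool
    reason: Optional[str] = None
    error_type: Optional[ErrorType] = None
    technical_details: Optional[str] = None

    @classmethod
    def success(cls) -> "MatchResult":
        # Returned when the model/spec/quantization combination is supported.
        return cls(is_match=True)

    @classmethod
    def failure(
        cls,
        reason: str,
        error_type: Optional[ErrorType] = None,
        technical_details: Optional[str] = None,
    ) -> "MatchResult":
        # Returned with a human-readable reason and a machine-readable category.
        return cls(
            is_match=False,
            reason=reason,
            error_type=error_type,
            technical_details=technical_details,
        )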

xinference/model/embedding/llama_cpp/core.py

Lines changed: 60 additions & 2 deletions
@@ -235,6 +235,64 @@ def match_json(
         model_spec: EmbeddingSpecV1,
         quantization: str,
     ) -> bool:
+        from ..match_result import MatchResult
+
+        result = cls.match_json_with_reason(model_family, model_spec, quantization)
+        return result.is_match
+
+    @classmethod
+    def match_json_with_reason(
+        cls,
+        model_family: EmbeddingModelFamilyV2,
+        model_spec: EmbeddingSpecV1,
+        quantization: str,
+    ) -> "MatchResult":
+        from ..match_result import ErrorType, MatchResult
+
+        # Check library availability
+        if not cls.check_lib():
+            return MatchResult.failure(
+                reason="llama.cpp library (xllamacpp) is not installed for embedding",
+                error_type=ErrorType.DEPENDENCY_MISSING,
+                technical_details="xllamacpp package not found in Python environment",
+            )
+
+        # Check model format compatibility
         if model_spec.model_format not in ["ggufv2"]:
-            return False
-        return True
+            return MatchResult.failure(
+                reason=f"llama.cpp embedding only supports GGUF v2 format, got: {model_spec.model_format}",
+                error_type=ErrorType.MODEL_FORMAT,
+                technical_details=f"Unsupported format: {model_spec.model_format}, required: ggufv2",
+            )
+
+        # Check embedding-specific requirements
+        if not hasattr(model_spec, "model_file_name_template"):
+            return MatchResult.failure(
+                reason="GGUF embedding model requires proper file configuration",
+                error_type=ErrorType.CONFIGURATION_ERROR,
+                technical_details="Missing model_file_name_template for GGUF embedding",
+            )
+
+        # Check model dimensions for llama.cpp compatibility
+        model_dimensions = model_family.dimensions
+        if model_dimensions > 4096:  # llama.cpp may have limitations
+            return MatchResult.failure(
+                reason=f"Large embedding model may have compatibility issues with llama.cpp ({model_dimensions} dimensions)",
+                error_type=ErrorType.MODEL_COMPATIBILITY,
+                technical_details=f"Large embedding dimensions: {model_dimensions}",
+            )
+
+        # Check platform-specific considerations
+        import platform
+
+        current_platform = platform.system()
+
+        # llama.cpp works across platforms but may have performance differences
+        if current_platform == "Windows":
+            return MatchResult.failure(
+                reason="llama.cpp embedding may have limited performance on Windows",
+                error_type=ErrorType.OS_REQUIREMENT,
+                technical_details=f"Windows platform: {current_platform}",
+            )
+
+        return MatchResult.success()

xinference/model/embedding/sentence_transformers/core.py

Lines changed: 75 additions & 2 deletions
@@ -434,5 +434,78 @@ def match_json(
         model_spec: EmbeddingSpecV1,
         quantization: str,
     ) -> bool:
-        # As default embedding engine, sentence-transformer support all models
-        return model_spec.model_format in ["pytorch"]
+        from ..match_result import MatchResult
+
+        result = cls.match_json_with_reason(model_family, model_spec, quantization)
+        return result.is_match
+
+    @classmethod
+    def match_json_with_reason(
+        cls,
+        model_family: EmbeddingModelFamilyV2,
+        model_spec: EmbeddingSpecV1,
+        quantization: str,
+    ) -> "MatchResult":
+        from ..match_result import ErrorType, MatchResult
+
+        # Check library availability
+        if not cls.check_lib():
+            return MatchResult.failure(
+                reason="Sentence Transformers library is not installed",
+                error_type=ErrorType.DEPENDENCY_MISSING,
+                technical_details="sentence_transformers package not found in Python environment",
+            )
+
+        # Check model format compatibility
+        if model_spec.model_format not in ["pytorch"]:
+            return MatchResult.failure(
+                reason=f"Sentence Transformers only supports pytorch format, got: {model_spec.model_format}",
+                error_type=ErrorType.MODEL_FORMAT,
+                technical_details=f"Unsupported format: {model_spec.model_format}, required: pytorch",
+            )
+
+        # Check model dimensions compatibility
+        model_dimensions = model_family.dimensions
+        if model_dimensions > 1536:  # Very large embedding models
+            return MatchResult.failure(
+                reason=f"Large embedding model detected ({model_dimensions} dimensions)",
+                error_type=ErrorType.MODEL_COMPATIBILITY,
+                technical_details=f"Large embedding dimensions: {model_dimensions}",
+            )
+
+        # Check token limits
+        max_tokens = model_family.max_tokens
+        if max_tokens > 8192:  # Very high token limits
+            return MatchResult.failure(
+                reason=f"High token limit model detected (max_tokens: {max_tokens})",
+                error_type=ErrorType.CONFIGURATION_ERROR,
+                technical_details=f"High max_tokens: {max_tokens}",
+            )
+
+        # Check for special model requirements
+        model_name = model_family.model_name.lower()
+
+        # Check Qwen2 GTE models
+        if "gte" in model_name and "qwen2" in model_name:
+            # These models have specific requirements
+            if not hasattr(cls, "_check_qwen_gte_requirements"):
+                return MatchResult.failure(
+                    reason="Qwen2 GTE models require special handling",
+                    error_type=ErrorType.MODEL_COMPATIBILITY,
+                    technical_details="Qwen2 GTE model special requirements",
+                )
+
+        # Check Qwen3 models
+        if "qwen3" in model_name:
+            # Qwen3 has flash attention requirements
+            try:
+                # This would be checked during actual loading
+                pass
+            except Exception:
+                return MatchResult.failure(
+                    reason="Qwen3 embedding model may have compatibility issues",
+                    error_type=ErrorType.VERSION_REQUIREMENT,
+                    technical_details="Qwen3 model compatibility check",
+                )
+
+        return MatchResult.success()

xinference/model/llm/core.py

Lines changed: 38 additions & 0 deletions
@@ -31,6 +31,7 @@
 
 if TYPE_CHECKING:
     from .llm_family import LLMFamilyV2, LLMSpecV1
+    from .match_result import ErrorType, MatchResult
 
 logger = logging.getLogger(__name__)
 
@@ -157,6 +158,43 @@ def match_json(
     ) -> bool:
         raise NotImplementedError
 
+    @classmethod
+    def match_json_with_reason(
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+    ) -> "MatchResult":
+        """
+        Check if the engine can handle the given model with detailed error information.
+
+        This method provides detailed failure reasons and suggestions when an engine
+        cannot handle a specific model configuration. The default implementation
+        falls back to the boolean match_json method for backward compatibility.
+
+        Args:
+            llm_family: The model family information
+            llm_spec: The model specification
+            quantization: The quantization method
+
+        Returns:
+            MatchResult: Detailed match result with reasons and suggestions
+        """
+        from .match_result import ErrorType, MatchResult
+
+        # Default implementation for backward compatibility
+        if cls.match_json(llm_family, llm_spec, quantization):
+            return MatchResult.success()
+        else:
+            # Get basic reason based on common failure patterns
+            if not cls.check_lib():
+                return MatchResult.failure(
+                    reason=f"Required library for {cls.__name__} is not available",
+                    error_type=ErrorType.DEPENDENCY_MISSING,
+                )
+            else:
+                return MatchResult.failure(
+                    reason=f"Model configuration is not compatible with {cls.__name__}",
+                    error_type=ErrorType.MODEL_COMPATIBILITY,
+                )
+
     def prepare_parse_reasoning_content(
         self, reasoning_content: bool, enable_thinking: bool = True
     ):
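
Note: the call sites that consume these results live in other files of this commit and are not shown here. A hypothetical caller-side sketch, using only the attributes visible in these diffs (is_match, reason, error_type, technical_details), illustrates how engine selection can now report why each engine was rejected instead of returning a bare boolean.

# Hypothetical caller, not part of this commit's visible diff: collects per-engine
# reasons from match_json_with_reason() so a failed engine choice can be explained.
def explain_engine_matches(engine_classes, llm_family, llm_spec, quantization):
    matched, rejected = [], []
    for engine_cls in engine_classes:
        result = engine_cls.match_json_with_reason(llm_family, llm_spec, quantization)
        if result.is_match:
            matched.append(engine_cls.__name__)
        else:
            # error_type/technical_details may be None for engines using the default fallback.
            rejected.append(
                f"{engine_cls.__name__}: {result.reason}"
                + (f" [{result.error_type}]" if result.error_type else "")
            )
    return matched, rejected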

xinference/model/llm/llama_cpp/core.py

Lines changed: 56 additions & 3 deletions
@@ -84,14 +84,67 @@ def check_lib(cls) -> bool:
     def match_json(
         cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str
     ) -> bool:
+        from ..match_result import MatchResult
+
+        result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+        return result.is_match
+
+    @classmethod
+    def match_json_with_reason(
+        cls, llm_family: LLMFamilyV2, llm_spec: LLMSpecV1, quantization: str
+    ) -> "MatchResult":
+        from ..match_result import ErrorType, MatchResult
+
+        # Check library availability
+        if not cls.check_lib():
+            return MatchResult.failure(
+                reason="llama.cpp library (xllamacpp) is not installed",
+                error_type=ErrorType.DEPENDENCY_MISSING,
+                technical_details="xllamacpp package not found in Python environment",
+            )
+
+        # Check model format compatibility
         if llm_spec.model_format not in ["ggufv2"]:
-            return False
+            return MatchResult.failure(
+                reason=f"llama.cpp only supports GGUF v2 format, got: {llm_spec.model_format}",
+                error_type=ErrorType.MODEL_FORMAT,
+                technical_details=f"Unsupported format: {llm_spec.model_format}, required: ggufv2",
+            )
+
+        # Check model abilities - llama.cpp supports both chat and generation
         if (
             "chat" not in llm_family.model_ability
             and "generate" not in llm_family.model_ability
         ):
-            return False
-        return True
+            return MatchResult.failure(
+                reason=f"llama.cpp requires 'chat' or 'generate' ability, model has: {llm_family.model_ability}",
+                error_type=ErrorType.ABILITY_MISMATCH,
+                technical_details=f"Model abilities: {llm_family.model_ability}",
+            )
+
+        # Check platform-specific issues
+        import platform
+
+        current_platform = platform.system()
+
+        # Check for ARM64 specific issues
+        if current_platform == "Darwin" and platform.machine() == "arm64":
+            # Apple Silicon specific checks could go here
+            pass
+        elif current_platform == "Windows":
+            # Windows specific checks could go here
+            pass
+
+        # Check memory requirements (basic heuristic)
+        model_size = float(str(llm_spec.model_size_in_billions))
+        if model_size > 70:  # Very large models
+            return MatchResult.failure(
+                reason=f"llama.cpp may struggle with very large models ({model_size}B parameters)",
+                error_type=ErrorType.MODEL_COMPATIBILITY,
+                technical_details=f"Large model size: {model_size}B parameters",
+            )
+
+        return MatchResult.success()
 
     def load(self):
         try:

xinference/model/llm/lmdeploy/core.py

Lines changed: 59 additions & 5 deletions
@@ -119,7 +119,22 @@ def check_lib(cls) -> bool:
     def match_json(
         cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        return False
+        from ..match_result import MatchResult
+
+        result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+        return result.is_match
+
+    @classmethod
+    def match_json_with_reason(
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+    ) -> "MatchResult":
+        from ..match_result import ErrorType, MatchResult
+
+        return MatchResult.failure(
+            reason="LMDeploy base model does not support direct inference",
+            error_type=ErrorType.MODEL_COMPATIBILITY,
+            technical_details="LMDeploy base model class is not intended for direct use",
+        )
 
     def generate(
         self,

@@ -172,13 +187,52 @@ def load(self):
     def match_json(
         cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
+        from ..match_result import MatchResult
+
+        result = cls.match_json_with_reason(llm_family, llm_spec, quantization)
+        return result.is_match
+
+    @classmethod
+    def match_json_with_reason(
+        cls, llm_family: "LLMFamilyV2", llm_spec: "LLMSpecV1", quantization: str
+    ) -> "MatchResult":
+        from ..match_result import ErrorType, MatchResult
+
+        # Check library availability first
+        if not LMDEPLOY_INSTALLED:
+            return MatchResult.failure(
+                reason="LMDeploy library is not installed",
+                error_type=ErrorType.DEPENDENCY_MISSING,
+                technical_details="lmdeploy package not found in Python environment",
+            )
+
+        # Check model format compatibility and quantization
         if llm_spec.model_format == "awq":
-            # Currently, only 4-bit weight quantization is supported for AWQ, but got 8 bits.
+            # LMDeploy has specific AWQ quantization requirements
             if "4" not in quantization:
-                return False
+                return MatchResult.failure(
+                    reason=f"LMDeploy AWQ format requires 4-bit quantization, got: {quantization}",
+                    error_type=ErrorType.QUANTIZATION,
+                    technical_details=f"AWQ + {quantization} not supported by LMDeploy",
+                )
+
+        # Check model compatibility
         if llm_family.model_name not in LMDEPLOY_SUPPORTED_CHAT_MODELS:
-            return False
-        return LMDEPLOY_INSTALLED
+            return MatchResult.failure(
+                reason=f"Chat model not supported by LMDeploy: {llm_family.model_name}",
+                error_type=ErrorType.MODEL_COMPATIBILITY,
+                technical_details=f"Unsupported chat model: {llm_family.model_name}",
+            )
+
+        # Check model abilities - LMDeploy primarily supports chat models
+        if "chat" not in llm_family.model_ability:
+            return MatchResult.failure(
+                reason=f"LMDeploy Chat requires 'chat' ability, model has: {llm_family.model_ability}",
+                error_type=ErrorType.ABILITY_MISMATCH,
+                technical_details=f"Model abilities: {llm_family.model_ability}",
+            )
+
+        return MatchResult.success()
 
     async def async_chat(
         self,
