7 changes: 6 additions & 1 deletion .github/workflows/reusable-test.yaml
@@ -7,6 +7,11 @@ on:
required: true
type: string
description: 'Command to run tests'
extras:
required: false
type: string
default: ''
description: 'Space-separated --extra flags (e.g., "--extra transformers --extra peft")'

jobs:
test:
@@ -39,7 +44,7 @@ jobs:
- name: Install dependencies for Python ${{ matrix.python-version }}
run: |
uv python pin ${{ matrix.python-version }}
uv sync --group test
uv sync --group test ${{ inputs.extras }}

- name: Run tests
run: |
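Note: the new extras input defaults to an empty string, so existing callers keep today's behavior (`uv sync --group test`); a caller passing, for example, `--extra transformers --extra peft` expands the sync step to `uv sync --group test --extra transformers --extra peft`.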
15 changes: 15 additions & 0 deletions .github/workflows/test-embedder.yaml
@@ -0,0 +1,15 @@
name: test embedder

on:
push:
branches:
- dev
pull_request:

jobs:
test:
uses: ./.github/workflows/reusable-test.yaml
with:
test_command: pytest -n auto tests/embedder/ tests/callback/
extras: --extra sentence-transformers --extra transformers

1 change: 1 addition & 0 deletions .github/workflows/test-inference.yaml
@@ -11,3 +11,4 @@ jobs:
uses: ./.github/workflows/reusable-test.yaml
with:
test_command: pytest -n auto tests/pipeline/test_inference.py
extras: --extra catboost --extra peft --extra transformers --extra sentence-transformers
1 change: 1 addition & 0 deletions .github/workflows/test-optimization.yaml
@@ -11,3 +11,4 @@ jobs:
uses: ./.github/workflows/reusable-test.yaml
with:
test_command: pytest -n auto tests/pipeline/test_optimization.py
extras: --extra catboost --extra peft --extra transformers --extra sentence-transformers
1 change: 1 addition & 0 deletions .github/workflows/test-presets.yaml
@@ -11,3 +11,4 @@ jobs:
uses: ./.github/workflows/reusable-test.yaml
with:
test_command: pytest -n auto tests/pipeline/test_presets.py
extras: --extra catboost --extra peft --extra transformers --extra sentence-transformers
47 changes: 47 additions & 0 deletions .github/workflows/test-scorers.yaml
@@ -0,0 +1,47 @@
name: test scorers

on:
push:
branches:
- dev
pull_request:

jobs:
test:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest ]
python-version: [ "3.10", "3.11", "3.12" ]
dependency-group: [ "base", "transformers", "peft", "catboost" ]
include:
- os: windows-latest
python-version: "3.10"
dependency-group: "base"

steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Cache Hugging Face
id: cache-hf
uses: actions/cache@v4
with:
path: ~/.cache/huggingface
key: ${{ runner.os }}-hf

- name: Install uv
uses: astral-sh/setup-uv@v6
with:
version: "0.8.8"

- name: Install dependencies for Python ${{ matrix.python-version }}
run: |
uv python pin ${{ matrix.python-version }}
uv sync --group test ${{ matrix.dependency-group != 'base' && format('--extra {0}', matrix.dependency-group) || '' }}

- name: Run scorer tests
run: |
uv run pytest -n auto tests/modules/scoring/
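The inline expression in the sync step installs at most one optional dependency set per matrix job: `dependency-group: transformers` evaluates to `--extra transformers`, while `base` short-circuits to an empty string, so the base job syncs only the test group.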

2 changes: 1 addition & 1 deletion .github/workflows/typing.yml
@@ -18,7 +18,7 @@ jobs:
- name: Install dependencies
run: |
uv lock
uv sync --group typing
uv sync --group typing --extra peft --extra sentence-transformers

- name: Run mypy
run: uv run mypy src/autointent
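mypy now runs with the peft and sentence-transformers extras installed, presumably so that imports of those packages in src/autointent resolve against real installed distributions instead of being treated as missing.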
2 changes: 1 addition & 1 deletion .github/workflows/unit-tests.yaml
@@ -10,4 +10,4 @@ jobs:
test:
uses: ./.github/workflows/reusable-test.yaml
with:
test_command: pytest -n auto --ignore=tests/nodes --ignore=tests/pipeline
test_command: pytest -n auto --ignore=tests/modules/scoring/ --ignore=tests/pipeline --ignore=tests/embedder --ignore=tests/callback
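The paths ignored here are exactly the suites now covered by the dedicated workflows above (scoring, pipeline, embedder, callback), which keeps the unit-test job free of the optional heavy dependencies.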
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -51,7 +51,9 @@ dependencies = [
[project.optional-dependencies]
catboost = ["catboost (>=1.2.8,<2.0.0)"]
peft = ["peft (>= 0.10.0, !=0.15.0, !=0.15.1, <1.0.0)"]
transformers = ["transformers (>=4.49.0,<5.0.0)"]
transformers = [
"transformers[torch] (>=4.49.0,<5.0.0)",
]
sentence-transformers = ["sentence-transformers (>=3,<4)"]
dspy = [
"dspy (>=2.6.5,<3.0.0)",
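Switching to transformers[torch] means installing the transformers extra also pulls in transformers' own torch dependency set, so the CI jobs that pass `--extra transformers` presumably get a working PyTorch backend without declaring torch separately.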
1 change: 0 additions & 1 deletion src/autointent/_presets/classic-medium.yaml
@@ -12,7 +12,6 @@ search_space:
k:
low: 1
high: 20
- module_name: catboost
- module_name: sklearn
clf_name: [RandomForestClassifier]
n_estimators: [150]
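Dropping catboost from this preset's search space presumably lets classic-medium run without the optional catboost extra installed.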
2 changes: 2 additions & 0 deletions src/autointent/_wrappers/embedder/__init__.py
@@ -2,12 +2,14 @@

from .base import BaseEmbeddingBackend
from .embedder import Embedder
from .hashing_vectorizer import HashingVectorizerEmbeddingBackend
from .openai import OpenaiEmbeddingBackend
from .sentence_transformers import SentenceTransformerEmbeddingBackend

__all__ = [
"BaseEmbeddingBackend",
"Embedder",
"HashingVectorizerEmbeddingBackend",
"OpenaiEmbeddingBackend",
"SentenceTransformerEmbeddingBackend",
]
12 changes: 11 additions & 1 deletion src/autointent/_wrappers/embedder/embedder.py
@@ -15,10 +15,16 @@
import torch

from autointent.configs import EmbedderFineTuningConfig, TaskTypeEnum
from autointent.configs._embedder import EmbedderConfig, OpenaiEmbeddingConfig, SentenceTransformerEmbeddingConfig
from autointent.configs._embedder import (
EmbedderConfig,
HashingVectorizerEmbeddingConfig,
OpenaiEmbeddingConfig,
SentenceTransformerEmbeddingConfig,
)
from autointent.custom_types import ListOfLabels

from .base import BaseEmbeddingBackend
from .hashing_vectorizer import HashingVectorizerEmbeddingBackend
from .openai import OpenaiEmbeddingBackend
from .sentence_transformers import SentenceTransformerEmbeddingBackend

@@ -52,6 +58,8 @@ def _init_backend(self) -> BaseEmbeddingBackend:
return SentenceTransformerEmbeddingBackend(self.config)
if isinstance(self.config, OpenaiEmbeddingConfig):
return OpenaiEmbeddingBackend(self.config)
if isinstance(self.config, HashingVectorizerEmbeddingConfig):
return HashingVectorizerEmbeddingBackend(self.config)
# Check if it's exactly the abstract base config (not a subclass)

msg = f"Cannot instantiate abstract EmbedderConfig: {self.config.__repr__()}"
@@ -147,6 +155,8 @@ def load(cls, path: Path | str, override_config: EmbedderConfig | None = None) -
instance._backend = SentenceTransformerEmbeddingBackend.load(backend_path) # noqa: SLF001
elif isinstance(config, OpenaiEmbeddingConfig):
instance._backend = OpenaiEmbeddingBackend.load(backend_path) # noqa: SLF001
elif isinstance(config, HashingVectorizerEmbeddingConfig):
instance._backend = HashingVectorizerEmbeddingBackend.load(backend_path) # noqa: SLF001
else:
msg = f"Cannot load abstract EmbedderConfig: {config.__repr__()}"
raise TypeError(msg)
175 changes: 175 additions & 0 deletions src/autointent/_wrappers/embedder/hashing_vectorizer.py
@@ -0,0 +1,175 @@
"""HashingVectorizer-based embedding backend for lightweight testing."""

import json
import logging
from pathlib import Path
from typing import Literal, overload

import numpy as np
import numpy.typing as npt
import torch
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from autointent._hash import Hasher
from autointent.configs import TaskTypeEnum
from autointent.configs._embedder import HashingVectorizerEmbeddingConfig

from .base import BaseEmbeddingBackend

logger = logging.getLogger(__name__)


class HashingVectorizerEmbeddingBackend(BaseEmbeddingBackend):
"""HashingVectorizer-based embedding backend implementation.

This backend uses sklearn's HashingVectorizer for fast, stateless text vectorization.
Ideal for testing as it requires no model downloads and is very fast.
"""

supports_training: bool = False

def __init__(self, config: HashingVectorizerEmbeddingConfig) -> None:
"""Initialize the HashingVectorizer backend.

Args:
config: Configuration for HashingVectorizer embeddings.
"""
self.config = config
self._vectorizer = HashingVectorizer(
n_features=config.n_features,
ngram_range=config.ngram_range,
analyzer=config.analyzer,
lowercase=config.lowercase,
norm=config.norm,
binary=config.binary,
dtype=getattr(np, config.dtype),
)

def clear_ram(self) -> None:
"""Clear the backend from RAM.

HashingVectorizer is stateless, so this is a no-op.
"""

def get_hash(self) -> int:
"""Compute a hash value for the backend.

Returns:
The hash value of the backend.
"""
hasher = Hasher()
# Hash all relevant config parameters
hasher.update(self.config.n_features)
hasher.update(self.config.ngram_range)
hasher.update(self.config.analyzer)
hasher.update(self.config.lowercase)
hasher.update(self.config.norm if self.config.norm is not None else "None")
hasher.update(self.config.binary)
hasher.update(self.config.dtype)
return int(hasher.hexdigest(), 16)

@overload
def embed(
self, utterances: list[str], task_type: TaskTypeEnum | None = None, *, return_tensors: Literal[True]
) -> torch.Tensor: ...

@overload
def embed(
self, utterances: list[str], task_type: TaskTypeEnum | None = None, *, return_tensors: Literal[False] = False
) -> npt.NDArray[np.float32]: ...

def embed(
self,
utterances: list[str],
task_type: TaskTypeEnum | None = None, # noqa: ARG002
return_tensors: bool = False,
) -> npt.NDArray[np.float32] | torch.Tensor:
"""Calculate embeddings for a list of utterances.

Args:
utterances: List of input texts to calculate embeddings for.
task_type: Type of task for which embeddings are calculated (ignored for HashingVectorizer).
return_tensors: If True, return a PyTorch tensor; otherwise, return a numpy array.

Returns:
A numpy array or PyTorch tensor of embeddings.
"""
# Transform texts to sparse matrix, then convert to dense
embeddings_sparse = self._vectorizer.transform(utterances)
embeddings: npt.NDArray[np.float32] = embeddings_sparse.toarray().astype(np.float32)

if return_tensors:
return torch.from_numpy(embeddings)
return embeddings

def similarity(
self, embeddings1: npt.NDArray[np.float32], embeddings2: npt.NDArray[np.float32]
) -> npt.NDArray[np.float32]:
"""Calculate cosine similarity between two sets of embeddings.

Args:
embeddings1: First set of embeddings with shape (n_samples, n_features).
embeddings2: Second set of embeddings with shape (m_samples, n_features).

Returns:
Similarity matrix with shape (n_samples, m_samples).
"""
similarity_matrix: npt.NDArray[np.float32] = cosine_similarity(embeddings1, embeddings2).astype(np.float32)
return similarity_matrix

def dump(self, path: Path) -> None:
"""Save the backend state to disk.

Args:
path: Directory path where the backend should be saved.
"""
path.mkdir(parents=True, exist_ok=True)

# Save a metadata file indicating this is a HashingVectorizer backend
metadata = {
"backend_type": "hashing_vectorizer",
"config": self.config.model_dump(),
}

metadata_path = path / "metadata.json"
with metadata_path.open("w", encoding="utf-8") as f:
json.dump(metadata, f, indent=2)

logger.debug("Saved HashingVectorizer backend to %s", path)

@classmethod
def load(cls, path: Path) -> "HashingVectorizerEmbeddingBackend":
"""Load the backend from disk.

Args:
path: Directory path where the backend is stored.

Returns:
Loaded HashingVectorizerEmbeddingBackend instance.
"""
metadata_path = path / "metadata.json"
with metadata_path.open("r", encoding="utf-8") as f:
metadata = json.load(f)

config = HashingVectorizerEmbeddingConfig.model_validate(metadata["config"])
instance = cls(config)

logger.debug("Loaded HashingVectorizer backend from %s", path)
return instance

def train(self, utterances: list[str], labels: list[int], config) -> None: # type: ignore[no-untyped-def] # noqa: ANN001
"""Train the backend.

HashingVectorizer is stateless and doesn't support training.

Args:
utterances: Training utterances.
labels: Training labels.
config: Training configuration.

Raises:
NotImplementedError: HashingVectorizer doesn't support training.
"""
msg = "HashingVectorizer backend does not support training"
raise NotImplementedError(msg)
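For anyone trying the new backend locally, a minimal usage sketch (not part of the diff). The field names come from the constructor above; that HashingVectorizerEmbeddingConfig accepts them as keyword arguments with these particular values is an assumption:

    # Hypothetical usage sketch; the backend reads n_features, ngram_range,
    # analyzer, lowercase, norm, binary, and dtype from the config.
    from autointent._wrappers.embedder import HashingVectorizerEmbeddingBackend
    from autointent.configs import HashingVectorizerEmbeddingConfig

    config = HashingVectorizerEmbeddingConfig(
        n_features=1024,     # dimensionality of the hashed feature space
        ngram_range=(1, 2),  # unigrams and bigrams
        analyzer="word",
        lowercase=True,
        norm="l2",
        binary=False,
        dtype="float32",     # resolved via getattr(np, config.dtype)
    )
    backend = HashingVectorizerEmbeddingBackend(config)

    # embed() returns a dense (n_samples, n_features) float32 array;
    # nothing is downloaded, which is what makes this backend suit CI.
    emb = backend.embed(["turn on the lights", "set an alarm for 7am"])
    print(backend.similarity(emb, emb))  # (2, 2) cosine-similarity matrix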
2 changes: 2 additions & 0 deletions src/autointent/configs/__init__.py
@@ -2,6 +2,7 @@

from ._embedder import (
EmbedderConfig,
HashingVectorizerEmbeddingConfig,
OpenaiEmbeddingConfig,
SentenceTransformerEmbeddingConfig,
TaskTypeEnum,
@@ -29,6 +30,7 @@
"FaissConfig",
"HFModelConfig",
"HPOConfig",
"HashingVectorizerEmbeddingConfig",
"InferenceNodeConfig",
"LoggingConfig",
"OpenSearchConfig",