7 changes: 6 additions & 1 deletion .github/workflows/reusable-test.yaml
@@ -7,6 +7,11 @@ on:
required: true
type: string
description: 'Command to run tests'
extras:
required: false
type: string
default: ''
description: 'Space-separated --extra flags (e.g., "--extra transformers --extra peft")'

jobs:
test:
@@ -39,7 +44,7 @@ jobs:
- name: Install dependencies for Python ${{ matrix.python-version }}
run: |
uv python pin ${{ matrix.python-version }}
uv sync --group test
uv sync --group test ${{ inputs.extras }}

- name: Run tests
run: |
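Note: the new extras input defaults to an empty string, so existing callers keep today's behavior (`uv sync --group test`); a caller passing, for example, `--extra transformers --extra peft` expands the sync step to `uv sync --group test --extra transformers --extra peft`.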
15 changes: 15 additions & 0 deletions .github/workflows/test-embedder.yaml
@@ -0,0 +1,15 @@
name: test embedder

on:
push:
branches:
- dev
pull_request:

jobs:
test:
uses: ./.github/workflows/reusable-test.yaml
with:
test_command: pytest -n auto tests/embedder/ tests/callback/
extras: --extra sentence-transformers --extra transformers

1 change: 1 addition & 0 deletions .github/workflows/test-inference.yaml
@@ -11,3 +11,4 @@ jobs:
uses: ./.github/workflows/reusable-test.yaml
with:
test_command: pytest -n auto tests/pipeline/test_inference.py
extras: --extra catboost --extra peft --extra transformers --extra sentence-transformers
1 change: 1 addition & 0 deletions .github/workflows/test-optimization.yaml
@@ -11,3 +11,4 @@ jobs:
uses: ./.github/workflows/reusable-test.yaml
with:
test_command: pytest -n auto tests/pipeline/test_optimization.py
extras: --extra catboost --extra peft --extra transformers --extra sentence-transformers
1 change: 1 addition & 0 deletions .github/workflows/test-presets.yaml
@@ -11,3 +11,4 @@ jobs:
uses: ./.github/workflows/reusable-test.yaml
with:
test_command: pytest -n auto tests/pipeline/test_presets.py
extras: --extra catboost --extra peft --extra transformers --extra sentence-transformers
47 changes: 47 additions & 0 deletions .github/workflows/test-scorers.yaml
@@ -0,0 +1,47 @@
name: test scorers

on:
push:
branches:
- dev
pull_request:

jobs:
test:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest ]
python-version: [ "3.10", "3.11", "3.12" ]
dependency-group: [ "base", "transformers", "peft", "catboost" ]
include:
- os: windows-latest
python-version: "3.10"
dependency-group: "base"

steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Cache Hugging Face
id: cache-hf
uses: actions/cache@v4
with:
path: ~/.cache/huggingface
key: ${{ runner.os }}-hf

- name: Install uv
uses: astral-sh/setup-uv@v6
with:
version: "0.8.8"

- name: Install dependencies for Python ${{ matrix.python-version }}
run: |
uv python pin ${{ matrix.python-version }}
uv sync --group test ${{ matrix.dependency-group != 'base' && format('--extra {0}', matrix.dependency-group) || '' }}

- name: Run scorer tests
run: |
uv run pytest -n auto tests/modules/scoring/
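The inline expression in the sync step installs at most one optional dependency set per matrix job: `dependency-group: transformers` evaluates to `--extra transformers`, while `base` short-circuits to an empty string, so the base job syncs only the test group.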

2 changes: 1 addition & 1 deletion .github/workflows/typing.yml
@@ -18,7 +18,7 @@ jobs:
- name: Install dependencies
run: |
uv lock
uv sync --group typing
uv sync --group typing --extra peft --extra sentence-transformers

- name: Run mypy
run: uv run mypy src/autointent
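mypy now runs with the peft and sentence-transformers extras installed, presumably so that imports of those packages in src/autointent resolve against real installed distributions instead of being treated as missing.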
2 changes: 1 addition & 1 deletion .github/workflows/unit-tests.yaml
@@ -10,4 +10,4 @@ jobs:
test:
uses: ./.github/workflows/reusable-test.yaml
with:
test_command: pytest -n auto --ignore=tests/nodes --ignore=tests/pipeline
test_command: pytest -n auto --ignore=tests/modules/scoring/ --ignore=tests/pipeline --ignore=tests/embedder --ignore=tests/callback
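The paths ignored here are exactly the suites now covered by the dedicated workflows above (scoring, pipeline, embedder, callback), which keeps the unit-test job free of the optional heavy dependencies.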
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -51,7 +51,9 @@ dependencies = [
[project.optional-dependencies]
catboost = ["catboost (>=1.2.8,<2.0.0)"]
peft = ["peft (>= 0.10.0, !=0.15.0, !=0.15.1, <1.0.0)"]
transformers = ["transformers (>=4.49.0,<5.0.0)"]
transformers = [
"transformers[torch] (>=4.49.0,<5.0.0)",
]
sentence-transformers = ["sentence-transformers (>=3,<4)"]
dspy = [
"dspy (>=2.6.5,<3.0.0)",
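Switching to transformers[torch] means installing the transformers extra also pulls in transformers' own torch dependency set, so the CI jobs that pass `--extra transformers` presumably get a working PyTorch backend without declaring torch separately.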
1 change: 0 additions & 1 deletion src/autointent/_presets/classic-medium.yaml
@@ -12,7 +12,6 @@ search_space:
k:
low: 1
high: 20
- module_name: catboost
- module_name: sklearn
clf_name: [RandomForestClassifier]
n_estimators: [150]
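Dropping catboost from this preset's search space presumably lets classic-medium run without the optional catboost extra installed.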
2 changes: 2 additions & 0 deletions src/autointent/_wrappers/embedder/__init__.py
@@ -2,12 +2,14 @@

from .base import BaseEmbeddingBackend
from .embedder import Embedder
from .hashing_vectorizer import HashingVectorizerEmbeddingBackend
from .openai import OpenaiEmbeddingBackend
from .sentence_transformers import SentenceTransformerEmbeddingBackend

__all__ = [
"BaseEmbeddingBackend",
"Embedder",
"HashingVectorizerEmbeddingBackend",
"OpenaiEmbeddingBackend",
"SentenceTransformerEmbeddingBackend",
]
12 changes: 11 additions & 1 deletion src/autointent/_wrappers/embedder/embedder.py
@@ -15,10 +15,16 @@
import torch

from autointent.configs import EmbedderFineTuningConfig, TaskTypeEnum
from autointent.configs._embedder import EmbedderConfig, OpenaiEmbeddingConfig, SentenceTransformerEmbeddingConfig
from autointent.configs._embedder import (
EmbedderConfig,
HashingVectorizerEmbeddingConfig,
OpenaiEmbeddingConfig,
SentenceTransformerEmbeddingConfig,
)
from autointent.custom_types import ListOfLabels

from .base import BaseEmbeddingBackend
from .hashing_vectorizer import HashingVectorizerEmbeddingBackend
from .openai import OpenaiEmbeddingBackend
from .sentence_transformers import SentenceTransformerEmbeddingBackend

@@ -52,6 +58,8 @@ def _init_backend(self) -> BaseEmbeddingBackend:
return SentenceTransformerEmbeddingBackend(self.config)
if isinstance(self.config, OpenaiEmbeddingConfig):
return OpenaiEmbeddingBackend(self.config)
if isinstance(self.config, HashingVectorizerEmbeddingConfig):
return HashingVectorizerEmbeddingBackend(self.config)
# Check if it's exactly the abstract base config (not a subclass)

msg = f"Cannot instantiate abstract EmbedderConfig: {self.config.__repr__()}"
@@ -147,6 +155,8 @@ def load(cls, path: Path | str, override_config: EmbedderConfig | None = None) -
instance._backend = SentenceTransformerEmbeddingBackend.load(backend_path) # noqa: SLF001
elif isinstance(config, OpenaiEmbeddingConfig):
instance._backend = OpenaiEmbeddingBackend.load(backend_path) # noqa: SLF001
elif isinstance(config, HashingVectorizerEmbeddingConfig):
instance._backend = HashingVectorizerEmbeddingBackend.load(backend_path) # noqa: SLF001
else:
msg = f"Cannot load abstract EmbedderConfig: {config.__repr__()}"
raise TypeError(msg)
175 changes: 175 additions & 0 deletions src/autointent/_wrappers/embedder/hashing_vectorizer.py
@@ -0,0 +1,175 @@
"""HashingVectorizer-based embedding backend for lightweight testing."""

import json
import logging
from pathlib import Path
from typing import Literal, overload

import numpy as np
import numpy.typing as npt
import torch
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from autointent._hash import Hasher
from autointent.configs import TaskTypeEnum
from autointent.configs._embedder import HashingVectorizerEmbeddingConfig

from .base import BaseEmbeddingBackend

logger = logging.getLogger(__name__)


class HashingVectorizerEmbeddingBackend(BaseEmbeddingBackend):
"""HashingVectorizer-based embedding backend implementation.

This backend uses sklearn's HashingVectorizer for fast, stateless text vectorization.
Ideal for testing as it requires no model downloads and is very fast.
"""

supports_training: bool = False

def __init__(self, config: HashingVectorizerEmbeddingConfig) -> None:
"""Initialize the HashingVectorizer backend.

Args:
config: Configuration for HashingVectorizer embeddings.
"""
self.config = config
self._vectorizer = HashingVectorizer(
n_features=config.n_features,
ngram_range=config.ngram_range,
analyzer=config.analyzer,
lowercase=config.lowercase,
norm=config.norm,
binary=config.binary,
dtype=getattr(np, config.dtype),
)

def clear_ram(self) -> None:
"""Clear the backend from RAM.

HashingVectorizer is stateless, so this is a no-op.
"""

def get_hash(self) -> int:
"""Compute a hash value for the backend.

Returns:
The hash value of the backend.
"""
hasher = Hasher()
# Hash all relevant config parameters
hasher.update(self.config.n_features)
hasher.update(self.config.ngram_range)
hasher.update(self.config.analyzer)
hasher.update(self.config.lowercase)
hasher.update(self.config.norm if self.config.norm is not None else "None")
hasher.update(self.config.binary)
hasher.update(self.config.dtype)
return int(hasher.hexdigest(), 16)

@overload
def embed(
self, utterances: list[str], task_type: TaskTypeEnum | None = None, *, return_tensors: Literal[True]
) -> torch.Tensor: ...

@overload
def embed(
self, utterances: list[str], task_type: TaskTypeEnum | None = None, *, return_tensors: Literal[False] = False
) -> npt.NDArray[np.float32]: ...

def embed(
self,
utterances: list[str],
task_type: TaskTypeEnum | None = None, # noqa: ARG002
return_tensors: bool = False,
) -> npt.NDArray[np.float32] | torch.Tensor:
"""Calculate embeddings for a list of utterances.

Args:
utterances: List of input texts to calculate embeddings for.
task_type: Type of task for which embeddings are calculated (ignored for HashingVectorizer).
return_tensors: If True, return a PyTorch tensor; otherwise, return a numpy array.

Returns:
A numpy array or PyTorch tensor of embeddings.
"""
# Transform texts to sparse matrix, then convert to dense
embeddings_sparse = self._vectorizer.transform(utterances)
embeddings: npt.NDArray[np.float32] = embeddings_sparse.toarray().astype(np.float32)

if return_tensors:
return torch.from_numpy(embeddings)
return embeddings

def similarity(
self, embeddings1: npt.NDArray[np.float32], embeddings2: npt.NDArray[np.float32]
) -> npt.NDArray[np.float32]:
"""Calculate cosine similarity between two sets of embeddings.

Args:
embeddings1: First set of embeddings with shape (n_samples, n_features).
embeddings2: Second set of embeddings with shape (m_samples, n_features).

Returns:
Similarity matrix with shape (n_samples, m_samples).
"""
similarity_matrix: npt.NDArray[np.float32] = cosine_similarity(embeddings1, embeddings2).astype(np.float32)
return similarity_matrix

def dump(self, path: Path) -> None:
"""Save the backend state to disk.

Args:
path: Directory path where the backend should be saved.
"""
path.mkdir(parents=True, exist_ok=True)

# Save a metadata file indicating this is a HashingVectorizer backend
metadata = {
"backend_type": "hashing_vectorizer",
"config": self.config.model_dump(),
}

metadata_path = path / "metadata.json"
with metadata_path.open("w", encoding="utf-8") as f:
json.dump(metadata, f, indent=2)

logger.debug("Saved HashingVectorizer backend to %s", path)

@classmethod
def load(cls, path: Path) -> "HashingVectorizerEmbeddingBackend":
"""Load the backend from disk.

Args:
path: Directory path where the backend is stored.

Returns:
Loaded HashingVectorizerEmbeddingBackend instance.
"""
metadata_path = path / "metadata.json"
with metadata_path.open("r", encoding="utf-8") as f:
metadata = json.load(f)

config = HashingVectorizerEmbeddingConfig.model_validate(metadata["config"])
instance = cls(config)

logger.debug("Loaded HashingVectorizer backend from %s", path)
return instance

def train(self, utterances: list[str], labels: list[int], config) -> None: # type: ignore[no-untyped-def] # noqa: ANN001
"""Train the backend.

HashingVectorizer is stateless and doesn't support training.

Args:
utterances: Training utterances.
labels: Training labels.
config: Training configuration.

Raises:
NotImplementedError: HashingVectorizer doesn't support training.
"""
msg = "HashingVectorizer backend does not support training"
raise NotImplementedError(msg)
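For anyone trying the new backend locally, a minimal usage sketch (not part of the diff). The field names come from the constructor above; that HashingVectorizerEmbeddingConfig accepts them as keyword arguments with these particular values is an assumption:

    # Hypothetical usage sketch; the backend reads n_features, ngram_range,
    # analyzer, lowercase, norm, binary, and dtype from the config.
    from autointent._wrappers.embedder import HashingVectorizerEmbeddingBackend
    from autointent.configs import HashingVectorizerEmbeddingConfig

    config = HashingVectorizerEmbeddingConfig(
        n_features=1024,     # dimensionality of the hashed feature space
        ngram_range=(1, 2),  # unigrams and bigrams
        analyzer="word",
        lowercase=True,
        norm="l2",
        binary=False,
        dtype="float32",     # resolved via getattr(np, config.dtype)
    )
    backend = HashingVectorizerEmbeddingBackend(config)

    # embed() returns a dense (n_samples, n_features) float32 array;
    # nothing is downloaded, which is what makes this backend suit CI.
    emb = backend.embed(["turn on the lights", "set an alarm for 7am"])
    print(backend.similarity(emb, emb))  # (2, 2) cosine-similarity matrix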
2 changes: 2 additions & 0 deletions src/autointent/configs/__init__.py
@@ -2,6 +2,7 @@

from ._embedder import (
EmbedderConfig,
HashingVectorizerEmbeddingConfig,
OpenaiEmbeddingConfig,
SentenceTransformerEmbeddingConfig,
TaskTypeEnum,
@@ -29,6 +30,7 @@
"FaissConfig",
"HFModelConfig",
"HPOConfig",
"HashingVectorizerEmbeddingConfig",
"InferenceNodeConfig",
"LoggingConfig",
"OpenSearchConfig",