From df23e9d1ccb10cbd2e84745df7108ef67fd37599 Mon Sep 17 00:00:00 2001
From: "Hsekumsti@gmail.com" <Hsekumsti@gmail.com>
Date: Fri, 28 Nov 2025 07:11:41 +0530
Subject: [PATCH 1/2] Fix Qdrant cloud indexing: Add keyword index for 'type'
 field during collection creation

---
 src/intugle/core/semantic_search/crud.py | 10 +++++++++-
 src/intugle/core/vector_store/qdrant.py  |  2 ++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/intugle/core/semantic_search/crud.py b/src/intugle/core/semantic_search/crud.py
index c2236a5..6bd26a0 100644
--- a/src/intugle/core/semantic_search/crud.py
+++ b/src/intugle/core/semantic_search/crud.py
@@ -50,7 +50,15 @@ def configuration(self):
             }
 
             embeddings_configurations = {**embeddings_configurations, **config}
 
-        configuration = QdrantVectorConfiguration(vectors_config=embeddings_configurations)
+        # Payload schema with keyword index for "type" field required for filtering
+        payload_schema = {
+            "type": models.PayloadSchemaType.KEYWORD,
+        }
+
+        configuration = QdrantVectorConfiguration(
+            vectors_config=embeddings_configurations,
+            payload_schema=payload_schema
+        )
 
         return configuration
diff --git a/src/intugle/core/vector_store/qdrant.py b/src/intugle/core/vector_store/qdrant.py
index f83301e..3d6eb55 100644
--- a/src/intugle/core/vector_store/qdrant.py
+++ b/src/intugle/core/vector_store/qdrant.py
@@ -21,6 +21,8 @@ class QdrantVectorConfiguration(BaseModel):
 
     sparse_vectors_config: Optional[Mapping[str, qdrant_types.SparseVectorParams]] = None
 
+    payload_schema: Optional[Mapping[str, models.PayloadSchemaType]] = None
+
 
 
 # Used for standardization
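
Note on PATCH 1/2: Qdrant Cloud can reject query filters on payload fields
that have no explicit payload index, so the "type" field gets a KEYWORD
index when the collection is created. A minimal sketch of the same idea
against the qdrant-client API directly (the URL, collection name, vector
size, and the "column" filter value are illustrative, not taken from the
patch):

    from qdrant_client import QdrantClient, models

    # Placeholder Qdrant Cloud credentials.
    client = QdrantClient(url="https://<cluster-url>", api_key="<api-key>")

    # Keyword index on the "type" payload field; this is what the new
    # payload_schema entry declares at collection-creation time.
    client.create_payload_index(
        collection_name="semantic_search",  # illustrative name
        field_name="type",
        field_schema=models.PayloadSchemaType.KEYWORD,
    )

    # With the index in place, filtered vector search on "type" works.
    hits = client.search(
        collection_name="semantic_search",
        query_vector=[0.1] * 384,  # illustrative embedding size
        query_filter=models.Filter(
            must=[models.FieldCondition(key="type", match=models.MatchValue(value="column"))]
        ),
    )
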
From 7644827e799f2d7a1a00580fa83794205249b47a Mon Sep 17 00:00:00 2001
From: "Hsekumsti@gmail.com" <Hsekumsti@gmail.com>
Date: Fri, 28 Nov 2025 07:12:30 +0530
Subject: [PATCH 2/2] Add auto-load datasets from directory feature to
 SemanticModel

---
 src/intugle/semantic_model.py | 59 ++++++++++++++++++++++++++++++++---
 1 file changed, 55 insertions(+), 4 deletions(-)

diff --git a/src/intugle/semantic_model.py b/src/intugle/semantic_model.py
index 31d1af0..ade9106 100644
--- a/src/intugle/semantic_model.py
+++ b/src/intugle/semantic_model.py
@@ -1,6 +1,7 @@
 import logging
+import pathlib
 
-from typing import TYPE_CHECKING, Any, Dict, List
+from typing import TYPE_CHECKING, Any, Dict, List, Union
 
 import pandas as pd
 import yaml
@@ -20,7 +21,7 @@
 
 
 class SemanticModel:
-    def __init__(self, data_input: Dict[str, Any] | List[DataSet], domain: str = ""):
+    def __init__(self, data_input: Union[Dict[str, Any], List[DataSet], str], domain: str = ""):
         self.datasets: Dict[str, DataSet] = {}
         self.links: list[PredictedLink] = []
         self.domain = domain
@@ -30,9 +31,11 @@ def __init__(self, data_input: Dict[str, Any] | List[DataSet], domain: str = "")
             self._initialize_from_dict(data_input)
         elif isinstance(data_input, list):
             self._initialize_from_list(data_input)
+        elif isinstance(data_input, str):
+            self._initialize_from_folder(data_input)
         else:
             raise TypeError(
-                "Input must be a dictionary of named dataframes or a list of DataSet objects."
+                "Input must be a dictionary of named dataframes, a list of DataSet objects, or a string path to a folder."
             )
 
     def _initialize_from_dict(self, data_dict: Dict[str, Any]):
@@ -50,6 +53,55 @@ def _initialize_from_list(self, data_list: List[DataSet]):
             )
             self.datasets[dataset.name] = dataset
 
+    def _initialize_from_folder(self, folder_path: str):
+        """Scans a folder for supported data files (CSV, Parquet, Excel) and loads them as datasets."""
+        folder = pathlib.Path(folder_path)
+
+        if not folder.exists():
+            raise FileNotFoundError(f"Folder path does not exist: {folder_path}")
+
+        if not folder.is_dir():
+            raise NotADirectoryError(f"Path is not a directory: {folder_path}")
+
+        # Extension to DuckDB type mapping
+        extension_mapping = {
+            '.csv': 'csv',
+            '.parquet': 'parquet',
+            '.xlsx': 'xlsx',
+            '.xls': 'xlsx'
+        }
+
+        found_files = False
+        for file_path in folder.iterdir():
+            if file_path.is_file():
+                file_extension = file_path.suffix.lower()
+                if file_extension in extension_mapping:
+                    found_files = True
+
+                    # Use filename without extension as dataset name
+                    dataset_name = file_path.stem
+
+                    # Create DuckDB config for this file
+                    config = {
+                        "path": str(file_path.resolve()),
+                        "type": extension_mapping[file_extension]
+                    }
+
+                    # Create DataSet with DuckDB adapter
+                    dataset = DataSet(config, name=dataset_name)
+                    self.datasets[dataset_name] = dataset
+
+                    console.print(
+                        f"Loaded dataset '{dataset_name}' from {file_path.name}",
+                        style="green"
+                    )
+
+        if not found_files:
+            raise FileNotFoundError(
+                f"No supported files found in {folder_path}. Supported formats: "
+                f"{', '.join(extension_mapping.keys())}"
+            )
+
     def profile(self, force_recreate: bool = False):
         """Run profiling, datatype identification, and key identification for all datasets."""
         console.print(
@@ -262,4 +314,3 @@ def deploy(self, target: str, **kwargs):
                 f"Failed to deploy semantic model to '{target}': {e}", style="bold red"
             )
             raise
-
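
Usage sketch for PATCH 2/2 (the "./data" directory, its file names, and the
domain value are illustrative, not taken from the patch): pointing
SemanticModel at a folder loads every supported file as a dataset named
after the file's stem.

    from intugle.semantic_model import SemanticModel

    # Assuming ./data contains orders.csv and customers.parquet, this
    # creates datasets named "orders" and "customers" via the DuckDB adapter.
    model = SemanticModel("./data", domain="retail")
    model.profile()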