Skip to content

Commit c151a1d

Browse files
committed
Sync chromasql from 4abf1948cdb8ced8924e7450f0d64fbb17592454
1 parent 716d82f commit c151a1d

22 files changed

+481
-106
lines changed

CLI_README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ The CLI provides a similar developer experience to the factory pattern in `adri_
235235

236236
### Import Errors
237237

238-
If you see import errors related to `indexer` or `adri_agents`:
238+
If you see import errors related to `idxr` or `adri_agents`:
239239

240240
- Make sure you're running the CLI from the main project: `poetry run chromasql-server`
241241
- Ensure all dependencies are installed: `poetry install`

CONTRIBUTING.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -145,9 +145,9 @@ clause. ChromaSQL provides a generic `MetadataFieldRouter` adapter:
145145
from pathlib import Path
146146
from chromasql.adapters import MetadataFieldRouter
147147
from chromasql.multi_collection import execute_multi_collection
148-
from indexer.query_lib.async_multi_collection_adapter import AsyncMultiCollectionAdapter
149-
from indexer.vectorize_lib.query_client import AsyncMultiCollectionQueryClient
150-
from indexer.vectorize_lib.query_config import load_query_config
148+
from idxr.query_lib.async_multi_collection_adapter import AsyncMultiCollectionAdapter
149+
from idxr.vectorize_lib.query_client import AsyncMultiCollectionQueryClient
150+
from idxr.vectorize_lib.query_config import load_query_config
151151

152152
# Load your query config (maps discriminator values to collections)
153153
config = load_query_config(Path("query_config.json"))

_ast_nodes.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
"BetweenPredicate",
3333
"LikePredicate",
3434
"ContainsPredicate",
35+
"RegexPredicate",
3536
"BooleanPredicate",
3637
"Query",
3738
]
@@ -136,12 +137,21 @@ class BetweenPredicate(Predicate):
136137
class LikePredicate(Predicate):
137138
field: Field
138139
pattern: str
140+
negated: bool = False
139141

140142

141143
@dataclass(frozen=True)
142144
class ContainsPredicate(Predicate):
143145
field: Field
144146
value: Union[str, int, float, bool]
147+
negated: bool = False
148+
149+
150+
@dataclass(frozen=True)
151+
class RegexPredicate(Predicate):
152+
field: Field
153+
pattern: str
154+
negated: bool = False
145155

146156

147157
@dataclass(frozen=True)

adapters.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
MetadataFieldRouter,
1212
)
1313
from chromasql.multi_collection import execute_multi_collection
14-
from indexer.vectorize_lib.query_client import AsyncMultiCollectionQueryClient
15-
from indexer.query_lib.async_multi_collection_adapter import AsyncMultiCollectionAdapter
14+
from idxr.vectorize_lib.query_client import AsyncMultiCollectionQueryClient
15+
from idxr.query_lib.async_multi_collection_adapter import AsyncMultiCollectionAdapter
1616
1717
# Setup your existing client
1818
client = AsyncMultiCollectionQueryClient(

cli.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ def create_collection_environment_from_config(
136136
Raises:
137137
ValueError: If configuration is invalid or missing required fields
138138
"""
139-
from indexer.models import CollectionEnvironment
139+
from idxr.models import CollectionEnvironment
140140

141141
client_type = config.get("type", "").lower()
142142

@@ -175,12 +175,13 @@ def create_collection_environment_from_config(
175175
module_path = ".".join(rel_path.parts) + ".test_registry:MODEL_REGISTRY"
176176
model_registry_target = module_path
177177
else:
178-
model_registry_target = "indexer.registry:MODEL_REGISTRY"
178+
model_registry_target = "idxr.registry:MODEL_REGISTRY"
179179

180180
embedding_model = config.get("embedding_model", "text-embedding-3-small")
181181
local_collection_name = config.get("collection_name", collection_name)
182182

183183
return CollectionEnvironment(
184+
collection_name=local_collection_name,
184185
query_config_path=query_config_path,
185186
discriminator_field=discriminator_field,
186187
model_registry_target=model_registry_target,
@@ -214,11 +215,18 @@ def create_collection_environment_from_config(
214215

215216
discriminator_field = config.get("discriminator_field", "model_name")
216217
model_registry_target = config.get(
217-
"model_registry_target", "indexer.registry:MODEL_REGISTRY"
218+
"model_registry_target", "idxr.registry:MODEL_REGISTRY"
218219
)
219220
embedding_model = config.get("embedding_model", "text-embedding-3-small")
220221

222+
# For cloud collections, prefer an explicit "collection_name" from the config,
223+
# then the database name, and finally fall back to the provided name
224+
cloud_collection_name = config.get(
225+
"collection_name", database or collection_name
226+
)
227+
221228
return CollectionEnvironment(
229+
collection_name=cloud_collection_name,
222230
query_config_path=query_config_path,
223231
discriminator_field=discriminator_field,
224232
model_registry_target=model_registry_target,

collection_service.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from fastapi import HTTPException
1313
from pydantic import BaseModel
1414

15-
from indexer.models import CollectionEnvironment as ResearchAgentEnvironment
15+
from idxr.models import CollectionEnvironment
1616

1717
try:
1818
from adri_agents.app.utils.uni_vectordb_agent_harness.query_executor_config import (
@@ -21,11 +21,11 @@
2121
)
2222
except ImportError:
2323
# Fallback for standalone chromasql usage
24-
from indexer.query_lib.config import ( # type: ignore[no-redef]
24+
from idxr.query_lib.config import ( # type: ignore[no-redef]
2525
build_query_executor_kwargs,
2626
load_model_registry,
2727
)
28-
from indexer.query_lib.executor import QueryExecutor
28+
from idxr.query_lib.executor import QueryExecutor
2929
from chromasql.errors import (
3030
ChromaSQLParseError,
3131
ChromaSQLPlanningError,
@@ -35,7 +35,7 @@
3535
logger = getLogger(__name__)
3636

3737

38-
# System metadata fields automatically managed by the indexer
38+
# System metadata fields automatically managed by idxr
3939
SYSTEM_METADATA_FIELDS: tuple[tuple[str, str, str], ...] = (
4040
("model_name", "string", "String"),
4141
("source_path", "string", "String"),
@@ -82,7 +82,7 @@ class CollectionService:
8282
1. Retrieving index metadata (for /indices endpoint)
8383
2. Executing ChromaSQL queries (for /chromasql/execute endpoint)
8484
85-
It can work with any ResearchAgentEnvironment, supporting both
85+
It can work with any CollectionEnvironment, supporting both
8686
system collections and user-created collections.
8787
8888
Usage:
@@ -107,7 +107,7 @@ def __init__(
107107
self,
108108
collection_name: str,
109109
display_name: str,
110-
system_env: ResearchAgentEnvironment,
110+
system_env: CollectionEnvironment,
111111
):
112112
"""
113113
Initialize the collection service for a single collection.
@@ -123,13 +123,13 @@ def __init__(
123123

124124
@classmethod
125125
def from_env_map(
126-
cls, env_map: Dict[str, ResearchAgentEnvironment]
126+
cls, env_map: Dict[str, CollectionEnvironment]
127127
) -> "MultiCollectionService":
128128
"""
129129
Create a multi-collection service from an environment map.
130130
131131
Args:
132-
env_map: Map of display_name -> ResearchAgentEnvironment
132+
env_map: Map of display_name -> CollectionEnvironment
133133
134134
Returns:
135135
MultiCollectionService instance managing multiple collections
@@ -477,19 +477,19 @@ class MultiCollectionService:
477477
methods to work with all collections at once.
478478
"""
479479

480-
def __init__(self, env_map: Dict[str, ResearchAgentEnvironment]):
480+
def __init__(self, env_map: Dict[str, CollectionEnvironment]):
481481
"""
482482
Initialize the multi-collection service.
483483
484484
Args:
485-
env_map: Map of collection_name -> ResearchAgentEnvironment
485+
env_map: Map of display_name -> CollectionEnvironment
486486
"""
487487
self.services: Dict[str, CollectionService] = {}
488-
for collection_name, system_env in env_map.items():
489-
# Use collection_name as both internal name and display name
490-
self.services[collection_name] = CollectionService(
491-
collection_name=collection_name,
492-
display_name=collection_name,
488+
for display_name, system_env in env_map.items():
489+
# Key by sanitized collection_name for API lookups, but preserve display_name
490+
self.services[system_env.collection_name] = CollectionService(
491+
collection_name=system_env.collection_name,
492+
display_name=display_name,
493493
system_env=system_env,
494494
)
495495

docs/reference/language-grammar.md

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,25 +52,34 @@ embedding_batch: "BATCH" "(" embedding_batch_item ("," embedding_batch_item)* ")
5252

5353
```
5454
where_clause: "WHERE" predicate
55-
where_document_clause: "WHERE_DOCUMENT" document_predicate
55+
where_document_clause: "WHERE_DOCUMENT" document_predicate_expr
5656
5757
predicate:
5858
or_expr
5959
60-
document_predicate:
60+
document_predicate_expr:
61+
document_or_expr
62+
document_or_expr:
63+
document_and_expr ("OR" document_and_expr)*
64+
document_and_expr:
65+
document_atom ("AND" document_atom)*
66+
document_atom:
67+
| "(" document_predicate_expr ")"
6168
| "CONTAINS" value
6269
| "LIKE" string_literal
70+
| "document" "CONTAINS" value
71+
| "document" "LIKE" string_literal
6372
```
6473

6574
Metadata predicates support:
6675

6776
- Comparisons (`=`, `!=`, `<`, `<=`, `>`, `>=`)
6877
- `IN` / `NOT IN`
6978
- `BETWEEN`
70-
- `LIKE` (`%value%` form)
71-
- `CONTAINS`
7279

73-
Boolean expressions use `AND` / `OR` with parentheses for grouping.
80+
**Note:** `LIKE` and `CONTAINS` are only supported for document predicates (via `WHERE_DOCUMENT`), not for metadata filters. This is a ChromaDB limitation.
81+
82+
Both `WHERE` and `WHERE_DOCUMENT` support boolean expressions with `AND` / `OR` and parentheses for grouping.
7483

7584
## Similarity & TopK
7685

docs/reference/multi-collection-architecture.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -146,9 +146,9 @@ query_config.json maps models → collections
146146
from pathlib import Path
147147
from chromasql.adapters import MetadataFieldRouter
148148
from chromasql.multi_collection import execute_multi_collection
149-
from indexer.query_lib.async_multi_collection_adapter import AsyncMultiCollectionAdapter
150-
from indexer.vectorize_lib.query_client import AsyncMultiCollectionQueryClient
151-
from indexer.vectorize_lib.query_config import load_query_config
149+
from idxr.query_lib.async_multi_collection_adapter import AsyncMultiCollectionAdapter
150+
from idxr.vectorize_lib.query_client import AsyncMultiCollectionQueryClient
151+
from idxr.vectorize_lib.query_config import load_query_config
152152

153153
# Load config
154154
config = load_query_config(Path("output/query_config.json"))

docs/using-chromasql/quick-reference.md

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,23 +33,63 @@ omitting it produces a metadata-only `get`.
3333

3434
### Metadata (`WHERE`)
3535

36-
Supports equality, inequality, numeric comparisons, `IN`/`NOT IN`, `BETWEEN`,
37-
`LIKE` (with `%value%` pattern), and `CONTAINS`.
36+
Supports equality, inequality, numeric comparisons, `IN`/`NOT IN`, and `BETWEEN`.
37+
38+
**Note:** `LIKE` and `CONTAINS` are NOT supported on metadata (ChromaDB limitation).
39+
Use `WHERE_DOCUMENT` for text/pattern matching.
40+
41+
**Note:** `BETWEEN` with mixed int/float types may behave unexpectedly due to ChromaDB type coercion. Use matching types (integer boundaries for integer metadata).
42+
43+
**Important:** Different filter types (metadata vs. document) can **only** be combined with `AND`, not `OR` (ChromaDB limitation). Within each type, use `OR` freely.
3844

3945
```sql
46+
-- Valid: metadata AND document
4047
WHERE metadata.category = 'outerwear'
41-
AND metadata.tags CONTAINS 'waterproof'
48+
AND metadata.price BETWEEN 50 AND 150
49+
AND document CONTAINS 'waterproof'
50+
51+
-- Invalid: metadata OR document ❌
52+
-- WHERE metadata.category = 'outerwear' OR document CONTAINS 'sale'
4253
```
4354

4455
### Document (`WHERE_DOCUMENT`)
4556

46-
Applied after metadata filters. Supports `CONTAINS` and `%value%` `LIKE`
47-
patterns.
57+
Applied after metadata filters. Supports text search operators: `CONTAINS`, `NOT CONTAINS`, `LIKE`, `NOT LIKE`, `REGEX`, `NOT REGEX`.
58+
59+
**These operators ONLY work with WHERE_DOCUMENT**, not with WHERE (metadata).
60+
61+
**Boolean expressions supported**: Use `AND`, `OR`, and parentheses for complex filters.
62+
63+
**Important:** Use `WHERE_DOCUMENT` **once** at the beginning, then combine predicates with boolean operators. Don't repeat `WHERE_DOCUMENT` for each condition.
4864

4965
```sql
66+
-- Simple filter
5067
WHERE_DOCUMENT LIKE '%gore-tex%'
68+
69+
-- OR: Match multiple terms (don't repeat WHERE_DOCUMENT!)
70+
WHERE_DOCUMENT CONTAINS 'waterproof' OR CONTAINS 'breathable'
71+
72+
-- AND: Match all terms
73+
WHERE_DOCUMENT CONTAINS 'outdoor' AND LIKE '%jacket%'
74+
75+
-- Complex: Nested with parentheses
76+
WHERE_DOCUMENT (CONTAINS 'outdoor' AND LIKE '%jacket%') OR CONTAINS 'windproof'
77+
78+
-- Real-world: Multiple organization names
79+
WHERE_DOCUMENT CONTAINS 'BofA' OR CONTAINS 'Bank of America' OR LIKE '%Wells Fargo%'
80+
81+
-- Exclude patterns
82+
WHERE_DOCUMENT NOT LIKE '%test%'
83+
WHERE_DOCUMENT NOT CONTAINS 'deprecated'
84+
85+
-- Regex patterns
86+
WHERE_DOCUMENT REGEX '[a-z]+@[a-z]+\.com' -- Email pattern
87+
WHERE_DOCUMENT REGEX '(?i)python' -- Case-insensitive matching
88+
WHERE_DOCUMENT NOT REGEX '\d{3}-\d{2}-\d{4}' -- Exclude SSN patterns
5189
```
5290

91+
**Note:** Text operators are **case-sensitive**. `WHERE_DOCUMENT CONTAINS 'urgent'` will NOT match "Urgent" or "URGENT". To match several casings, combine predicates with `OR` (`CONTAINS 'urgent' OR CONTAINS 'Urgent'`), or use `REGEX` with the `(?i)` flag: `REGEX '(?i)urgent'`.
92+
5393
## Embedding Clauses
5494

5595
```sql

0 commit comments

Comments
 (0)