diff --git a/dev/README.md b/dev/README.md
index 0efd539d..14731dd4 100644
--- a/dev/README.md
+++ b/dev/README.md
@@ -35,3 +35,26 @@ python dev/generate_cli_docs.py
 - `cocoindex` package must be importable (the CLI module)
 
 This ensures that CLI documentation is always kept in sync with the actual command-line interface.
+
+## Type-checking Examples
+
+We provide a helper script to run mypy on each example entry point individually, with minimal assumptions about optional dependencies.
+
+### `mypy_check_examples.ps1`
+
+Runs mypy for every `main.py` (and `colpali_main.py`) under the `examples/` folder using these rules:
+
+- Only ignore missing imports (no broad suppressions)
+- Avoid type-checking CocoIndex internals by setting `--follow-imports=silent`
+- Make CocoIndex sources discoverable via `MYPYPATH=python`
+
+Usage (Windows PowerShell):
+
+```powershell
+powershell -NoProfile -ExecutionPolicy Bypass -File dev/mypy_check_examples.ps1
+```
+
+Notes:
+
+- Ensure you have a local virtual environment with `mypy` installed (e.g. `.venv` with `pip install mypy`).
+- The script reports any failing example files and exits non-zero on failure.
diff --git a/dev/mypy_check_examples.ps1 b/dev/mypy_check_examples.ps1
new file mode 100644
index 00000000..b2adb538
--- /dev/null
+++ b/dev/mypy_check_examples.ps1
@@ -0,0 +1,34 @@
+$ErrorActionPreference = 'Stop'
+
+# Resolve python in local venv
+$repoRoot = Split-Path -Parent $PSScriptRoot
+$python = Join-Path $repoRoot '.venv\Scripts\python.exe'
+if (-not (Test-Path $python)) {
+    $python = 'python'
+}
+
+# Ensure mypy can resolve local cocoindex package sources
+$env:MYPYPATH = Join-Path $repoRoot 'python'
+
+# Collect example entry files
+$examples = Join-Path $repoRoot 'examples'
+$files = Get-ChildItem -Path $examples -Recurse -File |
+    Where-Object { $_.Name -in @('main.py','colpali_main.py') } |
+    Sort-Object FullName
+
+$failed = @()
+foreach ($f in $files) {
+    Write-Host (">>> Checking " + $f.FullName)
+    & $python -m mypy --ignore-missing-imports --follow-imports=silent $f.FullName
+    if ($LASTEXITCODE -ne 0) {
+        $failed += $f.FullName
+    }
+}
+
+if ($failed.Count -gt 0) {
+    Write-Host "`nFailures:"
+    $failed | ForEach-Object { Write-Host $_ }
+    exit 1
+} else {
+    Write-Host "`nAll example entry files passed mypy."
+}
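The script above targets PowerShell and a Windows-style `.venv` layout. For contributors on other platforms, the same per-file check is easy to approximate; the following is a hypothetical Python sketch (not part of this change) that assumes it would live under `dev/` and that `mypy` is installed in the active environment:

```python
"""Hypothetical cross-platform counterpart to dev/mypy_check_examples.ps1 (sketch only)."""
import os
import subprocess
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parent.parent  # assumes this file sits in dev/


def main() -> int:
    env = dict(os.environ, MYPYPATH=str(REPO_ROOT / "python"))
    # Same entry points the PowerShell script checks.
    files = sorted(
        p
        for p in (REPO_ROOT / "examples").rglob("*.py")
        if p.name in ("main.py", "colpali_main.py")
    )
    failed: list[Path] = []
    for f in files:
        print(f">>> Checking {f}")
        proc = subprocess.run(
            [
                sys.executable,
                "-m",
                "mypy",
                "--ignore-missing-imports",
                "--follow-imports=silent",
                str(f),
            ],
            env=env,
        )
        if proc.returncode != 0:
            failed.append(f)
    if failed:
        print("\nFailures:")
        print("\n".join(str(f) for f in failed))
        return 1
    print("\nAll example entry files passed mypy.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
```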
diff --git a/examples/custom_output_files/main.py b/examples/custom_output_files/main.py
index 5bbfa83d..9a48ad8e 100644
--- a/examples/custom_output_files/main.py
+++ b/examples/custom_output_files/main.py
@@ -4,6 +4,7 @@
 import cocoindex
 
 from markdown_it import MarkdownIt
+from typing import cast
 
 
 _markdown_it = MarkdownIt("gfm-like")
@@ -96,7 +97,7 @@ def mutate(
 
 @cocoindex.op.function()
 def markdown_to_html(text: str) -> str:
-    return _markdown_it.render(text)
+    return cast(str, _markdown_it.render(text))
 
 
 @cocoindex.flow_def(name="CustomOutputFiles")
diff --git a/examples/face_recognition/main.py b/examples/face_recognition/main.py
index cd05c705..b4e04a24 100644
--- a/examples/face_recognition/main.py
+++ b/examples/face_recognition/main.py
@@ -7,6 +7,7 @@
 import face_recognition
 import numpy as np
 from PIL import Image
+from typing import cast
 
 QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6334/")
 QDRANT_COLLECTION = "face_embeddings"
@@ -85,7 +86,7 @@ def extract_face_embedding(
         np.array(img),
         known_face_locations=[(0, img.width - 1, img.height - 1, 0)],
     )[0]
-    return embedding
+    return cast(cocoindex.Vector[cocoindex.Float32], embedding)
 
 
 @cocoindex.flow_def(name="FaceRecognition")
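The `cast` additions in the two diffs above follow one pattern: calls into untyped or loosely typed libraries (`MarkdownIt.render`, `face_recognition.face_encodings`) produce values that mypy sees as `Any` or a bare `ndarray`, and returning those from a function with a declared return type trips strict checks such as `warn_return_any`. `typing.cast` narrows the value for the type checker without any runtime cost. A minimal, self-contained illustration (the function names here are made up):

```python
from typing import Any, cast


def render_markdown(renderer: Any, text: str) -> str:
    # renderer.render() is Any to mypy; cast() narrows it for the checker
    # without performing any runtime conversion or validation.
    return cast(str, renderer.render(text))


def to_float_list(tensor: Any) -> list[float]:
    # Same pattern for array/tensor libraries whose .tolist() is untyped.
    return cast(list[float], tensor.tolist())
```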
diff --git a/examples/fastapi_server_docker/main.py b/examples/fastapi_server_docker/main.py
index 752e2436..f2b7b049 100644
--- a/examples/fastapi_server_docker/main.py
+++ b/examples/fastapi_server_docker/main.py
@@ -6,6 +6,7 @@
 from psycopg_pool import ConnectionPool
 from contextlib import asynccontextmanager
 import os
+from typing import Any, AsyncIterator
 
 
 @cocoindex.transform_flow()
@@ -26,7 +27,7 @@ def text_to_embedding(
 @cocoindex.flow_def(name="MarkdownEmbeddingFastApiExample")
 def markdown_embedding_flow(
     flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
-):
+) -> None:
     """
     Define an example flow that embeds markdown files into a vector database.
     """
@@ -65,7 +66,7 @@ def markdown_embedding_flow(
     )
 
 
-def search(pool: ConnectionPool, query: str, top_k: int = 5):
+def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]:
     # Get the table name, for the export target in the text_embedding_flow above.
     table_name = cocoindex.utils.get_target_default_name(
         markdown_embedding_flow, "doc_embeddings"
@@ -89,7 +90,7 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):
 
 
 @asynccontextmanager
-def lifespan(app: FastAPI):
+async def lifespan(app: FastAPI) -> AsyncIterator[None]:
     load_dotenv()
     cocoindex.init()
     pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
@@ -103,16 +104,19 @@ def lifespan(app: FastAPI):
 fastapi_app = FastAPI(lifespan=lifespan)
 
 
-@fastapi_app.get("/search")
 def search_endpoint(
     request: Request,
     q: str = Query(..., description="Search query"),
     limit: int = Query(5, description="Number of results"),
-):
+) -> dict[str, Any]:
     pool = request.app.state.pool
     results = search(pool, q, limit)
     return {"results": results}
 
 
+# Attach route without using decorator to avoid untyped-decorator when FastAPI types are unavailable
+fastapi_app.get("/search")(search_endpoint)
+
+
 if __name__ == "__main__":
     uvicorn.run(fastapi_app, host="0.0.0.0", port=8080)
diff --git a/examples/gdrive_text_embedding/main.py b/examples/gdrive_text_embedding/main.py
index c9b7b630..746fe024 100644
--- a/examples/gdrive_text_embedding/main.py
+++ b/examples/gdrive_text_embedding/main.py
@@ -3,6 +3,7 @@
 import cocoindex
 import datetime
 import os
+from typing import Any
 
 
 @cocoindex.transform_flow()
@@ -23,7 +24,7 @@ def text_to_embedding(
 @cocoindex.flow_def(name="GoogleDriveTextEmbedding")
 def gdrive_text_embedding_flow(
     flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
-):
+) -> None:
     """
     Define an example flow that embeds text into a vector database.
     """
@@ -71,7 +72,7 @@ def gdrive_text_embedding_flow(
     )
 
 
-def search(pool: ConnectionPool, query: str, top_k: int = 5):
+def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]:
     # Get the table name, for the export target in the gdrive_text_embedding_flow above.
     table_name = cocoindex.utils.get_target_default_name(
         gdrive_text_embedding_flow, "doc_embeddings"
@@ -94,7 +95,7 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):
     ]
 
 
-def _main():
+def _main() -> None:
     # Initialize the database connection pool.
     pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
     # Run queries in a loop to demonstrate the query capabilities.
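The `fastapi_server_docker` diff above also swaps the `@fastapi_app.get(...)` decorator for an explicit registration call. Because the examples are checked with `--ignore-missing-imports`, FastAPI can be untyped from mypy's point of view, and an untyped decorator can make the endpoint function untyped as well; calling `app.get(path)(func)` registers the same route while leaving the function's own annotations intact. A minimal sketch of the pattern:

```python
from typing import Any

from fastapi import FastAPI, Query

app = FastAPI()


def search(q: str = Query(..., description="Search query")) -> dict[str, Any]:
    return {"results": [q]}


# Equivalent to decorating `search` with @app.get("/search"):
# app.get(...) returns the decorator, which is applied explicitly here.
app.get("/search")(search)
```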
diff --git a/examples/image_search/colpali_main.py b/examples/image_search/colpali_main.py
index feec3fab..49ad61b7 100644
--- a/examples/image_search/colpali_main.py
+++ b/examples/image_search/colpali_main.py
@@ -1,7 +1,7 @@
 import datetime
 import os
 from contextlib import asynccontextmanager
-from typing import Any
+from typing import Any, AsyncIterator
 
 import cocoindex
 from dotenv import load_dotenv
@@ -71,7 +71,7 @@ def image_object_embedding_flow(
 
 
 @asynccontextmanager
-async def lifespan(app: FastAPI) -> None:
+async def lifespan(app: FastAPI) -> AsyncIterator[None]:
     load_dotenv()
     cocoindex.init()
     image_object_embedding_flow.setup(report_to_stdout=True)
@@ -100,11 +100,10 @@ async def lifespan(app: FastAPI) -> None:
 
 
 # --- Search API ---
-@app.get("/search")
 def search(
     q: str = Query(..., description="Search query"),
     limit: int = Query(5, description="Number of results"),
-) -> Any:
+) -> dict[str, Any]:
     # Get the multi-vector embedding for the query
     query_embedding = text_to_colpali_embedding.eval(q)
     print(
@@ -132,3 +131,7 @@ def search(
             for result in search_results.points
         ]
     }
+
+
+# Attach route without using decorator to avoid untyped-decorator when FastAPI types are unavailable
+app.get("/search")(search)
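The `lifespan` annotation change above reflects how `contextlib.asynccontextmanager` is typed: it wraps an async generator function, so the declared return type is the `AsyncIterator[None]` it yields from, and the decorator turns that into the async context manager FastAPI expects. A minimal sketch:

```python
from contextlib import asynccontextmanager
from typing import AsyncIterator

from fastapi import FastAPI


@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
    # ... startup work (load env, init clients) ...
    yield  # the application runs while this generator is suspended
    # ... shutdown work (close pools/clients) ...


app = FastAPI(lifespan=lifespan)
```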
""" @@ -52,7 +53,7 @@ def embed_image( inputs = processor(images=image, return_tensors="pt") with torch.no_grad(): features = model.get_image_features(**inputs) - return features[0].tolist() + return cast(CLIPVector, features[0].tolist()) # CocoIndex flow: Ingest images, extract captions, embed, export to Qdrant @@ -112,7 +113,7 @@ def image_object_embedding_flow( @asynccontextmanager -async def lifespan(app: FastAPI) -> None: +async def lifespan(app: FastAPI) -> AsyncIterator[None]: load_dotenv() cocoindex.init() image_object_embedding_flow.setup(report_to_stdout=True) @@ -141,11 +142,10 @@ async def lifespan(app: FastAPI) -> None: # --- Search API --- -@app.get("/search") def search( q: str = Query(..., description="Search query"), limit: int = Query(5, description="Number of results"), -) -> Any: +) -> dict[str, Any]: # Get the embedding for the query query_embedding = embed_query(q) @@ -169,3 +169,7 @@ def search( for result in search_results ] } + + +# Attach route without using decorator to avoid untyped-decorator when FastAPI types are unavailable +app.get("/search")(search) diff --git a/examples/manuals_llm_extraction/main.py b/examples/manuals_llm_extraction/main.py index 76927226..c81f6c1b 100644 --- a/examples/manuals_llm_extraction/main.py +++ b/examples/manuals_llm_extraction/main.py @@ -5,6 +5,7 @@ from marker.models import create_model_dict from marker.output import text_from_rendered from marker.config.parser import ConfigParser +from typing import cast import cocoindex @@ -20,7 +21,7 @@ class PdfToMarkdownExecutor: spec: PdfToMarkdown _converter: PdfConverter - def prepare(self): + def prepare(self) -> None: config_parser = ConfigParser({}) self._converter = PdfConverter( create_model_dict(), config=config_parser.generate_config_dict() @@ -30,8 +31,8 @@ def __call__(self, content: bytes) -> str: with tempfile.NamedTemporaryFile(delete=True, suffix=".pdf") as temp_file: temp_file.write(content) temp_file.flush() - text, _, _ = text_from_rendered(self._converter(temp_file.name)) - return text + text_any, _, _ = text_from_rendered(self._converter(temp_file.name)) + return cast(str, text_any) @dataclasses.dataclass @@ -90,7 +91,7 @@ def summarize_module(module_info: ModuleInfo) -> ModuleSummary: @cocoindex.flow_def(name="ManualExtraction") def manual_extraction_flow( flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope -): +) -> None: """ Define an example flow that extracts manual information from a Markdown. 
""" diff --git a/examples/paper_metadata/main.py b/examples/paper_metadata/main.py index 195f472a..b1235274 100644 --- a/examples/paper_metadata/main.py +++ b/examples/paper_metadata/main.py @@ -9,6 +9,7 @@ from marker.models import create_model_dict from marker.output import text_from_rendered from functools import cache +from typing import cast from pypdf import PdfReader, PdfWriter @@ -66,8 +67,8 @@ def pdf_to_markdown(content: bytes) -> str: with tempfile.NamedTemporaryFile(delete=True, suffix=".pdf") as temp_file: temp_file.write(content) temp_file.flush() - text, _, _ = text_from_rendered(get_marker_converter()(temp_file.name)) - return text + text_any, _, _ = text_from_rendered(get_marker_converter()(temp_file.name)) + return cast(str, text_any) @cocoindex.flow_def(name="PaperMetadata") diff --git a/examples/patient_intake_extraction/main.py b/examples/patient_intake_extraction/main.py index fd7ec21b..635a7fc4 100644 --- a/examples/patient_intake_extraction/main.py +++ b/examples/patient_intake_extraction/main.py @@ -5,6 +5,7 @@ from markitdown import MarkItDown from openai import OpenAI +from typing import cast import cocoindex @@ -97,7 +98,7 @@ class ToMarkdownExecutor: spec: ToMarkdown _converter: MarkItDown - def prepare(self): + def prepare(self) -> None: client = OpenAI() self._converter = MarkItDown(llm_client=client, llm_model="gpt-4o") @@ -106,14 +107,15 @@ def __call__(self, content: bytes, filename: str) -> str: with tempfile.NamedTemporaryFile(delete=True, suffix=suffix) as temp_file: temp_file.write(content) temp_file.flush() - text = self._converter.convert(temp_file.name).text_content + text_any = self._converter.convert(temp_file.name).text_content + text: str = cast(str, text_any) return text @cocoindex.flow_def(name="PatientIntakeExtraction") def patient_intake_extraction_flow( flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope -): +) -> None: """ Define a flow that extracts patient information from intake forms. 
""" diff --git a/examples/pdf_elements_embedding/main.py b/examples/pdf_elements_embedding/main.py index 23f287c9..19fd5e0d 100644 --- a/examples/pdf_elements_embedding/main.py +++ b/examples/pdf_elements_embedding/main.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from pypdf import PdfReader from transformers import CLIPModel, CLIPProcessor -from typing import Literal +from typing import Literal, TypeAlias, Final, cast QDRANT_GRPC_URL = "http://localhost:6334" @@ -15,8 +15,8 @@ QDRANT_COLLECTION_TEXT = "PdfElementsEmbeddingText" CLIP_MODEL_NAME = "openai/clip-vit-large-patch14" -CLIP_MODEL_DIMENSION = 768 -ClipVectorType = cocoindex.Vector[cocoindex.Float32, Literal[CLIP_MODEL_DIMENSION]] +CLIP_MODEL_DIMENSION: Final = 768 +ClipVectorType: TypeAlias = cocoindex.Vector[cocoindex.Float32, Literal[768]] IMG_THUMBNAIL_SIZE = (512, 512) @@ -38,7 +38,7 @@ def clip_embed_image(img_bytes: bytes) -> ClipVectorType: inputs = processor(images=image, return_tensors="pt") with torch.no_grad(): features = model.get_image_features(**inputs) - return features[0].tolist() + return cast(ClipVectorType, features[0].tolist()) def clip_embed_query(text: str) -> ClipVectorType: @@ -49,7 +49,7 @@ def clip_embed_query(text: str) -> ClipVectorType: inputs = processor(text=[text], return_tensors="pt", padding=True) with torch.no_grad(): features = model.get_text_features(**inputs) - return features[0].tolist() + return cast(ClipVectorType, features[0].tolist()) @cocoindex.transform_flow() diff --git a/examples/pdf_embedding/main.py b/examples/pdf_embedding/main.py index 4bbe4564..c1814a4a 100644 --- a/examples/pdf_embedding/main.py +++ b/examples/pdf_embedding/main.py @@ -9,6 +9,7 @@ from marker.output import text_from_rendered from psycopg_pool import ConnectionPool from jinja2 import Template +from typing import Any, cast class PdfToMarkdown(cocoindex.op.FunctionSpec): @@ -22,7 +23,7 @@ class PdfToMarkdownExecutor: spec: PdfToMarkdown _converter: PdfConverter - def prepare(self): + def prepare(self) -> None: config_parser = ConfigParser({}) self._converter = PdfConverter( create_model_dict(), config=config_parser.generate_config_dict() @@ -32,7 +33,8 @@ def __call__(self, content: bytes) -> str: with tempfile.NamedTemporaryFile(delete=True, suffix=".pdf") as temp_file: temp_file.write(content) temp_file.flush() - text, _, _ = text_from_rendered(self._converter(temp_file.name)) + text_any, _, _ = text_from_rendered(self._converter(temp_file.name)) + text: str = cast(str, text_any) return text @@ -54,7 +56,7 @@ def text_to_embedding( @cocoindex.flow_def(name="PdfEmbedding") def pdf_embedding_flow( flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope -): +) -> None: """ Define an example flow that embeds files into a vector database. """ @@ -96,7 +98,7 @@ def pdf_embedding_flow( ) -def search(pool: ConnectionPool, query: str, top_k: int = 5): +def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]: # Get the table name, for the export target in the pdf_embedding_flow above. table_name = cocoindex.utils.get_target_default_name( pdf_embedding_flow, "pdf_embeddings" @@ -130,7 +132,7 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5): """) -def _main(): +def _main() -> None: # Initialize the database connection pool. pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL")) # Run queries in a loop to demonstrate the query capabilities. 
diff --git a/examples/postgres_source/main.py b/examples/postgres_source/main.py
index deef6172..5e752ea2 100644
--- a/examples/postgres_source/main.py
+++ b/examples/postgres_source/main.py
@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any, cast
 
 import os
 import datetime
@@ -134,7 +134,7 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, A
         """,
         (query_vector, top_k),
     )
-    return cur.fetchall()
+    return cast(list[dict[str, Any]], cur.fetchall())
 
 
 def _main() -> None:
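In the `postgres_source` change, mypy sees `cur.fetchall()` as returning untyped rows, so the declared `list[dict[str, Any]]` is asserted with a cast, mirroring the `search()` return annotations added in the other Postgres-backed examples. A rough sketch of the shape of such a helper, assuming psycopg's `dict_row` row factory (the query and table name here are illustrative):

```python
from typing import Any, cast

from psycopg.rows import dict_row
from psycopg_pool import ConnectionPool


def search_rows(pool: ConnectionPool, table_name: str, top_k: int = 5) -> list[dict[str, Any]]:
    """Illustrative helper: fetch rows as dicts with a declared return type."""
    with pool.connection() as conn:
        with conn.cursor(row_factory=dict_row) as cur:
            cur.execute(f"SELECT * FROM {table_name} LIMIT %s", (top_k,))
            # The dict_row factory guarantees dict rows at runtime; cast() records
            # that fact for the declared return type when mypy cannot infer it.
            return cast(list[dict[str, Any]], cur.fetchall())
```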