diff --git a/.github/prompts/testcov.prompt.md b/.github/prompts/testcov.prompt.md new file mode 100644 index 0000000000..76a318deb9 --- /dev/null +++ b/.github/prompts/testcov.prompt.md @@ -0,0 +1,27 @@ +--- +agent: agent +--- + +The goal is for the tests to cover all lines of code. + +Generate a coverage report with: + +pytest --cov --cov-report=annotate:cov_annotate + +If you are checking for coverage of a specific module, you can specify it like this: + +pytest --cov=your_module_name --cov-report=annotate:cov_annotate + +You can also specify specific tests to run, for example: + +pytest tests/test_your_module.py --cov=your_module_name --cov-report=annotate:cov_annotate + +Open the cov_annotate directory to view the annotated source code. +There will be one file per source file. If a file has 100% source coverage, it means all lines are covered by tests, so you do not need to open the file. + +For each file that has less than 100% test coverage, find the matching file in cov_annotate and review the file. + +If a line starts with a ! (exclamation mark), it means that the line is not covered by tests. +Add tests to cover the missing lines. + +Keep running the tests and improving coverage until all lines are covered. 
diff --git a/.gitignore b/.gitignore index 05bbf3b060..0102334ea2 100644 --- a/.gitignore +++ b/.gitignore @@ -148,6 +148,8 @@ npm-debug.log* node_modules static/ +app/functions/*/prepdocslib/ + data/**/*.md5 .DS_Store diff --git a/AGENTS.md b/AGENTS.md index dc5b4faaa6..58e1388fc7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -17,7 +17,30 @@ If necessary, edit this file to ensure it accurately reflects the current state * app/backend/approaches/prompts/chat_query_rewrite.prompty: Prompt used to rewrite the query based off search history into a better search query * app/backend/approaches/prompts/chat_query_rewrite_tools.json: Tools used by the query rewriting prompt * app/backend/approaches/prompts/chat_answer_question.prompty: Prompt used by the Chat approach to actually answer the question based off sources + * app/backend/prepdocslib: Contains the document ingestion library used by both local and cloud ingestion + * app/backend/prepdocslib/blobmanager.py: Manages uploads to Azure Blob Storage + * app/backend/prepdocslib/cloudingestionstrategy.py: Builds the Azure AI Search indexer and skillset for the cloud ingestion pipeline + * app/backend/prepdocslib/csvparser.py: Parses CSV files + * app/backend/prepdocslib/embeddings.py: Generates embeddings for text and images using Azure OpenAI + * app/backend/prepdocslib/figureprocessor.py: Generates figure descriptions for both local ingestion and the cloud figure-processor skill + * app/backend/prepdocslib/fileprocessor.py: Orchestrates parsing and chunking of individual files + * app/backend/prepdocslib/filestrategy.py: Strategy for uploading and indexing files (local ingestion) + * app/backend/prepdocslib/htmlparser.py: Parses HTML files + * app/backend/prepdocslib/integratedvectorizerstrategy.py: Strategy using Azure AI Search integrated vectorization + * app/backend/prepdocslib/jsonparser.py: Parses JSON files + * app/backend/prepdocslib/listfilestrategy.py: Lists files from local filesystem or Azure Data Lake + * 
app/backend/prepdocslib/mediadescriber.py: Interfaces for describing images (Azure OpenAI GPT-4o, Content Understanding) + * app/backend/prepdocslib/page.py: Data classes for pages, images, and chunks + * app/backend/prepdocslib/parser.py: Base parser interface + * app/backend/prepdocslib/pdfparser.py: Parses PDFs using Azure Document Intelligence or local parser + * app/backend/prepdocslib/searchmanager.py: Manages Azure AI Search index creation and updates + * app/backend/prepdocslib/servicesetup.py: Shared service setup helpers for OpenAI, embeddings, blob storage, etc. + * app/backend/prepdocslib/strategy.py: Base strategy interface for document ingestion + * app/backend/prepdocslib/textparser.py: Parses plain text and markdown files + * app/backend/prepdocslib/textprocessor.py: Processes text chunks for cloud ingestion (merges figures, generates embeddings) + * app/backend/prepdocslib/textsplitter.py: Splits text into chunks using different strategies * app/backend/app.py: The main entry point for the backend application. + * app/functions: Azure Functions used for cloud ingestion custom skills (document extraction, figure processing, text processing). Each function bundles a synchronized copy of `prepdocslib`; run `python scripts/copy_prepdocslib.py` to refresh the local copies if you modify the library. * app/frontend: Contains the React frontend code, built with TypeScript, built with vite. * app/frontend/src/api: Contains the API client code for communicating with the backend. * app/frontend/src/components: Contains the React components for the frontend. @@ -65,7 +88,7 @@ When adding a new developer setting, update: * app/backend/approaches/retrievethenread.py : Retrieve from overrides parameter * app/backend/app.py: Some settings may need to be sent down in the /config route. -## When adding tests for a new feature: +## When adding tests for a new feature All tests are in the `tests` folder and use the pytest framework. 
There are three styles of tests: @@ -124,3 +147,37 @@ cd scripts && mypy . --config-file=../pyproject.toml Note that we do not currently enforce type hints in the tests folder, as it would require adding a lot of `# type: ignore` comments to the existing tests. We only enforce type hints in the main application code and scripts. + +## Python code style + +Do not use single underscores in front of "private" methods or variables in Python code. We do not follow that convention in this codebase, since this is an application and not a library. + +## Deploying the application + +To deploy the application, use the `azd` CLI tool. Make sure you have the latest version of the `azd` CLI installed. Then, run the following command from the root of the repository: + +```shell +azd up +``` + +That command will BOTH provision the Azure resources AND deploy the application code. + +If you only changed the Bicep templates and want to re-provision the Azure resources, run: + +```shell +azd provision +``` + +If you only changed the application code and want to re-deploy the code, run: + +```shell +azd deploy +``` + +If you are using cloud ingestion and only want to deploy individual functions, run the necessary deploy commands, for example: + +```shell +azd deploy document-extractor +azd deploy figure-processor +azd deploy text-processor +``` diff --git a/README.md b/README.md index e7a8aa2aac..181573b13b 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ The repo includes sample data so it's ready to try end to end. 
In this sample ap - Chat (multi-turn) and Q&A (single turn) interfaces - Renders citations and thought process for each answer - Includes settings directly in the UI to tweak the behavior and experiment with options -- Integrates Azure AI Search for indexing and retrieval of documents, with support for [many document formats](/docs/data_ingestion.md#supported-document-formats) as well as [integrated vectorization](/docs/data_ingestion.md#overview-of-integrated-vectorization) +- Integrates Azure AI Search for indexing and retrieval of documents, with support for [many document formats](/docs/data_ingestion.md#supported-document-formats) as well as [cloud-based data ingestion](/docs/data_ingestion.md#cloud-based-ingestion) - Optional usage of [multimodal models](/docs/multimodal.md) to reason over image-heavy documents - Optional addition of [speech input/output](/docs/deploy_features.md#enabling-speech-inputoutput) for accessibility - Optional automation of [user login and data access](/docs/login_and_acl.md) via Microsoft Entra diff --git a/app/backend/app.py b/app/backend/app.py index d0771d73f1..2bf1c16697 100644 --- a/app/backend/app.py +++ b/app/backend/app.py @@ -467,6 +467,7 @@ async def setup_clients(): USE_CHAT_HISTORY_BROWSER = os.getenv("USE_CHAT_HISTORY_BROWSER", "").lower() == "true" USE_CHAT_HISTORY_COSMOS = os.getenv("USE_CHAT_HISTORY_COSMOS", "").lower() == "true" USE_AGENTIC_RETRIEVAL = os.getenv("USE_AGENTIC_RETRIEVAL", "").lower() == "true" + USE_VECTORS = os.getenv("USE_VECTORS", "").lower() != "false" # WEBSITE_HOSTNAME is always set by App Service, RUNNING_IN_PRODUCTION is set in main.bicep RUNNING_ON_AZURE = os.getenv("WEBSITE_HOSTNAME") is not None or os.getenv("RUNNING_IN_PRODUCTION") is not None @@ -582,7 +583,7 @@ async def setup_clients(): current_app.config[CONFIG_USER_BLOB_MANAGER] = user_blob_manager # Set up ingester - file_processors = setup_file_processors( + file_processors, figure_processor = setup_file_processors( 
azure_credential=azure_credential, document_intelligence_service=os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE"), local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER", "").lower() == "true", @@ -594,18 +595,21 @@ async def setup_clients(): openai_model=OPENAI_CHATGPT_MODEL, openai_deployment=AZURE_OPENAI_CHATGPT_DEPLOYMENT if OPENAI_HOST == OpenAIHost.AZURE else None, ) - search_info = await setup_search_info( + search_info = setup_search_info( search_service=AZURE_SEARCH_SERVICE, index_name=AZURE_SEARCH_INDEX, azure_credential=azure_credential ) - text_embeddings_service = setup_embeddings_service( - open_ai_client=openai_client, - openai_host=OPENAI_HOST, - emb_model_name=OPENAI_EMB_MODEL, - emb_model_dimensions=OPENAI_EMB_DIMENSIONS, - azure_openai_deployment=AZURE_OPENAI_EMB_DEPLOYMENT, - azure_openai_endpoint=azure_openai_endpoint, - disable_vectors=os.getenv("USE_VECTORS", "").lower() == "false", - ) + + text_embeddings_service = None + if USE_VECTORS: + text_embeddings_service = setup_embeddings_service( + open_ai_client=openai_client, + openai_host=OPENAI_HOST, + emb_model_name=OPENAI_EMB_MODEL, + emb_model_dimensions=OPENAI_EMB_DIMENSIONS, + azure_openai_deployment=AZURE_OPENAI_EMB_DEPLOYMENT, + azure_openai_endpoint=azure_openai_endpoint, + ) + image_embeddings_service = setup_image_embeddings_service( azure_credential=azure_credential, vision_endpoint=AZURE_VISION_ENDPOINT, @@ -618,6 +622,7 @@ async def setup_clients(): image_embeddings=image_embeddings_service, search_field_name_embedding=AZURE_SEARCH_FIELD_NAME_EMBEDDING, blob_manager=user_blob_manager, + figure_processor=figure_processor, ) current_app.config[CONFIG_INGESTER] = ingester @@ -640,7 +645,7 @@ async def setup_clients(): OPENAI_CHATGPT_MODEL not in Approach.GPT_REASONING_MODELS or Approach.GPT_REASONING_MODELS[OPENAI_CHATGPT_MODEL].streaming ) - current_app.config[CONFIG_VECTOR_SEARCH_ENABLED] = os.getenv("USE_VECTORS", "").lower() != "false" + current_app.config[CONFIG_VECTOR_SEARCH_ENABLED] 
= bool(USE_VECTORS) current_app.config[CONFIG_USER_UPLOAD_ENABLED] = bool(USE_USER_UPLOAD) current_app.config[CONFIG_LANGUAGE_PICKER_ENABLED] = ENABLE_LANGUAGE_PICKER current_app.config[CONFIG_SPEECH_INPUT_ENABLED] = USE_SPEECH_INPUT_BROWSER diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index 254b3d64eb..df23e80542 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -2,53 +2,40 @@ import asyncio import logging import os -from collections.abc import Awaitable, Callable -from enum import Enum from typing import Optional import aiohttp -from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential -from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider +from azure.identity.aio import AzureDeveloperCliCredential from openai import AsyncOpenAI from rich.logging import RichHandler from load_azd_env import load_azd_env -from prepdocslib.blobmanager import BlobManager -from prepdocslib.csvparser import CsvParser -from prepdocslib.embeddings import ImageEmbeddings, OpenAIEmbeddings -from prepdocslib.fileprocessor import FileProcessor from prepdocslib.filestrategy import FileStrategy -from prepdocslib.htmlparser import LocalHTMLParser from prepdocslib.integratedvectorizerstrategy import ( IntegratedVectorizerStrategy, ) -from prepdocslib.jsonparser import JsonParser from prepdocslib.listfilestrategy import ( ADLSGen2ListFileStrategy, ListFileStrategy, LocalListFileStrategy, ) -from prepdocslib.parser import Parser -from prepdocslib.pdfparser import ( - DocumentAnalysisParser, - LocalPdfParser, - MediaDescriptionStrategy, +from prepdocslib.servicesetup import ( + OpenAIHost, + build_file_processors, + clean_key_if_exists, + setup_blob_manager, + setup_embeddings_service, + setup_figure_processor, + setup_image_embeddings_service, + setup_openai_client, + setup_search_info, ) -from prepdocslib.strategy import DocumentAction, SearchInfo, Strategy -from 
prepdocslib.textparser import TextParser -from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter +from prepdocslib.strategy import DocumentAction, Strategy logger = logging.getLogger("scripts") -def clean_key_if_exists(key: Optional[str]) -> Optional[str]: - """Remove leading and trailing whitespace from a key if it exists. If the key is empty, return None.""" - if key is not None and key.strip() != "": - return key.strip() - return None - - async def check_search_service_connectivity(search_service: str) -> bool: """Check if the search service is accessible by hitting the /ping endpoint.""" ping_url = f"https://{search_service}.search.windows.net/ping" @@ -62,61 +49,6 @@ async def check_search_service_connectivity(search_service: str) -> bool: return False -async def setup_search_info( - search_service: str, - index_name: str, - azure_credential: AsyncTokenCredential, - use_agentic_retrieval: Optional[bool] = None, - azure_openai_endpoint: Optional[str] = None, - agent_name: Optional[str] = None, - agent_max_output_tokens: Optional[int] = None, - azure_openai_searchagent_deployment: Optional[str] = None, - azure_openai_searchagent_model: Optional[str] = None, - search_key: Optional[str] = None, - azure_vision_endpoint: Optional[str] = None, -) -> SearchInfo: - search_creds: AsyncTokenCredential | AzureKeyCredential = ( - azure_credential if search_key is None else AzureKeyCredential(search_key) - ) - if use_agentic_retrieval and azure_openai_searchagent_model is None: - raise ValueError("Azure OpenAI SearchAgent model must be specified when using agentic retrieval.") - - return SearchInfo( - endpoint=f"https://{search_service}.search.windows.net/", - credential=search_creds, - index_name=index_name, - agent_name=agent_name, - agent_max_output_tokens=agent_max_output_tokens, - use_agentic_retrieval=use_agentic_retrieval, - azure_openai_endpoint=azure_openai_endpoint, - azure_openai_searchagent_model=azure_openai_searchagent_model, - 
azure_openai_searchagent_deployment=azure_openai_searchagent_deployment, - azure_vision_endpoint=azure_vision_endpoint, - ) - - -def setup_blob_manager( - azure_credential: AsyncTokenCredential, - storage_account: str, - storage_container: str, - storage_resource_group: str, - subscription_id: str, - storage_key: Optional[str] = None, - image_storage_container: Optional[str] = None, # Added this parameter -): - storage_creds: AsyncTokenCredential | str = azure_credential if storage_key is None else storage_key - - return BlobManager( - endpoint=f"https://{storage_account}.blob.core.windows.net", - container=storage_container, - account=storage_account, - credential=storage_creds, - resource_group=storage_resource_group, - subscription_id=subscription_id, - image_container=image_storage_container, - ) - - def setup_list_file_strategy( azure_credential: AsyncTokenCredential, local_files: Optional[str], @@ -149,100 +81,6 @@ def setup_list_file_strategy( return list_file_strategy -class OpenAIHost(str, Enum): - OPENAI = "openai" - AZURE = "azure" - AZURE_CUSTOM = "azure_custom" - LOCAL = "local" - - -def setup_embeddings_service( - open_ai_client: AsyncOpenAI, - openai_host: OpenAIHost, - emb_model_name: str, - emb_model_dimensions: int, - azure_openai_deployment: str | None, - azure_openai_endpoint: str | None, - disable_vectors: bool = False, - disable_batch_vectors: bool = False, -): - if disable_vectors: - logger.info("Not setting up embeddings service") - return None - - if openai_host in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]: - if azure_openai_endpoint is None: - raise ValueError("Azure OpenAI endpoint must be provided when using Azure OpenAI embeddings") - if azure_openai_deployment is None: - raise ValueError("Azure OpenAI deployment must be provided when using Azure OpenAI embeddings") - - return OpenAIEmbeddings( - open_ai_client=open_ai_client, - open_ai_model_name=emb_model_name, - open_ai_dimensions=emb_model_dimensions, - 
disable_batch=disable_batch_vectors, - azure_deployment_name=azure_openai_deployment, - azure_endpoint=azure_openai_endpoint, - ) - - -def setup_openai_client( - openai_host: OpenAIHost, - azure_credential: AsyncTokenCredential, - azure_openai_api_key: Optional[str] = None, - azure_openai_service: Optional[str] = None, - azure_openai_custom_url: Optional[str] = None, - openai_api_key: Optional[str] = None, - openai_organization: Optional[str] = None, -) -> tuple[AsyncOpenAI, Optional[str]]: - openai_client: AsyncOpenAI - azure_openai_endpoint: Optional[str] = None - - if openai_host in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]: - base_url: Optional[str] = None - api_key_or_token: Optional[str | Callable[[], Awaitable[str]]] = None - if openai_host == OpenAIHost.AZURE_CUSTOM: - logger.info("OPENAI_HOST is azure_custom, setting up Azure OpenAI custom client") - if not azure_openai_custom_url: - raise ValueError("AZURE_OPENAI_CUSTOM_URL must be set when OPENAI_HOST is azure_custom") - base_url = azure_openai_custom_url - else: - logger.info("OPENAI_HOST is azure, setting up Azure OpenAI client") - if not azure_openai_service: - raise ValueError("AZURE_OPENAI_SERVICE must be set when OPENAI_HOST is azure") - azure_openai_endpoint = f"https://{azure_openai_service}.openai.azure.com" - base_url = f"{azure_openai_endpoint}/openai/v1" - if azure_openai_api_key: - logger.info("AZURE_OPENAI_API_KEY_OVERRIDE found, using as api_key for Azure OpenAI client") - api_key_or_token = azure_openai_api_key - else: - logger.info("Using Azure credential (passwordless authentication) for Azure OpenAI client") - api_key_or_token = get_bearer_token_provider( - azure_credential, "https://cognitiveservices.azure.com/.default" - ) - openai_client = AsyncOpenAI( - base_url=base_url, - api_key=api_key_or_token, # type: ignore[arg-type] - ) - elif openai_host == OpenAIHost.LOCAL: - logger.info("OPENAI_HOST is local, setting up local OpenAI client for OPENAI_BASE_URL with no key") - 
openai_client = AsyncOpenAI( - base_url=os.environ["OPENAI_BASE_URL"], - api_key="no-key-required", - ) - else: - logger.info( - "OPENAI_HOST is not azure, setting up OpenAI client using OPENAI_API_KEY and OPENAI_ORGANIZATION environment variables" - ) - if openai_api_key is None: - raise ValueError("OpenAI key is required when using the non-Azure OpenAI API") - openai_client = AsyncOpenAI( - api_key=openai_api_key, - organization=openai_organization, - ) - return openai_client, azure_openai_endpoint - - def setup_file_processors( azure_credential: AsyncTokenCredential, document_intelligence_service: Optional[str], @@ -256,90 +94,31 @@ def setup_file_processors( openai_deployment: Optional[str] = None, content_understanding_endpoint: Optional[str] = None, ): - sentence_text_splitter = SentenceTextSplitter() - - doc_int_parser: Optional[DocumentAnalysisParser] = None - # check if Azure Document Intelligence credentials are provided - if document_intelligence_service is not None: - documentintelligence_creds: AsyncTokenCredential | AzureKeyCredential = ( - azure_credential if document_intelligence_key is None else AzureKeyCredential(document_intelligence_key) - ) - doc_int_parser = DocumentAnalysisParser( - endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", - credential=documentintelligence_creds, - media_description_strategy=( - MediaDescriptionStrategy.OPENAI - if use_multimodal - else ( - MediaDescriptionStrategy.CONTENTUNDERSTANDING - if use_content_understanding - else MediaDescriptionStrategy.NONE - ) - ), - openai_client=openai_client, - openai_model=openai_model, - openai_deployment=openai_deployment, - content_understanding_endpoint=content_understanding_endpoint, - ) + """Setup file processors and figure processor for document ingestion. + + Uses build_file_processors from servicesetup to ensure consistent parser/splitter + selection logic with the Azure Functions cloud ingestion pipeline. 
+ """ + file_processors = build_file_processors( + azure_credential=azure_credential, + document_intelligence_service=document_intelligence_service, + document_intelligence_key=document_intelligence_key, + use_local_pdf_parser=local_pdf_parser, + use_local_html_parser=local_html_parser, + process_figures=use_multimodal, + ) - pdf_parser: Optional[Parser] = None - if local_pdf_parser or document_intelligence_service is None: - pdf_parser = LocalPdfParser() - elif document_intelligence_service is not None: - pdf_parser = doc_int_parser - else: - logger.warning("No PDF parser available") + figure_processor = setup_figure_processor( + credential=azure_credential, + use_multimodal=use_multimodal, + use_content_understanding=use_content_understanding, + content_understanding_endpoint=content_understanding_endpoint, + openai_client=openai_client, + openai_model=openai_model, + openai_deployment=openai_deployment, + ) - html_parser: Optional[Parser] = None - if local_html_parser or document_intelligence_service is None: - html_parser = LocalHTMLParser() - elif document_intelligence_service is not None: - html_parser = doc_int_parser - else: - logger.warning("No HTML parser available") - - # These file formats can always be parsed: - file_processors = { - ".json": FileProcessor(JsonParser(), SimpleTextSplitter()), - ".md": FileProcessor(TextParser(), sentence_text_splitter), - ".txt": FileProcessor(TextParser(), sentence_text_splitter), - ".csv": FileProcessor(CsvParser(), sentence_text_splitter), - } - # These require either a Python package or Document Intelligence - if pdf_parser is not None: - file_processors.update({".pdf": FileProcessor(pdf_parser, sentence_text_splitter)}) - if html_parser is not None: - file_processors.update({".html": FileProcessor(html_parser, sentence_text_splitter)}) - # These file formats require Document Intelligence - if doc_int_parser is not None: - file_processors.update( - { - ".docx": FileProcessor(doc_int_parser, sentence_text_splitter), 
- ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter), - ".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter), - ".png": FileProcessor(doc_int_parser, sentence_text_splitter), - ".jpg": FileProcessor(doc_int_parser, sentence_text_splitter), - ".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter), - ".tiff": FileProcessor(doc_int_parser, sentence_text_splitter), - ".bmp": FileProcessor(doc_int_parser, sentence_text_splitter), - ".heic": FileProcessor(doc_int_parser, sentence_text_splitter), - } - ) - return file_processors - - -def setup_image_embeddings_service( - azure_credential: AsyncTokenCredential, vision_endpoint: Optional[str], use_multimodal: bool -) -> Optional[ImageEmbeddings]: - image_embeddings_service: Optional[ImageEmbeddings] = None - if use_multimodal: - if vision_endpoint is None: - raise ValueError("An Azure AI Vision endpoint must be provided to use multimodal features.") - image_embeddings_service = ImageEmbeddings( - endpoint=vision_endpoint, - token_provider=get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default"), - ) - return image_embeddings_service + return file_processors, figure_processor async def main(strategy: Strategy, setup_index: bool = True): @@ -406,6 +185,12 @@ async def main(strategy: Strategy, setup_index: bool = True): load_azd_env() + if os.getenv("USE_CLOUD_INGESTION", "").lower() == "true": + logger.warning( + "Cloud ingestion is enabled. Please use setup_cloud_ingestion.py instead of prepdocs.py. Exiting." 
+ ) + exit(0) + if ( os.getenv("AZURE_PUBLIC_NETWORK_ACCESS") == "Disabled" and os.getenv("AZURE_USE_VPN_GATEWAY", "").lower() != "true" @@ -446,20 +231,18 @@ async def main(strategy: Strategy, setup_index: bool = True): if use_agentic_retrieval and OPENAI_HOST not in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]: raise Exception("Agentic retrieval requires an Azure OpenAI chat completion service") - search_info = loop.run_until_complete( - setup_search_info( - search_service=os.environ["AZURE_SEARCH_SERVICE"], - index_name=os.environ["AZURE_SEARCH_INDEX"], - use_agentic_retrieval=use_agentic_retrieval, - agent_name=os.getenv("AZURE_SEARCH_AGENT"), - agent_max_output_tokens=int(os.getenv("AZURE_SEARCH_AGENT_MAX_OUTPUT_TOKENS", 10000)), - azure_openai_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"], - azure_openai_searchagent_deployment=os.getenv("AZURE_OPENAI_SEARCHAGENT_DEPLOYMENT"), - azure_openai_searchagent_model=os.getenv("AZURE_OPENAI_SEARCHAGENT_MODEL"), - azure_credential=azd_credential, - search_key=clean_key_if_exists(args.searchkey), - azure_vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"), - ) + search_info = setup_search_info( + search_service=os.environ["AZURE_SEARCH_SERVICE"], + index_name=os.environ["AZURE_SEARCH_INDEX"], + use_agentic_retrieval=use_agentic_retrieval, + agent_name=os.getenv("AZURE_SEARCH_AGENT"), + agent_max_output_tokens=int(os.getenv("AZURE_SEARCH_AGENT_MAX_OUTPUT_TOKENS", 10000)), + azure_openai_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"], + azure_openai_searchagent_deployment=os.getenv("AZURE_OPENAI_SEARCHAGENT_DEPLOYMENT"), + azure_openai_searchagent_model=os.getenv("AZURE_OPENAI_SEARCHAGENT_MODEL"), + azure_credential=azd_credential, + search_key=clean_key_if_exists(args.searchkey), + azure_vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"), ) # Check search service connectivity @@ -510,16 +293,17 @@ async def main(strategy: Strategy, setup_index: bool = True): openai_api_key=clean_key_if_exists(os.getenv("OPENAI_API_KEY")), 
openai_organization=os.getenv("OPENAI_ORGANIZATION"), ) - openai_embeddings_service = setup_embeddings_service( - open_ai_client=openai_client, - openai_host=OPENAI_HOST, - emb_model_name=os.environ["AZURE_OPENAI_EMB_MODEL_NAME"], - emb_model_dimensions=emb_model_dimensions, - azure_openai_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT"), - azure_openai_endpoint=azure_openai_endpoint, - disable_vectors=dont_use_vectors, - disable_batch_vectors=args.disablebatchvectors, - ) + openai_embeddings_service = None + if not dont_use_vectors: + openai_embeddings_service = setup_embeddings_service( + OPENAI_HOST, + openai_client, + emb_model_name=os.environ["AZURE_OPENAI_EMB_MODEL_NAME"], + emb_model_dimensions=emb_model_dimensions, + azure_openai_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT"), + azure_openai_endpoint=azure_openai_endpoint, + disable_batch=args.disablebatchvectors, + ) ingestion_strategy: Strategy if use_int_vectorization: @@ -541,7 +325,7 @@ async def main(strategy: Strategy, setup_index: bool = True): enforce_access_control=enforce_access_control, ) else: - file_processors = setup_file_processors( + file_processors, figure_processor = setup_file_processors( azure_credential=azd_credential, document_intelligence_service=os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE"), document_intelligence_key=clean_key_if_exists(args.documentintelligencekey), @@ -574,8 +358,7 @@ async def main(strategy: Strategy, setup_index: bool = True): search_field_name_embedding=os.getenv("AZURE_SEARCH_FIELD_NAME_EMBEDDING", "embedding"), use_acls=use_acls, category=args.category, - use_content_understanding=use_content_understanding, - content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"), + figure_processor=figure_processor, enforce_access_control=enforce_access_control, ) diff --git a/app/backend/prepdocslib/blobmanager.py b/app/backend/prepdocslib/blobmanager.py index cb179615ff..f682ec5029 100644 --- a/app/backend/prepdocslib/blobmanager.py +++ 
b/app/backend/prepdocslib/blobmanager.py @@ -424,13 +424,15 @@ async def upload_blob(self, file: File) -> str: await container_client.create_container() # Re-open and upload the original file - if file.url is None: + # URL may be a path to a local file or already set to a blob URL + if file.url is None or os.path.exists(file.url): with open(file.content.name, "rb") as reopened_file: blob_name = self.blob_name_from_file_name(file.content.name) logger.info("Uploading blob for document '%s'", blob_name) blob_client = await container_client.upload_blob(blob_name, reopened_file, overwrite=True) file.url = blob_client.url + assert file.url is not None, "file.url must be set after upload" return unquote(file.url) async def upload_document_image( @@ -449,7 +451,7 @@ async def upload_document_image( raise ValueError( "user_oid is not supported for BlobManager. Use AdlsBlobManager for user-specific operations." ) - container_client = self.blob_service_client.get_container_client(self.container) + container_client = self.blob_service_client.get_container_client(self.image_container) if not await container_client.exists(): await container_client.create_container() image_bytes = self.add_image_citation(image_bytes, document_filename, image_filename, image_page_num) diff --git a/app/backend/prepdocslib/cloudingestionstrategy.py b/app/backend/prepdocslib/cloudingestionstrategy.py new file mode 100644 index 0000000000..4238600ddf --- /dev/null +++ b/app/backend/prepdocslib/cloudingestionstrategy.py @@ -0,0 +1,327 @@ +"""Cloud ingestion strategy using Azure AI Search custom skills.""" + +import logging +from dataclasses import dataclass +from datetime import timedelta + +from azure.search.documents.indexes._generated.models import ( + NativeBlobSoftDeleteDeletionDetectionPolicy, +) +from azure.search.documents.indexes.models import ( + IndexingParameters, + IndexingParametersConfiguration, + IndexProjectionMode, + InputFieldMappingEntry, + OutputFieldMappingEntry, + SearchIndexer, 
+ SearchIndexerDataContainer, + SearchIndexerDataSourceConnection, + SearchIndexerDataSourceType, + SearchIndexerDataUserAssignedIdentity, + SearchIndexerIndexProjection, + SearchIndexerIndexProjectionSelector, + SearchIndexerIndexProjectionsParameters, + SearchIndexerSkillset, + ShaperSkill, + WebApiSkill, +) + +from .blobmanager import BlobManager +from .embeddings import OpenAIEmbeddings +from .listfilestrategy import ListFileStrategy +from .searchmanager import SearchManager +from .strategy import DocumentAction, SearchInfo, Strategy + +logger = logging.getLogger("scripts") + +DEFAULT_SKILL_TIMEOUT = timedelta(seconds=230) +DEFAULT_BATCH_SIZE = 1 + + +@dataclass(slots=True) +class SkillConfig: + """Configuration for a custom Web API skill.""" + + name: str + description: str + uri: str + auth_resource_id: str + + +class CloudIngestionStrategy(Strategy): # pragma: no cover + """Ingestion strategy that wires Azure Function custom skills into an indexer.""" + + def __init__( + self, + *, + list_file_strategy: ListFileStrategy, + blob_manager: BlobManager, + search_info: SearchInfo, + embeddings: OpenAIEmbeddings, + search_field_name_embedding: str, + document_extractor_uri: str, + document_extractor_auth_resource_id: str, + figure_processor_uri: str, + figure_processor_auth_resource_id: str, + text_processor_uri: str, + text_processor_auth_resource_id: str, + subscription_id: str, + document_action: DocumentAction = DocumentAction.Add, + search_analyzer_name: str | None = None, + use_acls: bool = False, + use_multimodal: bool = False, + enforce_access_control: bool = False, + search_user_assigned_identity_resource_id: str, + ) -> None: + self.list_file_strategy = list_file_strategy + self.blob_manager = blob_manager + self.document_action = document_action + self.embeddings = embeddings + self.search_field_name_embedding = search_field_name_embedding + self.search_info = search_info + self.search_analyzer_name = search_analyzer_name + self.use_acls = use_acls + 
self.use_multimodal = use_multimodal + self.enforce_access_control = enforce_access_control + self.subscription_id = subscription_id + + prefix = f"{self.search_info.index_name}-cloud" + self.skillset_name = f"{prefix}-skillset" + self.indexer_name = f"{prefix}-indexer" + self.data_source_name = f"{prefix}-blob" + + self.document_extractor = SkillConfig( + name=f"{prefix}-document-extractor-skill", + description="Custom skill that downloads and parses source documents", + uri=document_extractor_uri, + auth_resource_id=document_extractor_auth_resource_id, + ) + self.figure_processor = SkillConfig( + name=f"{prefix}-figure-processor-skill", + description="Custom skill that enriches individual figures", + uri=figure_processor_uri, + auth_resource_id=figure_processor_auth_resource_id, + ) + self.text_processor = SkillConfig( + name=f"{prefix}-text-processor-skill", + description="Custom skill that merges figures, chunks text, and generates embeddings", + uri=text_processor_uri, + auth_resource_id=text_processor_auth_resource_id, + ) + + self._search_manager: SearchManager | None = None + self.search_user_assigned_identity_resource_id = search_user_assigned_identity_resource_id + + def _build_skillset(self) -> SearchIndexerSkillset: + prefix = f"{self.search_info.index_name}-cloud" + + # NOTE: Do NOT map the chunk id directly to the index key field. Azure AI Search + # index projections forbid mapping an input field onto the target index key when + # using parent/child projections. The service will generate keys for projected + # child documents automatically. Removing the explicit 'id' mapping resolves + # HttpResponseError: "Input 'id' cannot map to the key field". 
+ mappings = [ + InputFieldMappingEntry(name="content", source="/document/chunks/*/content"), + InputFieldMappingEntry(name="sourcepage", source="/document/chunks/*/sourcepage"), + InputFieldMappingEntry(name="sourcefile", source="/document/chunks/*/sourcefile"), + InputFieldMappingEntry(name=self.search_field_name_embedding, source="/document/chunks/*/embedding"), + InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"), + ] + if self.use_multimodal: + mappings.append(InputFieldMappingEntry(name="images", source="/document/chunks/*/images")) + + index_projection = SearchIndexerIndexProjection( + selectors=[ + SearchIndexerIndexProjectionSelector( + target_index_name=self.search_info.index_name, + parent_key_field_name="parent_id", + source_context="/document/chunks/*", + mappings=mappings, + ) + ], + parameters=SearchIndexerIndexProjectionsParameters( + projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS, + ), + ) + + document_extractor_skill = WebApiSkill( + name=self.document_extractor.name, + description=self.document_extractor.description, + context="/document", + uri=self.document_extractor.uri, + http_method="POST", + timeout=DEFAULT_SKILL_TIMEOUT, + batch_size=DEFAULT_BATCH_SIZE, + degree_of_parallelism=1, + # Managed identity: Search service authenticates against the function app using this resource ID. + auth_resource_id=self.document_extractor.auth_resource_id, + auth_identity=SearchIndexerDataUserAssignedIdentity( + resource_id=self.search_user_assigned_identity_resource_id + ), + inputs=[ + # Provide the binary payload expected by the document extractor custom skill. 
+ InputFieldMappingEntry(name="file_data", source="/document/file_data"), + InputFieldMappingEntry(name="file_name", source="/document/metadata_storage_name"), + InputFieldMappingEntry(name="content_type", source="/document/metadata_storage_content_type"), + ], + outputs=[ + OutputFieldMappingEntry(name="pages", target_name="pages"), + OutputFieldMappingEntry(name="figures", target_name="figures"), + ], + ) + + figure_processor_skill = WebApiSkill( + name=self.figure_processor.name, + description=self.figure_processor.description, + context="/document/figures/*", + uri=self.figure_processor.uri, + http_method="POST", + timeout=DEFAULT_SKILL_TIMEOUT, + batch_size=DEFAULT_BATCH_SIZE, + degree_of_parallelism=1, + # Managed identity: Search service authenticates against the function app using this resource ID. + auth_resource_id=self.figure_processor.auth_resource_id, + auth_identity=SearchIndexerDataUserAssignedIdentity( + resource_id=self.search_user_assigned_identity_resource_id + ), + inputs=[ + InputFieldMappingEntry(name="figure_id", source="/document/figures/*/figure_id"), + InputFieldMappingEntry(name="document_file_name", source="/document/figures/*/document_file_name"), + InputFieldMappingEntry(name="filename", source="/document/figures/*/filename"), + InputFieldMappingEntry(name="mime_type", source="/document/figures/*/mime_type"), + InputFieldMappingEntry(name="bytes_base64", source="/document/figures/*/bytes_base64"), + InputFieldMappingEntry(name="page_num", source="/document/figures/*/page_num"), + InputFieldMappingEntry(name="bbox", source="/document/figures/*/bbox"), + InputFieldMappingEntry(name="placeholder", source="/document/figures/*/placeholder"), + InputFieldMappingEntry(name="title", source="/document/figures/*/title"), + ], + outputs=[ + # Only output the enriched fields to avoid cyclic dependency + OutputFieldMappingEntry(name="description", target_name="description"), + OutputFieldMappingEntry(name="url", target_name="url"), + 
OutputFieldMappingEntry(name="embedding", target_name="embedding"), + ], + ) + + # Shaper skill to consolidate pages and enriched figures into a single object + shaper_skill = ShaperSkill( + name=f"{prefix}-document-shaper-skill", + description="Consolidates pages and enriched figures into a single document object", + context="/document", + inputs=[ + InputFieldMappingEntry(name="pages", source="/document/pages"), + InputFieldMappingEntry( + name="figures", + source_context="/document/figures/*", + inputs=[ + InputFieldMappingEntry(name="figure_id", source="/document/figures/*/figure_id"), + InputFieldMappingEntry( + name="document_file_name", source="/document/figures/*/document_file_name" + ), + InputFieldMappingEntry(name="filename", source="/document/figures/*/filename"), + InputFieldMappingEntry(name="mime_type", source="/document/figures/*/mime_type"), + InputFieldMappingEntry(name="page_num", source="/document/figures/*/page_num"), + InputFieldMappingEntry(name="bbox", source="/document/figures/*/bbox"), + InputFieldMappingEntry(name="placeholder", source="/document/figures/*/placeholder"), + InputFieldMappingEntry(name="title", source="/document/figures/*/title"), + InputFieldMappingEntry(name="description", source="/document/figures/*/description"), + InputFieldMappingEntry(name="url", source="/document/figures/*/url"), + InputFieldMappingEntry(name="embedding", source="/document/figures/*/embedding"), + ], + ), + InputFieldMappingEntry(name="file_name", source="/document/metadata_storage_name"), + InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"), + ], + outputs=[OutputFieldMappingEntry(name="output", target_name="consolidated_document")], + ) + + text_processor_skill = WebApiSkill( + name=self.text_processor.name, + description=self.text_processor.description, + context="/document", + uri=self.text_processor.uri, + http_method="POST", + timeout=DEFAULT_SKILL_TIMEOUT, + batch_size=DEFAULT_BATCH_SIZE, + 
degree_of_parallelism=1, + # Managed identity: Search service authenticates against the function app using this resource ID. + auth_resource_id=self.text_processor.auth_resource_id, + auth_identity=SearchIndexerDataUserAssignedIdentity( + resource_id=self.search_user_assigned_identity_resource_id + ), + inputs=[ + InputFieldMappingEntry(name="consolidated_document", source="/document/consolidated_document"), + ], + outputs=[OutputFieldMappingEntry(name="chunks", target_name="chunks")], + ) + + return SearchIndexerSkillset( + name=self.skillset_name, + description="Skillset linking document extraction, figure enrichment, and text processing functions", + skills=[document_extractor_skill, figure_processor_skill, shaper_skill, text_processor_skill], + index_projection=index_projection, + ) + + async def setup(self) -> None: + logger.info("Setting up search index and skillset for cloud ingestion") + + if not self.embeddings.azure_endpoint or not self.embeddings.azure_deployment_name: + raise ValueError("Cloud ingestion requires Azure OpenAI endpoint and deployment") + + if not isinstance(self.embeddings, OpenAIEmbeddings): + raise TypeError("Cloud ingestion requires Azure OpenAI embeddings to configure the search index.") + + self._search_manager = SearchManager( + search_info=self.search_info, + search_analyzer_name=self.search_analyzer_name, + use_acls=self.use_acls, + use_parent_index_projection=True, + embeddings=self.embeddings, + field_name_embedding=self.search_field_name_embedding, + search_images=self.use_multimodal, + enforce_access_control=self.enforce_access_control, + ) + + await self._search_manager.create_index() + + async with self.search_info.create_search_indexer_client() as indexer_client: + data_source_connection = SearchIndexerDataSourceConnection( + name=self.data_source_name, + type=SearchIndexerDataSourceType.AZURE_BLOB, + connection_string=self.blob_manager.get_managedidentity_connectionstring(), + 
container=SearchIndexerDataContainer(name=self.blob_manager.container), + data_deletion_detection_policy=NativeBlobSoftDeleteDeletionDetectionPolicy(), + ) + await indexer_client.create_or_update_data_source_connection(data_source_connection) + + skillset = self._build_skillset() + await indexer_client.create_or_update_skillset(skillset) + + indexer = SearchIndexer( + name=self.indexer_name, + description="Indexer orchestrating cloud ingestion pipeline", + data_source_name=self.data_source_name, + target_index_name=self.search_info.index_name, + skillset_name=self.skillset_name, + parameters=IndexingParameters( + configuration=IndexingParametersConfiguration( + query_timeout=None, # type: ignore + data_to_extract="storageMetadata", + allow_skillset_to_read_file_data=True, + ) + ), + ) + await indexer_client.create_or_update_indexer(indexer) + + async def run(self) -> None: + files = self.list_file_strategy.list() + async for file in files: + try: + await self.blob_manager.upload_blob(file) + finally: + if file: + file.close() + + async with self.search_info.create_search_indexer_client() as indexer_client: + await indexer_client.run_indexer(self.indexer_name) + logger.info("Triggered indexer '%s' for cloud ingestion", self.indexer_name) diff --git a/app/backend/prepdocslib/figureprocessor.py b/app/backend/prepdocslib/figureprocessor.py new file mode 100644 index 0000000000..b1e77ca6d4 --- /dev/null +++ b/app/backend/prepdocslib/figureprocessor.py @@ -0,0 +1,146 @@ +"""Utilities for describing and enriching figures extracted from documents.""" + +import logging +from enum import Enum +from typing import Any, Optional + +from azure.core.credentials import AzureKeyCredential +from azure.core.credentials_async import AsyncTokenCredential + +from .blobmanager import BaseBlobManager +from .embeddings import ImageEmbeddings +from .mediadescriber import ( + ContentUnderstandingDescriber, + MediaDescriber, + MultimodalModelDescriber, +) +from .page import ImageOnPage + 
+logger = logging.getLogger("scripts") + + +class MediaDescriptionStrategy(Enum): + """Supported mechanisms for describing images extracted from documents.""" + + NONE = "none" + OPENAI = "openai" + CONTENTUNDERSTANDING = "content_understanding" + + +class FigureProcessor: + """Helper that lazily creates a media describer and captions figures on demand.""" + + def __init__( + self, + *, + credential: AsyncTokenCredential | AzureKeyCredential | None = None, + strategy: MediaDescriptionStrategy = MediaDescriptionStrategy.NONE, + openai_client: Any | None = None, + openai_model: str | None = None, + openai_deployment: str | None = None, + content_understanding_endpoint: str | None = None, + ) -> None: + self.credential = credential + self.strategy = strategy + self.openai_client = openai_client + self.openai_model = openai_model + self.openai_deployment = openai_deployment + self.content_understanding_endpoint = content_understanding_endpoint + self.media_describer: MediaDescriber | None = None + self.content_understanding_ready = False + + async def get_media_describer(self) -> MediaDescriber | None: + """Return (and lazily create) the media describer for this processor.""" + + if self.strategy == MediaDescriptionStrategy.NONE: + return None + + if self.media_describer is not None: + return self.media_describer + + if self.strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING: + if self.content_understanding_endpoint is None: + raise ValueError("Content Understanding strategy requires an endpoint") + if self.credential is None: + raise ValueError("Content Understanding strategy requires a credential") + if isinstance(self.credential, AzureKeyCredential): + raise ValueError( + "Content Understanding does not support key credentials; provide a token credential instead" + ) + self.media_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential) + return self.media_describer + + if self.strategy == MediaDescriptionStrategy.OPENAI: 
+ if self.openai_client is None or self.openai_model is None: + raise ValueError("OpenAI strategy requires both a client and a model name") + self.media_describer = MultimodalModelDescriber( + self.openai_client, model=self.openai_model, deployment=self.openai_deployment + ) + return self.media_describer + + logger.warning("Unknown media description strategy '%s'; skipping description", self.strategy) + return None + + def mark_content_understanding_ready(self) -> None: + """Record that the Content Understanding analyzer exists to avoid recreating it.""" + + self.content_understanding_ready = True + + async def describe(self, image_bytes: bytes) -> str | None: + """Generate a description for the provided image bytes if a describer is available.""" + + describer = await self.get_media_describer() + if describer is None: + return None + if isinstance(describer, ContentUnderstandingDescriber) and not self.content_understanding_ready: + await describer.create_analyzer() + self.content_understanding_ready = True + return await describer.describe_image(image_bytes) + + +def build_figure_markup(image: "ImageOnPage", description: Optional[str] = None) -> str: + """Create consistent HTML markup for a figure description on demand.""" + + caption_parts = [image.figure_id] + if image.title: + caption_parts.append(image.title) + caption = " ".join(part for part in caption_parts if part) + if description: + return f"
{caption}
{description}
" + return f"
{caption}
" + + +async def process_page_image( + *, + image: "ImageOnPage", + document_filename: str, + blob_manager: Optional[BaseBlobManager], + image_embeddings_client: Optional[ImageEmbeddings], + figure_processor: Optional[FigureProcessor] = None, + user_oid: Optional[str] = None, +) -> "ImageOnPage": + """Generate description, upload image, and optionally compute embedding for a figure.""" + + if blob_manager is None: + raise ValueError("BlobManager must be provided to process images.") + + # Generate plain (model) description text only; do not wrap in HTML markup here. + description_text: str | None = None + if figure_processor is not None: + description_text = await figure_processor.describe(image.bytes) + + # Store plain descriptive text (can be None). HTML rendering is deferred to build_figure_markup. + image.description = description_text + + if image.url is None: + image.url = await blob_manager.upload_document_image( + document_filename, image.bytes, image.filename, image.page_num, user_oid=user_oid + ) + + if image_embeddings_client is not None: + try: + image.embedding = await image_embeddings_client.create_embedding_for_image(image.bytes) + except Exception: # pragma: no cover - embedding failures shouldn't abort figure processing + logger.warning("Image embedding generation failed for figure %s", image.figure_id, exc_info=True) + + return image diff --git a/app/backend/prepdocslib/filestrategy.py b/app/backend/prepdocslib/filestrategy.py index 16b7f1aff9..66022d9178 100644 --- a/app/backend/prepdocslib/filestrategy.py +++ b/app/backend/prepdocslib/filestrategy.py @@ -1,15 +1,19 @@ import logging from typing import Optional -from azure.core.credentials import AzureKeyCredential - from .blobmanager import AdlsBlobManager, BaseBlobManager, BlobManager from .embeddings import ImageEmbeddings, OpenAIEmbeddings +from .figureprocessor import ( + FigureProcessor, + MediaDescriptionStrategy, + process_page_image, +) from .fileprocessor import FileProcessor from 
.listfilestrategy import File, ListFileStrategy from .mediadescriber import ContentUnderstandingDescriber from .searchmanager import SearchManager, Section from .strategy import DocumentAction, SearchInfo, Strategy +from .textprocessor import process_text logger = logging.getLogger("scripts") @@ -20,8 +24,10 @@ async def parse_file( category: Optional[str] = None, blob_manager: Optional[BaseBlobManager] = None, image_embeddings_client: Optional[ImageEmbeddings] = None, + figure_processor: Optional[FigureProcessor] = None, user_oid: Optional[str] = None, ) -> list[Section]: + key = file.file_extension().lower() processor = file_processors.get(key) if processor is None: @@ -31,21 +37,16 @@ async def parse_file( pages = [page async for page in processor.parser.parse(content=file.content)] for page in pages: for image in page.images: - if not blob_manager or not image_embeddings_client: - raise ValueError("BlobManager and ImageEmbeddingsClient must be provided to parse images in the file.") - if image.url is None: - image.url = await blob_manager.upload_document_image( - file.filename(), image.bytes, image.filename, image.page_num, user_oid=user_oid - ) - if image_embeddings_client: - image.embedding = await image_embeddings_client.create_embedding_for_image(image.bytes) - logger.info("Splitting '%s' into sections", file.filename()) - sections = [Section(chunk, content=file, category=category) for chunk in processor.splitter.split_pages(pages)] - # For now, add the images back to each split chunk based off chunk.page_num - for section in sections: - section.chunk.images = [ - image for page in pages if page.page_num == section.chunk.page_num for image in page.images - ] + logger.info("Processing image '%s' on page %d", image.filename, page.page_num) + await process_page_image( + image=image, + document_filename=file.filename(), + blob_manager=blob_manager, + image_embeddings_client=image_embeddings_client, + figure_processor=figure_processor, + user_oid=user_oid, + ) + 
sections = process_text(pages, file, processor.splitter, category) return sections @@ -67,8 +68,7 @@ def __init__( search_field_name_embedding: Optional[str] = None, use_acls: bool = False, category: Optional[str] = None, - use_content_understanding: bool = False, - content_understanding_endpoint: Optional[str] = None, + figure_processor: Optional[FigureProcessor] = None, enforce_access_control: bool = False, ): self.list_file_strategy = list_file_strategy @@ -82,8 +82,7 @@ def __init__( self.search_info = search_info self.use_acls = use_acls self.category = category - self.use_content_understanding = use_content_understanding - self.content_understanding_endpoint = content_understanding_endpoint + self.figure_processor = figure_processor self.enforce_access_control = enforce_access_control def setup_search_manager(self): @@ -91,7 +90,7 @@ def setup_search_manager(self): self.search_info, self.search_analyzer_name, self.use_acls, - False, + False, # use_parent_index_projection disabled for file-based ingestion self.embeddings, field_name_embedding=self.search_field_name_embedding, search_images=self.image_embeddings is not None, @@ -102,15 +101,14 @@ async def setup(self): self.setup_search_manager() await self.search_manager.create_index() - if self.use_content_understanding: - if self.content_understanding_endpoint is None: - raise ValueError("Content Understanding is enabled but no endpoint was provided") - if isinstance(self.search_info.credential, AzureKeyCredential): - raise ValueError( - "AzureKeyCredential is not supported for Content Understanding, use keyless auth instead" - ) - cu_manager = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.search_info.credential) - await cu_manager.create_analyzer() + if ( + self.figure_processor is not None + and self.figure_processor.strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING + ): + media_describer = await self.figure_processor.get_media_describer() + if isinstance(media_describer, 
ContentUnderstandingDescriber): + await media_describer.create_analyzer() + self.figure_processor.mark_content_understanding_ready() async def run(self): self.setup_search_manager() @@ -118,12 +116,17 @@ async def run(self): files = self.list_file_strategy.list() async for file in files: try: - await self.blob_manager.upload_blob(file) + blob_url = await self.blob_manager.upload_blob(file) sections = await parse_file( - file, self.file_processors, self.category, self.blob_manager, self.image_embeddings + file, + self.file_processors, + self.category, + self.blob_manager, + self.image_embeddings, + figure_processor=self.figure_processor, ) if sections: - await self.search_manager.update_content(sections, url=file.url) + await self.search_manager.update_content(sections, url=blob_url) finally: if file: file.close() @@ -151,27 +154,35 @@ def __init__( embeddings: Optional[OpenAIEmbeddings] = None, image_embeddings: Optional[ImageEmbeddings] = None, enforce_access_control: bool = False, + figure_processor: Optional[FigureProcessor] = None, ): self.file_processors = file_processors self.embeddings = embeddings self.image_embeddings = image_embeddings self.search_info = search_info self.blob_manager = blob_manager + self.figure_processor = figure_processor self.search_manager = SearchManager( search_info=self.search_info, search_analyzer_name=None, use_acls=True, - use_int_vectorization=False, + use_parent_index_projection=False, embeddings=self.embeddings, field_name_embedding=search_field_name_embedding, - search_images=False, + search_images=image_embeddings is not None, enforce_access_control=enforce_access_control, ) self.search_field_name_embedding = search_field_name_embedding async def add_file(self, file: File, user_oid: str): sections = await parse_file( - file, self.file_processors, None, self.blob_manager, self.image_embeddings, user_oid=user_oid + file, + self.file_processors, + None, + self.blob_manager, + self.image_embeddings, + 
figure_processor=self.figure_processor, + user_oid=user_oid, ) if sections: await self.search_manager.update_content(sections, url=file.url) diff --git a/app/backend/prepdocslib/integratedvectorizerstrategy.py b/app/backend/prepdocslib/integratedvectorizerstrategy.py index 589b8d9888..b5eaa1e4b3 100644 --- a/app/backend/prepdocslib/integratedvectorizerstrategy.py +++ b/app/backend/prepdocslib/integratedvectorizerstrategy.py @@ -137,7 +137,7 @@ async def setup(self): search_info=self.search_info, search_analyzer_name=self.search_analyzer_name, use_acls=self.use_acls, - use_int_vectorization=True, + use_parent_index_projection=True, embeddings=self.embeddings, field_name_embedding=self.search_field_name_embedding, search_images=False, diff --git a/app/backend/prepdocslib/mediadescriber.py b/app/backend/prepdocslib/mediadescriber.py index 1d7a7fd9e8..154569b391 100644 --- a/app/backend/prepdocslib/mediadescriber.py +++ b/app/backend/prepdocslib/mediadescriber.py @@ -139,6 +139,7 @@ def before_retry_sleep(retry_state): response = await self.openai_client.chat.completions.create( model=self.model if self.deployment is None else self.deployment, max_tokens=500, + seed=42, # Keep responses more consistent across runs messages=[ { "role": "system", diff --git a/app/backend/prepdocslib/page.py b/app/backend/prepdocslib/page.py index b87a81e88f..41cfb0cc2c 100644 --- a/app/backend/prepdocslib/page.py +++ b/app/backend/prepdocslib/page.py @@ -1,5 +1,6 @@ -from dataclasses import dataclass, field -from typing import Optional +import base64 +from dataclasses import asdict, dataclass, field +from typing import Any, Optional @dataclass @@ -7,11 +8,82 @@ class ImageOnPage: bytes: bytes bbox: tuple[float, float, float, float] # Pixels filename: str - description: str figure_id: str page_num: int # 0-indexed + placeholder: str # HTML placeholder in page text, e.g. '
' + mime_type: str = "image/png" # Set by parser; default assumes PNG rendering url: Optional[str] = None + title: str = "" embedding: Optional[list[float]] = None + description: Optional[str] = None + + def to_skill_payload( + self, + file_name: str, + *, + include_bytes_base64: bool = True, + ) -> dict[str, Any]: + data = asdict(self) + + # Remove raw bytes to keep payload lean (and JSON-friendly without extra handling). + data.pop("bytes", None) + + # Optionally include base64-encoded bytes for skills that need it + if include_bytes_base64: + b = self.bytes if isinstance(self.bytes, (bytes, bytearray)) else b"" + data["bytes_base64"] = base64.b64encode(b).decode("utf-8") + + data["document_file_name"] = file_name + return data + + @classmethod + def from_skill_payload(cls, data: dict[str, Any]) -> tuple["ImageOnPage", str]: + # Decode base64 image data (optional - may be omitted if already persisted to blob) + bytes_base64 = data.get("bytes_base64") + if bytes_base64: + try: + raw_bytes = base64.b64decode(bytes_base64) + except Exception as exc: # pragma: no cover - defensive + raise ValueError("Invalid bytes_base64 image data") from exc + else: + raw_bytes = b"" # Empty bytes if not provided (already uploaded to blob) + + # page_num may arrive as str; coerce + try: + page_num = int(data.get("page_num") or 0) + except Exception: + page_num = 0 + + # bbox may arrive as list; coerce into tuple + bbox_val = data.get("bbox") + if isinstance(bbox_val, list) and len(bbox_val) == 4: + bbox = tuple(bbox_val) # type: ignore[assignment] + else: + bbox = (0, 0, 0, 0) + + filename = data.get("filename") + figure_id = data.get("figure_id") + placeholder = data.get("placeholder") + assert filename is not None, "filename is required" + assert figure_id is not None, "figure_id is required" + + # Generate placeholder if not provided + if placeholder is None: + placeholder = f'
' + + image = cls( + bytes=raw_bytes, + bbox=bbox, + page_num=page_num, + filename=filename, + figure_id=figure_id, + placeholder=placeholder, + mime_type=data.get("mime_type") or "image/png", + title=data.get("title") or "", + description=data.get("description"), + url=data.get("url"), + ) + return image, data.get("document_file_name", "") @dataclass @@ -29,6 +101,7 @@ class Page: offset: int text: str images: list[ImageOnPage] = field(default_factory=list) + tables: list[str] = field(default_factory=list) @dataclass diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py index 6589c854a3..666ca33b2c 100644 --- a/app/backend/prepdocslib/pdfparser.py +++ b/app/backend/prepdocslib/pdfparser.py @@ -17,15 +17,9 @@ from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential from azure.core.exceptions import HttpResponseError -from openai import AsyncOpenAI from PIL import Image from pypdf import PdfReader -from .mediadescriber import ( - ContentUnderstandingDescriber, - MediaDescriber, - MultimodalModelDescriber, -) from .page import ImageOnPage, Page from .parser import Parser @@ -50,12 +44,6 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: offset += len(page_text) -class MediaDescriptionStrategy(Enum): - NONE = "none" - OPENAI = "openai" - CONTENTUNDERSTANDING = "content_understanding" - - class DocumentAnalysisParser(Parser): """ Concrete parser backed by Azure AI Document Intelligence that can parse many document formats into pages @@ -66,29 +54,13 @@ def __init__( self, endpoint: str, credential: AsyncTokenCredential | AzureKeyCredential, - model_id="prebuilt-layout", - media_description_strategy: Enum = MediaDescriptionStrategy.NONE, - # If using OpenAI, this is the client to use - openai_client: Optional[AsyncOpenAI] = None, - openai_model: Optional[str] = None, - openai_deployment: Optional[str] = None, - # If using Content Understanding, this is the 
endpoint for the service - content_understanding_endpoint: Optional[str] = None, - # should this take the blob storage info too? - ): + model_id: str = "prebuilt-layout", + process_figures: bool = False, + ) -> None: self.model_id = model_id self.endpoint = endpoint self.credential = credential - self.media_description_strategy = media_description_strategy - if media_description_strategy == MediaDescriptionStrategy.OPENAI: - logger.info("Including media description with OpenAI") - self.use_content_understanding = False - self.openai_client = openai_client - self.openai_model = openai_model - self.openai_deployment = openai_deployment - if media_description_strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING: - logger.info("Including media description with Azure Content Understanding") - self.content_understanding_endpoint = content_understanding_endpoint + self.process_figures = process_figures async def parse(self, content: IO) -> AsyncGenerator[Page, None]: logger.info("Extracting text from '%s' using Azure Document Intelligence", content.name) @@ -97,27 +69,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: endpoint=self.endpoint, credential=self.credential ) as document_intelligence_client: file_analyzed = False - - media_describer: Optional[ContentUnderstandingDescriber | MultimodalModelDescriber] = None - if self.media_description_strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING: - if self.content_understanding_endpoint is None: - raise ValueError( - "Content Understanding endpoint must be provided when using Content Understanding strategy" - ) - if isinstance(self.credential, AzureKeyCredential): - raise ValueError( - "AzureKeyCredential is not supported for Content Understanding, use keyless auth instead" - ) - media_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential) - - if self.media_description_strategy == MediaDescriptionStrategy.OPENAI: - if self.openai_client is None or 
self.openai_model is None: - raise ValueError("OpenAI client must be provided when using OpenAI media description strategy") - media_describer = MultimodalModelDescriber( - self.openai_client, self.openai_model, self.openai_deployment - ) - - if media_describer is not None: + if self.process_figures: content_bytes = content.read() try: poller = await document_intelligence_client.begin_analyze_document( @@ -156,13 +108,14 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: if table.bounding_regions and table.bounding_regions[0].page_number == page.page_number ] figures_on_page = [] - if self.media_description_strategy != MediaDescriptionStrategy.NONE: + if self.process_figures: figures_on_page = [ figure for figure in (analyze_result.figures or []) if figure.bounding_regions and figure.bounding_regions[0].page_number == page.page_number ] page_images: list[ImageOnPage] = [] + page_tables: list[str] = [] class ObjectType(Enum): NONE = -1 @@ -202,46 +155,52 @@ class ObjectType(Enum): if object_idx is None: raise ValueError("Expected object_idx to be set") if mask_char not in added_objects: - page_text += DocumentAnalysisParser.table_to_html(tables_on_page[object_idx]) + table_html = DocumentAnalysisParser.table_to_html(tables_on_page[object_idx]) + page_tables.append(table_html) + page_text += table_html added_objects.add(mask_char) elif object_type == ObjectType.FIGURE: - if media_describer is None: - raise ValueError("media_describer should not be None, unable to describe figure") if object_idx is None: raise ValueError("Expected object_idx to be set") if mask_char not in added_objects: - image_on_page = await DocumentAnalysisParser.process_figure( - doc_for_pymupdf, figures_on_page[object_idx], media_describer + image_on_page = await DocumentAnalysisParser.figure_to_image( + doc_for_pymupdf, figures_on_page[object_idx] ) page_images.append(image_on_page) - page_text += image_on_page.description + page_text += image_on_page.placeholder 
added_objects.add(mask_char) + # We remove these comments since they are not needed and skew the page numbers page_text = page_text.replace("", "") # We remove excess newlines at the beginning and end of the page page_text = page_text.strip() - yield Page(page_num=page.page_number - 1, offset=offset, text=page_text, images=page_images) + yield Page( + page_num=page.page_number - 1, + offset=offset, + text=page_text, + images=page_images, + tables=page_tables, + ) offset += len(page_text) @staticmethod - async def process_figure( - doc: pymupdf.Document, figure: DocumentFigure, media_describer: MediaDescriber - ) -> ImageOnPage: + async def figure_to_image(doc: pymupdf.Document, figure: DocumentFigure) -> ImageOnPage: figure_title = (figure.caption and figure.caption.content) or "" # Generate a random UUID if figure.id is None figure_id = figure.id or f"fig_{uuid.uuid4().hex[:8]}" figure_filename = f"figure{figure_id.replace('.', '_')}.png" - logger.info( - "Describing figure %s with title '%s' using %s", figure_id, figure_title, type(media_describer).__name__ - ) + logger.info("Cropping figure %s with title '%s'", figure_id, figure_title) + placeholder = f'
' if not figure.bounding_regions: return ImageOnPage( bytes=b"", - page_num=0, # O-indexed + page_num=0, # 0-indexed figure_id=figure_id, bbox=(0, 0, 0, 0), filename=figure_filename, - description=f"
{figure_id} {figure_title}
", + title=figure_title, + placeholder=placeholder, + mime_type="image/png", ) if len(figure.bounding_regions) > 1: logger.warning("Figure %s has more than one bounding region, using the first one", figure_id) @@ -255,14 +214,15 @@ async def process_figure( ) page_number = first_region["pageNumber"] # 1-indexed cropped_img, bbox_pixels = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box) - figure_description = await media_describer.describe_image(cropped_img) return ImageOnPage( bytes=cropped_img, page_num=page_number - 1, # Convert to 0-indexed figure_id=figure_id, bbox=bbox_pixels, filename=figure_filename, - description=f"
{figure_id} {figure_title}
{figure_description}
", + title=figure_title, + placeholder=placeholder, + mime_type="image/png", ) @staticmethod diff --git a/app/backend/prepdocslib/searchmanager.py b/app/backend/prepdocslib/searchmanager.py index 59455fa87b..f1de6bc0b8 100644 --- a/app/backend/prepdocslib/searchmanager.py +++ b/app/backend/prepdocslib/searchmanager.py @@ -69,7 +69,7 @@ def __init__( search_info: SearchInfo, search_analyzer_name: Optional[str] = None, use_acls: bool = False, - use_int_vectorization: bool = False, + use_parent_index_projection: bool = False, embeddings: Optional[OpenAIEmbeddings] = None, field_name_embedding: Optional[str] = None, search_images: bool = False, @@ -78,7 +78,7 @@ def __init__( self.search_info = search_info self.search_analyzer_name = search_analyzer_name self.use_acls = use_acls - self.use_int_vectorization = use_int_vectorization + self.use_parent_index_projection = use_parent_index_projection self.embeddings = embeddings self.embedding_dimensions = self.embeddings.open_ai_dimensions if self.embeddings else None self.field_name_embedding = field_name_embedding @@ -235,7 +235,7 @@ async def create_index(self): fields = [ ( SimpleField(name="id", type="Edm.String", key=True) - if not self.use_int_vectorization + if not self.use_parent_index_projection else SearchField( name="id", type="Edm.String", @@ -280,8 +280,8 @@ async def create_index(self): else SearchIndexPermissionFilterOption.DISABLED ) - if self.use_int_vectorization: - logger.info("Including parent_id field for integrated vectorization support in new index") + if self.use_parent_index_projection: + logger.info("Including parent_id field for parent/child index projection support in new index") fields.append(SearchableField(name="parent_id", type="Edm.String", filterable=True)) vectorizers: list[VectorSearchVectorizer] = [] diff --git a/app/backend/prepdocslib/servicesetup.py b/app/backend/prepdocslib/servicesetup.py new file mode 100644 index 0000000000..7b3bd45c65 --- /dev/null +++ 
b/app/backend/prepdocslib/servicesetup.py @@ -0,0 +1,336 @@ +"""Shared service setup helpers.""" + +import logging +import os +from collections.abc import Awaitable, Callable +from enum import Enum +from typing import Optional + +from azure.core.credentials import AzureKeyCredential +from azure.core.credentials_async import AsyncTokenCredential +from azure.identity.aio import get_bearer_token_provider +from openai import AsyncOpenAI + +from .blobmanager import BlobManager +from .csvparser import CsvParser +from .embeddings import ImageEmbeddings, OpenAIEmbeddings +from .figureprocessor import FigureProcessor, MediaDescriptionStrategy +from .fileprocessor import FileProcessor +from .htmlparser import LocalHTMLParser +from .jsonparser import JsonParser +from .parser import Parser +from .pdfparser import DocumentAnalysisParser, LocalPdfParser +from .strategy import SearchInfo +from .textparser import TextParser +from .textsplitter import SentenceTextSplitter, SimpleTextSplitter + +logger = logging.getLogger("scripts") + + +def clean_key_if_exists(key: Optional[str]) -> Optional[str]: + """Remove leading and trailing whitespace from a key if it exists. If the key is empty, return None.""" + if key is not None and key.strip() != "": + return key.strip() + return None + + +class OpenAIHost(str, Enum): + """Supported OpenAI hosting styles. + + OPENAI: Public OpenAI API. + AZURE: Standard Azure OpenAI (service name becomes endpoint). + AZURE_CUSTOM: A fully custom endpoint URL (for Network Isolation / APIM). + LOCAL: A locally hosted OpenAI-compatible endpoint (no key required). 
+ """ + + OPENAI = "openai" + AZURE = "azure" + AZURE_CUSTOM = "azure_custom" + LOCAL = "local" + + +def setup_search_info( + search_service: str, + index_name: str, + azure_credential: AsyncTokenCredential, + use_agentic_retrieval: Optional[bool] = None, + azure_openai_endpoint: Optional[str] = None, + agent_name: Optional[str] = None, + agent_max_output_tokens: Optional[int] = None, + azure_openai_searchagent_deployment: Optional[str] = None, + azure_openai_searchagent_model: Optional[str] = None, + search_key: Optional[str] = None, + azure_vision_endpoint: Optional[str] = None, +) -> SearchInfo: + """Setup search service information.""" + search_creds: AsyncTokenCredential | AzureKeyCredential = ( + azure_credential if search_key is None else AzureKeyCredential(search_key) + ) + if use_agentic_retrieval and azure_openai_searchagent_model is None: + raise ValueError("Azure OpenAI SearchAgent model must be specified when using agentic retrieval.") + + return SearchInfo( + endpoint=f"https://{search_service}.search.windows.net/", + credential=search_creds, + index_name=index_name, + agent_name=agent_name, + agent_max_output_tokens=agent_max_output_tokens, + use_agentic_retrieval=use_agentic_retrieval, + azure_openai_endpoint=azure_openai_endpoint, + azure_openai_searchagent_model=azure_openai_searchagent_model, + azure_openai_searchagent_deployment=azure_openai_searchagent_deployment, + azure_vision_endpoint=azure_vision_endpoint, + ) + + +def setup_openai_client( + openai_host: OpenAIHost, + azure_credential: AsyncTokenCredential, + azure_openai_api_key: Optional[str] = None, + azure_openai_service: Optional[str] = None, + azure_openai_custom_url: Optional[str] = None, + openai_api_key: Optional[str] = None, + openai_organization: Optional[str] = None, +) -> tuple[AsyncOpenAI, Optional[str]]: + openai_client: AsyncOpenAI + azure_openai_endpoint: Optional[str] = None + + if openai_host in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]: + base_url: Optional[str] = None 
+ api_key_or_token: Optional[str | Callable[[], Awaitable[str]]] = None + if openai_host == OpenAIHost.AZURE_CUSTOM: + logger.info("OPENAI_HOST is azure_custom, setting up Azure OpenAI custom client") + if not azure_openai_custom_url: + raise ValueError("AZURE_OPENAI_CUSTOM_URL must be set when OPENAI_HOST is azure_custom") + base_url = azure_openai_custom_url + else: + logger.info("OPENAI_HOST is azure, setting up Azure OpenAI client") + if not azure_openai_service: + raise ValueError("AZURE_OPENAI_SERVICE must be set when OPENAI_HOST is azure") + azure_openai_endpoint = f"https://{azure_openai_service}.openai.azure.com" + base_url = f"{azure_openai_endpoint}/openai/v1" + if azure_openai_api_key: + logger.info("AZURE_OPENAI_API_KEY_OVERRIDE found, using as api_key for Azure OpenAI client") + api_key_or_token = azure_openai_api_key + else: + logger.info("Using Azure credential (passwordless authentication) for Azure OpenAI client") + api_key_or_token = get_bearer_token_provider( + azure_credential, "https://cognitiveservices.azure.com/.default" + ) + openai_client = AsyncOpenAI( + base_url=base_url, + api_key=api_key_or_token, # type: ignore[arg-type] + ) + elif openai_host == OpenAIHost.LOCAL: + logger.info("OPENAI_HOST is local, setting up local OpenAI client for OPENAI_BASE_URL with no key") + openai_client = AsyncOpenAI( + base_url=os.environ["OPENAI_BASE_URL"], + api_key="no-key-required", + ) + else: + logger.info( + "OPENAI_HOST is not azure, setting up OpenAI client using OPENAI_API_KEY and OPENAI_ORGANIZATION environment variables" + ) + if openai_api_key is None: + raise ValueError("OpenAI key is required when using the non-Azure OpenAI API") + openai_client = AsyncOpenAI( + api_key=openai_api_key, + organization=openai_organization, + ) + return openai_client, azure_openai_endpoint + + +def setup_image_embeddings_service( + azure_credential: AsyncTokenCredential, + vision_endpoint: Optional[str], + use_multimodal: bool, +) -> ImageEmbeddings | None: + 
image_embeddings_service: Optional[ImageEmbeddings] = None + if use_multimodal: + if vision_endpoint is None: + raise ValueError("An Azure AI Vision endpoint must be provided to use multimodal features.") + image_embeddings_service = ImageEmbeddings( + endpoint=vision_endpoint, + token_provider=get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default"), + ) + return image_embeddings_service + + +def setup_embeddings_service( + openai_host: OpenAIHost, + open_ai_client: AsyncOpenAI, + emb_model_name: str, + emb_model_dimensions: int, + azure_openai_deployment: Optional[str] = None, + azure_openai_endpoint: Optional[str] = None, + disable_batch: bool = False, +) -> OpenAIEmbeddings: + if openai_host in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]: + if azure_openai_endpoint is None: + raise ValueError("Azure OpenAI endpoint must be provided when using Azure OpenAI embeddings") + if azure_openai_deployment is None: + raise ValueError("Azure OpenAI deployment must be provided when using Azure OpenAI embeddings") + + return OpenAIEmbeddings( + open_ai_client=open_ai_client, + open_ai_model_name=emb_model_name, + open_ai_dimensions=emb_model_dimensions, + disable_batch=disable_batch, + azure_deployment_name=azure_openai_deployment, + azure_endpoint=azure_openai_endpoint, + ) + + +def setup_blob_manager( + azure_credential: AsyncTokenCredential | str, + storage_account: str, + storage_container: str, + storage_resource_group: Optional[str] = None, + subscription_id: Optional[str] = None, + storage_key: Optional[str] = None, + image_storage_container: Optional[str] = None, +) -> BlobManager: + """Create a BlobManager instance for document or figure storage. + + The optional resource group and subscription are retained for parity with + local ingestion (used for diagnostic operations) but not required by + Azure Functions. + The optional image storage container is used for the multimodal ingestion feature. 
+ """ + endpoint = f"https://{storage_account}.blob.core.windows.net" + storage_credential: AsyncTokenCredential | str = azure_credential if storage_key is None else storage_key + + return BlobManager( + endpoint=endpoint, + container=storage_container, + account=storage_account, + credential=storage_credential, + resource_group=storage_resource_group, + subscription_id=subscription_id, + image_container=image_storage_container, + ) + + +def setup_figure_processor( + *, + credential: AsyncTokenCredential | None, + use_multimodal: bool, + use_content_understanding: bool, + content_understanding_endpoint: str | None, + openai_client: object | None, + openai_model: str | None, + openai_deployment: str | None, +) -> FigureProcessor | None: + """Create a FigureProcessor based on feature flags. + + Priority order: + 1. use_multimodal -> MediaDescriptionStrategy.OPENAI + 2. else if use_content_understanding and endpoint -> CONTENTUNDERSTANDING + 3. else -> return None (no figure description) + """ + if use_multimodal: + return FigureProcessor( + credential=credential, + strategy=MediaDescriptionStrategy.OPENAI, + openai_client=openai_client, + openai_model=openai_model, + openai_deployment=openai_deployment, + ) + if use_content_understanding and content_understanding_endpoint: + return FigureProcessor( + credential=credential, + strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING, + content_understanding_endpoint=content_understanding_endpoint, + ) + return None + + +def build_file_processors( + *, + azure_credential: AsyncTokenCredential, + document_intelligence_service: str | None, + document_intelligence_key: str | None = None, + use_local_pdf_parser: bool = False, + use_local_html_parser: bool = False, + process_figures: bool = False, +) -> dict[str, FileProcessor]: + sentence_text_splitter = SentenceTextSplitter() + + doc_int_parser: Optional[DocumentAnalysisParser] = None + # check if Azure Document Intelligence credentials are provided + if 
document_intelligence_service: + credential: AsyncTokenCredential | AzureKeyCredential + if document_intelligence_key: + credential = AzureKeyCredential(document_intelligence_key) + else: + credential = azure_credential + doc_int_parser = DocumentAnalysisParser( + endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", + credential=credential, + process_figures=process_figures, + ) + + pdf_parser: Optional[Parser] = None + if use_local_pdf_parser or document_intelligence_service is None: + pdf_parser = LocalPdfParser() + elif doc_int_parser is not None: + pdf_parser = doc_int_parser + else: + logger.warning("No PDF parser available") + + html_parser: Optional[Parser] = None + if use_local_html_parser or document_intelligence_service is None: + html_parser = LocalHTMLParser() + elif doc_int_parser is not None: + html_parser = doc_int_parser + else: + logger.warning("No HTML parser available") + + # These file formats can always be parsed: + file_processors = { + ".json": FileProcessor(JsonParser(), SimpleTextSplitter()), + ".md": FileProcessor(TextParser(), sentence_text_splitter), + ".txt": FileProcessor(TextParser(), sentence_text_splitter), + ".csv": FileProcessor(CsvParser(), sentence_text_splitter), + } + # These require either a Python package or Document Intelligence + if pdf_parser is not None: + file_processors.update({".pdf": FileProcessor(pdf_parser, sentence_text_splitter)}) + if html_parser is not None: + file_processors.update({".html": FileProcessor(html_parser, sentence_text_splitter)}) + # These file formats require Document Intelligence + if doc_int_parser is not None: + file_processors.update( + { + ".docx": FileProcessor(doc_int_parser, sentence_text_splitter), + ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter), + ".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter), + ".png": FileProcessor(doc_int_parser, sentence_text_splitter), + ".jpg": FileProcessor(doc_int_parser, sentence_text_splitter), + 
".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter), + ".tiff": FileProcessor(doc_int_parser, sentence_text_splitter), + ".bmp": FileProcessor(doc_int_parser, sentence_text_splitter), + ".heic": FileProcessor(doc_int_parser, sentence_text_splitter), + } + ) + return file_processors + + +def select_processor_for_filename(file_name: str, file_processors: dict[str, FileProcessor]) -> FileProcessor: + """Select the appropriate file processor for a given filename. + + Args: + file_name: Name of the file to process + file_processors: Dictionary mapping file extensions to FileProcessor instances + + Returns: + FileProcessor instance for the file + + Raises: + ValueError: If the file extension is not supported + """ + file_ext = os.path.splitext(file_name)[1].lower() + file_processor = file_processors.get(file_ext) + if not file_processor: + raise ValueError(f"Unsupported file type: {file_name}") + return file_processor diff --git a/app/backend/prepdocslib/textprocessor.py b/app/backend/prepdocslib/textprocessor.py new file mode 100644 index 0000000000..2895a80588 --- /dev/null +++ b/app/backend/prepdocslib/textprocessor.py @@ -0,0 +1,51 @@ +"""Utilities for processing document text and combining it with figure descriptions.""" + +import logging + +from .figureprocessor import build_figure_markup +from .listfilestrategy import File +from .page import Page +from .searchmanager import Section +from .textsplitter import TextSplitter + +logger = logging.getLogger("scripts") + + +def combine_text_with_figures(page: "Page") -> None: + """Replace figure placeholders in page text with full description markup.""" + for image in page.images: + if image.description and image.placeholder in page.text: + figure_markup = build_figure_markup(image, image.description) + page.text = page.text.replace(image.placeholder, figure_markup) + logger.info("Replaced placeholder for figure %s with description markup", image.figure_id) + elif not image.description: + logger.debug("No 
description for figure %s; keeping placeholder", image.figure_id) + elif image.placeholder not in page.text: + logger.warning("Placeholder not found for figure %s in page %d", image.figure_id, page.page_num) + + +def process_text( + pages: list["Page"], + file: "File", + splitter: "TextSplitter", + category: str | None = None, +) -> list["Section"]: + """Process document text and figures into searchable sections. + Combines text with figure descriptions, splits into chunks, and + associates figures with their containing sections. + """ + # Step 1: Combine text with figures on each page + for page in pages: + combine_text_with_figures(page) + + # Step 2: Split combined text into chunks + logger.info("Splitting '%s' into sections", file.filename()) + sections = [Section(chunk, content=file, category=category) for chunk in splitter.split_pages(pages)] + + # Step 3: Add images back to each section based on page number + for section in sections: + section.chunk.images = [ + image for page in pages if page.page_num == section.chunk.page_num for image in page.images + ] + + return sections diff --git a/app/backend/requirements.in b/app/backend/requirements.in index 756a857192..ba8af3ef36 100644 --- a/app/backend/requirements.in +++ b/app/backend/requirements.in @@ -1,3 +1,4 @@ +azure-functions>=1.24.0 azure-identity quart quart-cors diff --git a/app/backend/requirements.txt b/app/backend/requirements.txt index 5eab109d1f..ccdb3b8a00 100644 --- a/app/backend/requirements.txt +++ b/app/backend/requirements.txt @@ -47,6 +47,8 @@ azure-core-tracing-opentelemetry==1.0.0b11 # via azure-monitor-opentelemetry azure-cosmos==4.9.0 # via -r requirements.in +azure-functions==1.24.0 + # via -r requirements.in azure-identity==1.17.1 # via # -r requirements.in @@ -436,6 +438,7 @@ uvicorn==0.30.6 # via -r requirements.in werkzeug==3.1.3 # via + # azure-functions # flask # quart wrapt==1.16.0 diff --git a/app/backend/setup_cloud_ingestion.py b/app/backend/setup_cloud_ingestion.py new file 
mode 100644 index 0000000000..ac9e617d0e --- /dev/null +++ b/app/backend/setup_cloud_ingestion.py @@ -0,0 +1,175 @@ +"""Script to setup cloud ingestion for Azure AI Search.""" + +import asyncio +import logging +import os + +from azure.core.credentials_async import AsyncTokenCredential +from azure.identity.aio import AzureDeveloperCliCredential +from openai import AsyncOpenAI +from rich.logging import RichHandler + +from load_azd_env import load_azd_env +from prepdocslib.blobmanager import BlobManager +from prepdocslib.cloudingestionstrategy import CloudIngestionStrategy +from prepdocslib.listfilestrategy import LocalListFileStrategy +from prepdocslib.servicesetup import ( + OpenAIHost, + clean_key_if_exists, + setup_blob_manager, + setup_embeddings_service, + setup_openai_client, + setup_search_info, +) +from prepdocslib.strategy import DocumentAction + +logger = logging.getLogger("scripts") + + +async def setup_cloud_ingestion_strategy( + azure_credential: AsyncTokenCredential, + document_action: DocumentAction = DocumentAction.Add, +) -> tuple[CloudIngestionStrategy, AsyncOpenAI, AsyncTokenCredential, BlobManager]: + """Setup the cloud ingestion strategy with all required services.""" + + # Get environment variables + search_service = os.environ["AZURE_SEARCH_SERVICE"] + index_name = os.environ["AZURE_SEARCH_INDEX"] + search_user_assigned_identity_resource_id = os.environ["AZURE_SEARCH_USER_ASSIGNED_IDENTITY_RESOURCE_ID"] + storage_account = os.environ["AZURE_STORAGE_ACCOUNT"] + storage_container = os.environ["AZURE_STORAGE_CONTAINER"] + storage_resource_group = os.environ["AZURE_STORAGE_RESOURCE_GROUP"] + subscription_id = os.environ["AZURE_SUBSCRIPTION_ID"] + image_storage_container = os.environ.get("AZURE_IMAGESTORAGE_CONTAINER") + search_embedding_field = os.environ["AZURE_SEARCH_FIELD_NAME_EMBEDDING"] + + # Cloud ingestion specific endpoints + document_extractor_uri = os.environ["DOCUMENT_EXTRACTOR_SKILL_ENDPOINT"] + document_extractor_resource_id = 
os.environ["DOCUMENT_EXTRACTOR_SKILL_AUTH_RESOURCE_ID"] + figure_processor_uri = os.environ["FIGURE_PROCESSOR_SKILL_ENDPOINT"] + figure_processor_resource_id = os.environ["FIGURE_PROCESSOR_SKILL_AUTH_RESOURCE_ID"] + text_processor_uri = os.environ["TEXT_PROCESSOR_SKILL_ENDPOINT"] + text_processor_resource_id = os.environ["TEXT_PROCESSOR_SKILL_AUTH_RESOURCE_ID"] + + # Feature flags + use_multimodal = os.getenv("USE_MULTIMODAL", "").lower() == "true" + use_acls = os.getenv("AZURE_USE_AUTHENTICATION", "").lower() == "true" + enforce_access_control = os.getenv("AZURE_ENFORCE_ACCESS_CONTROL", "").lower() == "true" + + # Setup search info + search_info = setup_search_info( + search_service=search_service, + index_name=index_name, + azure_credential=azure_credential, + azure_vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"), + ) + + # Setup blob manager + blob_manager = setup_blob_manager( + azure_credential=azure_credential, + storage_account=storage_account, + storage_container=storage_container, + storage_resource_group=storage_resource_group, + subscription_id=subscription_id, + storage_key=None, + image_storage_container=image_storage_container, + ) + + # Setup OpenAI embeddings + OPENAI_HOST = OpenAIHost(os.environ["OPENAI_HOST"]) + openai_client, azure_openai_endpoint = setup_openai_client( + openai_host=OPENAI_HOST, + azure_credential=azure_credential, + azure_openai_service=os.getenv("AZURE_OPENAI_SERVICE"), + azure_openai_custom_url=os.getenv("AZURE_OPENAI_CUSTOM_URL"), + azure_openai_api_key=os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE"), + openai_api_key=clean_key_if_exists(os.getenv("OPENAI_API_KEY")), + openai_organization=os.getenv("OPENAI_ORGANIZATION"), + ) + + emb_model_dimensions = 1536 + if os.getenv("AZURE_OPENAI_EMB_DIMENSIONS"): + emb_model_dimensions = int(os.environ["AZURE_OPENAI_EMB_DIMENSIONS"]) + + openai_embeddings_service = setup_embeddings_service( + OPENAI_HOST, + openai_client, + emb_model_name=os.environ["AZURE_OPENAI_EMB_MODEL_NAME"], + 
emb_model_dimensions=emb_model_dimensions, + azure_openai_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT"), + azure_openai_endpoint=azure_openai_endpoint, + disable_batch=False, + ) + + # Create a list file strategy for uploading files from the data folder + list_file_strategy = LocalListFileStrategy(path_pattern="data/*", enable_global_documents=False) + + # Create the cloud ingestion strategy + ingestion_strategy = CloudIngestionStrategy( + list_file_strategy=list_file_strategy, + blob_manager=blob_manager, + search_info=search_info, + embeddings=openai_embeddings_service, + search_field_name_embedding=search_embedding_field, + document_extractor_uri=document_extractor_uri, + document_extractor_auth_resource_id=document_extractor_resource_id, + figure_processor_uri=figure_processor_uri, + figure_processor_auth_resource_id=figure_processor_resource_id, + text_processor_uri=text_processor_uri, + text_processor_auth_resource_id=text_processor_resource_id, + subscription_id=subscription_id, + document_action=document_action, + search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"), + use_acls=use_acls, + use_multimodal=use_multimodal, + enforce_access_control=enforce_access_control, + search_user_assigned_identity_resource_id=search_user_assigned_identity_resource_id, + ) + + return ingestion_strategy, openai_client, azure_credential, blob_manager + + +async def main(): + """Main function to setup cloud ingestion.""" + load_azd_env() + + # Check if cloud ingestion is enabled + use_cloud_ingestion = os.getenv("USE_CLOUD_INGESTION", "").lower() == "true" + if not use_cloud_ingestion: + logger.info("Cloud ingestion is not enabled. 
Skipping setup.") + return + + # Setup logging + logging.basicConfig(format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)]) + logger.setLevel(logging.INFO) + + logger.info("Setting up cloud ingestion...") + + # Use the current user identity to connect to Azure services + if tenant_id := os.getenv("AZURE_TENANT_ID"): + logger.info("Connecting to Azure services using the azd credential for tenant %s", tenant_id) + azd_credential = AzureDeveloperCliCredential(tenant_id=tenant_id, process_timeout=60) + else: + logger.info("Connecting to Azure services using the azd credential for home tenant") + azd_credential = AzureDeveloperCliCredential(process_timeout=60) + + try: + ingestion_strategy, openai_client, credential, blob_manager = await setup_cloud_ingestion_strategy( + azure_credential=azd_credential, + document_action=DocumentAction.Add, + ) + + # Setup the indexer, skillset, and data source + logger.info("Setting up indexer, skillset, and data source...") + await ingestion_strategy.setup() + logger.info("Triggering initial indexing run...") + await ingestion_strategy.run() + + finally: + await blob_manager.close_clients() + await openai_client.close() + await azd_credential.close() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/app/functions/__init__.py b/app/functions/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/app/functions/document_extractor/.funcignore b/app/functions/document_extractor/.funcignore new file mode 100644 index 0000000000..5470b4c51a --- /dev/null +++ b/app/functions/document_extractor/.funcignore @@ -0,0 +1,11 @@ +.git* +.vscode +__pycache__ +*.pyc +.python_packages +.venv +local.settings.json +test +.pytest_cache +.coverage +*.md diff --git a/app/functions/document_extractor/function_app.py b/app/functions/document_extractor/function_app.py new file mode 100644 index 0000000000..83e0bd3f85 --- /dev/null +++ b/app/functions/document_extractor/function_app.py @@ -0,0 
+1,250 @@ +""" +Azure Function: Document Extractor +Custom skill for Azure AI Search that extracts and processes document content. +""" + +import base64 +import io +import json +import logging +import os +from dataclasses import dataclass +from typing import Any + +import azure.functions as func +from azure.core.exceptions import HttpResponseError +from azure.identity.aio import ManagedIdentityCredential + +from prepdocslib.fileprocessor import FileProcessor +from prepdocslib.page import Page +from prepdocslib.servicesetup import ( + build_file_processors, + select_processor_for_filename, +) + +app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS) + +logger = logging.getLogger(__name__) + + +@dataclass +class GlobalSettings: + file_processors: dict[str, FileProcessor] + azure_credential: ManagedIdentityCredential + + +settings: GlobalSettings | None = None + + +def configure_global_settings(): + global settings + + # Environment configuration + use_local_pdf_parser = os.getenv("USE_LOCAL_PDF_PARSER", "false").lower() == "true" + use_local_html_parser = os.getenv("USE_LOCAL_HTML_PARSER", "false").lower() == "true" + use_multimodal = os.getenv("USE_MULTIMODAL", "false").lower() == "true" + document_intelligence_service = os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE") + + # Single shared managed identity credential + if AZURE_CLIENT_ID := os.getenv("AZURE_CLIENT_ID"): + logger.info("Using Managed Identity with client ID: %s", AZURE_CLIENT_ID) + azure_credential = ManagedIdentityCredential(client_id=AZURE_CLIENT_ID) + else: + logger.info("Using default Managed Identity without client ID") + azure_credential = ManagedIdentityCredential() + + # Build file processors dict for parser selection + file_processors = build_file_processors( + azure_credential=azure_credential, + document_intelligence_service=document_intelligence_service, + document_intelligence_key=None, + use_local_pdf_parser=use_local_pdf_parser, + use_local_html_parser=use_local_html_parser, + 
process_figures=use_multimodal, + ) + + settings = GlobalSettings( + file_processors=file_processors, + azure_credential=azure_credential, + ) + + +@app.function_name(name="extract") +@app.route(route="extract", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS) +async def extract_document(req: func.HttpRequest) -> func.HttpResponse: + """ + Azure Search Custom Skill: Extract document content + + Input format (single record; file data only): + # https://learn.microsoft.com/azure/search/cognitive-search-skill-document-intelligence-layout#skill-inputs + { + "values": [ + { + "recordId": "1", + "data": { + // Base64 encoded file (skillset must enable file data) + "file_data": { + "$type": "file", + "data": "base64..." + }, + // Optional + "file_name": "doc.pdf" + } + } + ] + } + + Output format (snake_case only): + { + "values": [ + { + "recordId": "1", + "data": { + "pages": [ + {"page_num": 0, "text": "Page 1 text", "figure_ids": ["fig1"]}, + {"page_num": 1, "text": "Page 2 text", "figure_ids": []} + ], + "figures": [ + { + "figure_id": "fig1", + "page_num": 0, + "document_file_name": "doc.pdf", + "filename": "fig1.png", + "mime_type": "image/png", + "bytes_base64": "...", + "bbox": [100,150,300,400], + "title": "Figure Title", + "placeholder": "
" + } + ] + }, + "errors": [], + "warnings": [] + } + ] + } + """ + if settings is None: + return func.HttpResponse( + json.dumps({"error": "Settings not initialized"}), + mimetype="application/json", + status_code=500, + ) + + try: + # Parse custom skill input + req_body = req.get_json() + input_values = req_body.get("values", []) + + if len(input_values) != 1: + raise ValueError("document_extractor expects exactly one record per request, set batchSize to 1.") + + input_record = input_values[0] + record_id = input_record.get("recordId", "") + data = input_record.get("data", {}) + + try: + result = await process_document(data) + output_values = [ + { + "recordId": record_id, + "data": result, + "errors": [], + "warnings": [], + } + ] + except Exception as e: + logger.error(f"Error processing record {record_id}: {str(e)}", exc_info=True) + output_values = [ + { + "recordId": record_id, + "data": {}, + "errors": [{"message": str(e)}], + "warnings": [], + } + ] + + return func.HttpResponse(json.dumps({"values": output_values}), mimetype="application/json", status_code=200) + + except Exception as e: + logger.error(f"Fatal error in extract_document: {str(e)}", exc_info=True) + return func.HttpResponse(json.dumps({"error": str(e)}), mimetype="application/json", status_code=500) + + +async def process_document(data: dict[str, Any]) -> dict[str, Any]: + """ + Process a single document: download, parse, extract figures, upload images + + Args: + data: Input data with blobUrl, fileName, contentType + + Returns: + Dictionary with 'text' (markdown) and 'images' (list of {url, description}) + """ + document_stream, file_name, content_type = get_document_stream_filedata(data) + logger.info("Processing document: %s", file_name) + + # Get parser from file_processors dict based on file extension + file_processor = select_processor_for_filename(file_name, settings.file_processors) + parser = file_processor.parser + + pages: list[Page] = [] + try: + document_stream.seek(0) + pages = 
[page async for page in parser.parse(content=document_stream)] + except HttpResponseError as exc: + raise ValueError(f"Parser failed for {file_name}: {exc.message}") from exc + finally: + document_stream.close() + + components = build_document_components(file_name, pages) + return components + + +def get_document_stream_filedata(data: dict[str, Any]) -> tuple[io.BytesIO, str, str]: + """Return a BytesIO stream for file_data input only (skillset must send file bytes).""" + file_payload = data.get("file_data", {}) + encoded = file_payload.get("data") + if not encoded: + raise ValueError("file_data payload missing base64 data") + document_bytes = base64.b64decode(encoded) + file_name = data.get("file_name") or data.get("fileName") or file_payload.get("name") or "document" + content_type = data.get("contentType") or file_payload.get("contentType") or "application/octet-stream" + stream = io.BytesIO(document_bytes) + stream.name = file_name + return stream, file_name, content_type + + +def build_document_components(file_name: str, pages: list[Page]) -> dict[str, Any]: + page_entries: list[dict[str, Any]] = [] + figure_entries: list[dict[str, Any]] = [] + + for page in pages: + page_text = page.text or "" + figure_ids_on_page: list[str] = [] + if page.images: + for image in page.images: + figure_ids_on_page.append(image.figure_id) + figure_entries.append(image.to_skill_payload(file_name)) + + page_entries.append( + { + "page_num": page.page_num, + "text": page_text, + "figure_ids": figure_ids_on_page, + } + ) + + return { + "file_name": file_name, + "pages": page_entries, + "figures": figure_entries, + } + + +# Initialize settings at module load time, unless we're in a test environment +if os.environ.get("PYTEST_CURRENT_TEST") is None: + try: + configure_global_settings() + except KeyError as e: + logger.warning("Could not initialize settings at module load time: %s", e) diff --git a/app/functions/document_extractor/host.json b/app/functions/document_extractor/host.json 
new file mode 100644 index 0000000000..c00cc23f37 --- /dev/null +++ b/app/functions/document_extractor/host.json @@ -0,0 +1,27 @@ +{ + "version": "2.0", + "extensions": { + "mcp": { + "system": { + "webhookAuthorizationLevel": "anonymous" + } + } + }, + "extensionBundle": { + "id": "Microsoft.Azure.Functions.ExtensionBundle", + "version": "[4.*, 5.0.0)" + }, + "functionTimeout": "00:10:00", + "logging": { + "logLevel": { + "default": "Information", + "Function": "Information" + }, + "applicationInsights": { + "samplingSettings": { + "isEnabled": true, + "maxTelemetryItemsPerSecond": 20 + } + } + } +} diff --git a/app/functions/document_extractor/requirements.txt b/app/functions/document_extractor/requirements.txt new file mode 100644 index 0000000000..ccdb3b8a00 --- /dev/null +++ b/app/functions/document_extractor/requirements.txt @@ -0,0 +1,456 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o requirements.txt --python-version 3.10 +aiofiles==24.1.0 + # via + # prompty + # quart +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.14 + # via + # -r requirements.in + # microsoft-kiota-authentication-azure +aiosignal==1.4.0 + # via aiohttp +annotated-types==0.7.0 + # via pydantic +anyio==4.4.0 + # via + # httpx + # openai +asgiref==3.10.0 + # via opentelemetry-instrumentation-asgi +async-timeout==5.0.1 + # via aiohttp +attrs==25.3.0 + # via aiohttp +azure-ai-documentintelligence==1.0.0b4 + # via -r requirements.in +azure-cognitiveservices-speech==1.40.0 + # via -r requirements.in +azure-common==1.1.28 + # via azure-search-documents +azure-core==1.35.0 + # via + # azure-ai-documentintelligence + # azure-core-tracing-opentelemetry + # azure-cosmos + # azure-identity + # azure-monitor-opentelemetry + # azure-monitor-opentelemetry-exporter + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # microsoft-kiota-authentication-azure + # msrest +azure-core-tracing-opentelemetry==1.0.0b11 
+ # via azure-monitor-opentelemetry +azure-cosmos==4.9.0 + # via -r requirements.in +azure-functions==1.24.0 + # via -r requirements.in +azure-identity==1.17.1 + # via + # -r requirements.in + # azure-monitor-opentelemetry-exporter + # msgraph-sdk +azure-monitor-opentelemetry==1.8.1 + # via -r requirements.in +azure-monitor-opentelemetry-exporter==1.0.0b44 + # via azure-monitor-opentelemetry +azure-search-documents==11.7.0b1 + # via -r requirements.in +azure-storage-blob==12.22.0 + # via + # -r requirements.in + # azure-storage-file-datalake +azure-storage-file-datalake==12.16.0 + # via -r requirements.in +beautifulsoup4==4.12.3 + # via -r requirements.in +blinker==1.9.0 + # via + # flask + # quart +certifi==2024.7.4 + # via + # httpcore + # httpx + # msrest + # requests +cffi==1.17.0 + # via cryptography +charset-normalizer==3.3.2 + # via requests +click==8.3.0 + # via + # flask + # prompty + # quart + # uvicorn +cryptography==44.0.1 + # via + # -r requirements.in + # azure-identity + # azure-storage-blob + # msal + # pyjwt +distro==1.9.0 + # via openai +exceptiongroup==1.3.0 + # via + # anyio + # hypercorn + # taskgroup +fixedint==0.1.6 + # via azure-monitor-opentelemetry-exporter +flask==3.1.2 + # via quart +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +h11==0.16.0 + # via + # httpcore + # hypercorn + # uvicorn + # wsproto +h2==4.3.0 + # via + # httpx + # hypercorn +hpack==4.1.0 + # via h2 +httpcore==1.0.9 + # via httpx +httpx==0.28.1 + # via + # microsoft-kiota-http + # msgraph-core + # openai +hypercorn==0.17.3 + # via quart +hyperframe==6.1.0 + # via h2 +idna==3.10 + # via + # anyio + # httpx + # requests + # yarl +importlib-metadata==8.0.0 + # via opentelemetry-api +isodate==0.6.1 + # via + # azure-ai-documentintelligence + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # msrest +itsdangerous==2.2.0 + # via + # flask + # quart +jinja2==3.1.6 + # via + # flask + # prompty + # quart +jiter==0.11.0 + # via openai 
+markdown-it-py==3.0.0 + # via rich +markupsafe==3.0.3 + # via + # flask + # jinja2 + # quart + # werkzeug +mdurl==0.1.2 + # via markdown-it-py +microsoft-kiota-abstractions==1.9.3 + # via + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # microsoft-kiota-serialization-form + # microsoft-kiota-serialization-json + # microsoft-kiota-serialization-multipart + # microsoft-kiota-serialization-text + # msgraph-core +microsoft-kiota-authentication-azure==1.9.3 + # via msgraph-core +microsoft-kiota-http==1.9.3 + # via msgraph-core +microsoft-kiota-serialization-form==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-json==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-multipart==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-text==1.9.3 + # via msgraph-sdk +msal==1.33.0 + # via + # -r requirements.in + # azure-identity + # msal-extensions +msal-extensions==1.3.1 + # via azure-identity +msgraph-core==1.3.3 + # via msgraph-sdk +msgraph-sdk==1.45.0 + # via -r requirements.in +msrest==0.7.1 + # via azure-monitor-opentelemetry-exporter +multidict==6.7.0 + # via + # aiohttp + # yarl +oauthlib==3.3.1 + # via requests-oauthlib +openai==2.6.1 + # via -r requirements.in +opentelemetry-api==1.38.0 + # via + # azure-core-tracing-opentelemetry + # azure-monitor-opentelemetry-exporter + # microsoft-kiota-abstractions + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-psycopg2 + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # 
opentelemetry-instrumentation-wsgi + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-instrumentation==0.59b0 + # via + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-psycopg2 + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi +opentelemetry-instrumentation-aiohttp-client==0.59b0 + # via -r requirements.in +opentelemetry-instrumentation-asgi==0.59b0 + # via + # -r requirements.in + # opentelemetry-instrumentation-fastapi +opentelemetry-instrumentation-dbapi==0.59b0 + # via opentelemetry-instrumentation-psycopg2 +opentelemetry-instrumentation-django==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-fastapi==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-flask==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-httpx==0.59b0 + # via -r requirements.in +opentelemetry-instrumentation-openai==0.47.5 + # via -r requirements.in +opentelemetry-instrumentation-psycopg2==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-requests==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-urllib==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-urllib3==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-wsgi==0.59b0 + # via + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-flask +opentelemetry-resource-detector-azure==0.1.5 + # via azure-monitor-opentelemetry +opentelemetry-sdk==1.38.0 + # via + # azure-monitor-opentelemetry + # 
azure-monitor-opentelemetry-exporter + # microsoft-kiota-abstractions + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # opentelemetry-resource-detector-azure +opentelemetry-semantic-conventions==0.59b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi + # opentelemetry-sdk +opentelemetry-semantic-conventions-ai==0.4.13 + # via opentelemetry-instrumentation-openai +opentelemetry-util-http==0.59b0 + # via + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi +packaging==24.1 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-flask +pillow==12.0.0 + # via -r requirements.in +priority==2.0.0 + # via hypercorn +prompty==0.1.50 + # via -r requirements.in +propcache==0.2.0 + # via + # aiohttp + # yarl +psutil==7.1.2 + # via azure-monitor-opentelemetry-exporter +pycparser==2.22 + # via cffi +pydantic==2.12.3 + # via openai +pydantic-core==2.41.4 + # via pydantic +pygments==2.19.2 + # via rich +pyjwt==2.10.1 + # via + # -r requirements.in + # msal +pymupdf==1.26.0 + # via -r requirements.in +pypdf==6.1.3 + # via -r requirements.in +python-dotenv==1.1.1 + # via + # -r requirements.in + # prompty 
+pyyaml==6.0.2 + # via prompty +quart==0.20.0 + # via + # -r requirements.in + # quart-cors +quart-cors==0.7.0 + # via -r requirements.in +regex==2025.7.34 + # via tiktoken +requests==2.32.4 + # via + # azure-core + # msal + # msrest + # requests-oauthlib + # tiktoken +requests-oauthlib==2.0.0 + # via msrest +rich==14.1.0 + # via -r requirements.in +six==1.16.0 + # via + # azure-core + # isodate +sniffio==1.3.1 + # via + # anyio + # openai +soupsieve==2.7 + # via beautifulsoup4 +std-uritemplate==2.0.5 + # via microsoft-kiota-abstractions +taskgroup==0.2.2 + # via hypercorn +tenacity==9.1.2 + # via -r requirements.in +tiktoken==0.12.0 + # via -r requirements.in +tomli==2.2.1 + # via hypercorn +tqdm==4.66.5 + # via openai +types-beautifulsoup4==4.12.0.20240511 + # via -r requirements.in +types-html5lib==1.1.11.20241018 + # via types-beautifulsoup4 +types-pillow==10.2.0.20240822 + # via -r requirements.in +typing-extensions==4.15.0 + # via + # -r requirements.in + # aiosignal + # anyio + # asgiref + # azure-ai-documentintelligence + # azure-core + # azure-cosmos + # azure-identity + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # exceptiongroup + # hypercorn + # multidict + # openai + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # pydantic + # pydantic-core + # pypdf + # taskgroup + # typing-inspection + # uvicorn +typing-inspection==0.4.2 + # via pydantic +urllib3==2.5.0 + # via requests +uvicorn==0.30.6 + # via -r requirements.in +werkzeug==3.1.3 + # via + # azure-functions + # flask + # quart +wrapt==1.16.0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-urllib3 +wsproto==1.2.0 + # via hypercorn +yarl==1.17.2 + # via aiohttp +zipp==3.21.0 + # via importlib-metadata diff --git a/app/functions/figure_processor/.funcignore 
b/app/functions/figure_processor/.funcignore new file mode 100644 index 0000000000..5470b4c51a --- /dev/null +++ b/app/functions/figure_processor/.funcignore @@ -0,0 +1,11 @@ +.git* +.vscode +__pycache__ +*.pyc +.python_packages +.venv +local.settings.json +test +.pytest_cache +.coverage +*.md diff --git a/app/functions/figure_processor/function_app.py b/app/functions/figure_processor/function_app.py new file mode 100644 index 0000000000..2399171d31 --- /dev/null +++ b/app/functions/figure_processor/function_app.py @@ -0,0 +1,196 @@ +""" +Azure Function: Figure Processor +Custom skill for Azure AI Search that enriches figure payloads emitted by the document extractor. + +This function: +1. Accepts raw figure bytes and metadata (one record per request due to skill fanout). +2. Uploads rendered figure images to blob storage with citation overlays. +3. Generates natural-language captions via Azure OpenAI or Content Understanding (when configured). +4. Optionally computes image embeddings using Azure AI Vision (when multimodal is enabled). +5. Returns enriched figure metadata back to the indexer for downstream text processing. 
+""" + +import json +import logging +import os +from dataclasses import dataclass +from typing import Any + +import azure.functions as func +from azure.identity.aio import ManagedIdentityCredential, get_bearer_token_provider + +from prepdocslib.blobmanager import BlobManager +from prepdocslib.embeddings import ImageEmbeddings +from prepdocslib.figureprocessor import FigureProcessor, process_page_image +from prepdocslib.page import ImageOnPage +from prepdocslib.servicesetup import ( + OpenAIHost, + setup_blob_manager, + setup_figure_processor, + setup_openai_client, +) + +# Mark the function as anonymous since we are protecting it with built-in auth instead +app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS) + +logger = logging.getLogger(__name__) + + +@dataclass +class GlobalSettings: + blob_manager: BlobManager + figure_processor: FigureProcessor | None + image_embeddings: ImageEmbeddings | None + + +settings: GlobalSettings | None = None + + +def configure_global_settings(): + global settings + + # Environment configuration + # Required variables + AZURE_STORAGE_ACCOUNT = os.environ["AZURE_STORAGE_ACCOUNT"] + IMAGE_CONTAINER = os.environ["AZURE_IMAGESTORAGE_CONTAINER"] + + # Optional feature flags + USE_MULTIMODAL = os.getenv("USE_MULTIMODAL", "false").lower() == "true" + USE_MEDIA_DESCRIBER_AZURE_CU = os.getenv("USE_MEDIA_DESCRIBER_AZURE_CU", "false").lower() == "true" + + # Conditionally required (based on feature flags) + CONTENT_UNDERSTANDING_ENDPOINT = os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT") + AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE") + AZURE_OPENAI_CUSTOM_URL = os.getenv("AZURE_OPENAI_CUSTOM_URL") + AZURE_OPENAI_CHATGPT_DEPLOYMENT = os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT") + AZURE_OPENAI_CHATGPT_MODEL = os.getenv("AZURE_OPENAI_CHATGPT_MODEL") + AZURE_VISION_ENDPOINT = os.getenv("AZURE_VISION_ENDPOINT") + + # Single shared managed identity credential (matches document_extractor pattern) + if AZURE_CLIENT_ID := 
os.getenv("AZURE_CLIENT_ID"): + logger.info("Using Managed Identity with client ID: %s", AZURE_CLIENT_ID) + AZURE_CREDENTIAL = ManagedIdentityCredential(client_id=AZURE_CLIENT_ID) + else: + logger.info("Using default Managed Identity without client ID") + AZURE_CREDENTIAL = ManagedIdentityCredential() + + # Blob Manager + blob_manager = setup_blob_manager( + storage_account=AZURE_STORAGE_ACCOUNT, + storage_container=IMAGE_CONTAINER, + azure_credential=AZURE_CREDENTIAL, + image_storage_container=IMAGE_CONTAINER, + ) + + # Figure Processor (with optional OpenAI for multimodal) + openai_client = None + openai_model = None + openai_deployment = None + if USE_MULTIMODAL and (AZURE_OPENAI_SERVICE or AZURE_OPENAI_CUSTOM_URL) and AZURE_OPENAI_CHATGPT_DEPLOYMENT: + openai_client, _ = setup_openai_client( + openai_host=OpenAIHost.AZURE_CUSTOM if AZURE_OPENAI_CUSTOM_URL else OpenAIHost.AZURE, + azure_credential=AZURE_CREDENTIAL, + azure_openai_service=AZURE_OPENAI_SERVICE, + azure_openai_custom_url=AZURE_OPENAI_CUSTOM_URL, + ) + openai_model = AZURE_OPENAI_CHATGPT_MODEL or AZURE_OPENAI_CHATGPT_DEPLOYMENT + openai_deployment = AZURE_OPENAI_CHATGPT_DEPLOYMENT + elif USE_MULTIMODAL and not USE_MEDIA_DESCRIBER_AZURE_CU: + logger.warning( + "USE_MULTIMODAL is true but Azure OpenAI configuration incomplete and Content Understanding not enabled" + ) + + figure_processor = setup_figure_processor( + credential=AZURE_CREDENTIAL, + use_multimodal=USE_MULTIMODAL, + use_content_understanding=USE_MEDIA_DESCRIBER_AZURE_CU, + content_understanding_endpoint=CONTENT_UNDERSTANDING_ENDPOINT, + openai_client=openai_client, + openai_model=openai_model, + openai_deployment=openai_deployment, + ) + + # Image Embeddings (optional) + if USE_MULTIMODAL and AZURE_VISION_ENDPOINT: + token_provider = get_bearer_token_provider(AZURE_CREDENTIAL, "https://cognitiveservices.azure.com/.default") + image_embeddings = ImageEmbeddings(AZURE_VISION_ENDPOINT, token_provider) + else: + image_embeddings = None + + 
settings = GlobalSettings( + blob_manager=blob_manager, + figure_processor=figure_processor, + image_embeddings=image_embeddings, + ) + + +@app.function_name(name="process_figure") +@app.route(route="process", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS) +async def process_figure_request(req: func.HttpRequest) -> func.HttpResponse: + """Entrypoint for Azure Search custom skill calls.""" + + if settings is None: + return func.HttpResponse( + json.dumps({"error": "Settings not initialized"}), + mimetype="application/json", + status_code=500, + ) + + try: + payload = req.get_json() + except ValueError as exc: + logger.error("Failed to parse request body: %s", exc) + return func.HttpResponse( + json.dumps({"error": "Invalid JSON payload"}), + mimetype="application/json", + status_code=400, + ) + + input_values = payload.get("values", []) + output_values: list[dict[str, Any]] = [] + + for record in input_values: + record_id = record.get("recordId", "") + data = record.get("data", {}) + try: + image_on_page, file_name = ImageOnPage.from_skill_payload(data) + await process_page_image( + image=image_on_page, + document_filename=file_name, + blob_manager=settings.blob_manager, + image_embeddings_client=settings.image_embeddings, + figure_processor=settings.figure_processor, + ) + figure_payload = image_on_page.to_skill_payload(file_name, include_bytes_base64=False) + output_values.append( + { + "recordId": record_id, + "data": figure_payload, + "errors": [], + "warnings": [], + } + ) + except Exception as exc: # pragma: no cover - defensive + logger.error("Error processing figure %s: %s", record_id, exc, exc_info=True) + output_values.append( + { + "recordId": record_id, + "data": {}, + "errors": [{"message": str(exc)}], + "warnings": [], + } + ) + + return func.HttpResponse( + json.dumps({"values": output_values}), + mimetype="application/json", + status_code=200, + ) + + +# Initialize settings at module load time, unless we're in a test environment +if 
os.environ.get("PYTEST_CURRENT_TEST") is None: + try: + configure_global_settings() + except KeyError as e: + logger.warning("Could not initialize settings at module load time: %s", e) diff --git a/app/functions/figure_processor/host.json b/app/functions/figure_processor/host.json new file mode 100644 index 0000000000..d26b61f2a8 --- /dev/null +++ b/app/functions/figure_processor/host.json @@ -0,0 +1,27 @@ +{ + "version": "2.0", + "extensions": { + "mcp": { + "system": { + "webhookAuthorizationLevel": "anonymous" + } + } + }, + "extensionBundle": { + "id": "Microsoft.Azure.Functions.ExtensionBundle", + "version": "[4.*, 5.0.0)" + }, + "functionTimeout": "00:06:00", + "logging": { + "logLevel": { + "default": "Information", + "Function": "Information" + }, + "applicationInsights": { + "samplingSettings": { + "isEnabled": true, + "maxTelemetryItemsPerSecond": 20 + } + } + } +} diff --git a/app/functions/figure_processor/requirements.txt b/app/functions/figure_processor/requirements.txt new file mode 100644 index 0000000000..ccdb3b8a00 --- /dev/null +++ b/app/functions/figure_processor/requirements.txt @@ -0,0 +1,456 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o requirements.txt --python-version 3.10 +aiofiles==24.1.0 + # via + # prompty + # quart +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.14 + # via + # -r requirements.in + # microsoft-kiota-authentication-azure +aiosignal==1.4.0 + # via aiohttp +annotated-types==0.7.0 + # via pydantic +anyio==4.4.0 + # via + # httpx + # openai +asgiref==3.10.0 + # via opentelemetry-instrumentation-asgi +async-timeout==5.0.1 + # via aiohttp +attrs==25.3.0 + # via aiohttp +azure-ai-documentintelligence==1.0.0b4 + # via -r requirements.in +azure-cognitiveservices-speech==1.40.0 + # via -r requirements.in +azure-common==1.1.28 + # via azure-search-documents +azure-core==1.35.0 + # via + # azure-ai-documentintelligence + # azure-core-tracing-opentelemetry + # 
azure-cosmos + # azure-identity + # azure-monitor-opentelemetry + # azure-monitor-opentelemetry-exporter + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # microsoft-kiota-authentication-azure + # msrest +azure-core-tracing-opentelemetry==1.0.0b11 + # via azure-monitor-opentelemetry +azure-cosmos==4.9.0 + # via -r requirements.in +azure-functions==1.24.0 + # via -r requirements.in +azure-identity==1.17.1 + # via + # -r requirements.in + # azure-monitor-opentelemetry-exporter + # msgraph-sdk +azure-monitor-opentelemetry==1.8.1 + # via -r requirements.in +azure-monitor-opentelemetry-exporter==1.0.0b44 + # via azure-monitor-opentelemetry +azure-search-documents==11.7.0b1 + # via -r requirements.in +azure-storage-blob==12.22.0 + # via + # -r requirements.in + # azure-storage-file-datalake +azure-storage-file-datalake==12.16.0 + # via -r requirements.in +beautifulsoup4==4.12.3 + # via -r requirements.in +blinker==1.9.0 + # via + # flask + # quart +certifi==2024.7.4 + # via + # httpcore + # httpx + # msrest + # requests +cffi==1.17.0 + # via cryptography +charset-normalizer==3.3.2 + # via requests +click==8.3.0 + # via + # flask + # prompty + # quart + # uvicorn +cryptography==44.0.1 + # via + # -r requirements.in + # azure-identity + # azure-storage-blob + # msal + # pyjwt +distro==1.9.0 + # via openai +exceptiongroup==1.3.0 + # via + # anyio + # hypercorn + # taskgroup +fixedint==0.1.6 + # via azure-monitor-opentelemetry-exporter +flask==3.1.2 + # via quart +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +h11==0.16.0 + # via + # httpcore + # hypercorn + # uvicorn + # wsproto +h2==4.3.0 + # via + # httpx + # hypercorn +hpack==4.1.0 + # via h2 +httpcore==1.0.9 + # via httpx +httpx==0.28.1 + # via + # microsoft-kiota-http + # msgraph-core + # openai +hypercorn==0.17.3 + # via quart +hyperframe==6.1.0 + # via h2 +idna==3.10 + # via + # anyio + # httpx + # requests + # yarl +importlib-metadata==8.0.0 + # via opentelemetry-api 
+isodate==0.6.1 + # via + # azure-ai-documentintelligence + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # msrest +itsdangerous==2.2.0 + # via + # flask + # quart +jinja2==3.1.6 + # via + # flask + # prompty + # quart +jiter==0.11.0 + # via openai +markdown-it-py==3.0.0 + # via rich +markupsafe==3.0.3 + # via + # flask + # jinja2 + # quart + # werkzeug +mdurl==0.1.2 + # via markdown-it-py +microsoft-kiota-abstractions==1.9.3 + # via + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # microsoft-kiota-serialization-form + # microsoft-kiota-serialization-json + # microsoft-kiota-serialization-multipart + # microsoft-kiota-serialization-text + # msgraph-core +microsoft-kiota-authentication-azure==1.9.3 + # via msgraph-core +microsoft-kiota-http==1.9.3 + # via msgraph-core +microsoft-kiota-serialization-form==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-json==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-multipart==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-text==1.9.3 + # via msgraph-sdk +msal==1.33.0 + # via + # -r requirements.in + # azure-identity + # msal-extensions +msal-extensions==1.3.1 + # via azure-identity +msgraph-core==1.3.3 + # via msgraph-sdk +msgraph-sdk==1.45.0 + # via -r requirements.in +msrest==0.7.1 + # via azure-monitor-opentelemetry-exporter +multidict==6.7.0 + # via + # aiohttp + # yarl +oauthlib==3.3.1 + # via requests-oauthlib +openai==2.6.1 + # via -r requirements.in +opentelemetry-api==1.38.0 + # via + # azure-core-tracing-opentelemetry + # azure-monitor-opentelemetry-exporter + # microsoft-kiota-abstractions + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask 
+ # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-psycopg2 + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-instrumentation==0.59b0 + # via + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-psycopg2 + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi +opentelemetry-instrumentation-aiohttp-client==0.59b0 + # via -r requirements.in +opentelemetry-instrumentation-asgi==0.59b0 + # via + # -r requirements.in + # opentelemetry-instrumentation-fastapi +opentelemetry-instrumentation-dbapi==0.59b0 + # via opentelemetry-instrumentation-psycopg2 +opentelemetry-instrumentation-django==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-fastapi==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-flask==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-httpx==0.59b0 + # via -r requirements.in +opentelemetry-instrumentation-openai==0.47.5 + # via -r requirements.in +opentelemetry-instrumentation-psycopg2==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-requests==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-urllib==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-urllib3==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-wsgi==0.59b0 + # 
via + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-flask +opentelemetry-resource-detector-azure==0.1.5 + # via azure-monitor-opentelemetry +opentelemetry-sdk==1.38.0 + # via + # azure-monitor-opentelemetry + # azure-monitor-opentelemetry-exporter + # microsoft-kiota-abstractions + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # opentelemetry-resource-detector-azure +opentelemetry-semantic-conventions==0.59b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi + # opentelemetry-sdk +opentelemetry-semantic-conventions-ai==0.4.13 + # via opentelemetry-instrumentation-openai +opentelemetry-util-http==0.59b0 + # via + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi +packaging==24.1 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-flask +pillow==12.0.0 + # via -r requirements.in +priority==2.0.0 + # via hypercorn +prompty==0.1.50 + # via -r requirements.in +propcache==0.2.0 + # via + # aiohttp + # yarl +psutil==7.1.2 + # via azure-monitor-opentelemetry-exporter +pycparser==2.22 + # via cffi +pydantic==2.12.3 + # via openai +pydantic-core==2.41.4 + # via 
pydantic +pygments==2.19.2 + # via rich +pyjwt==2.10.1 + # via + # -r requirements.in + # msal +pymupdf==1.26.0 + # via -r requirements.in +pypdf==6.1.3 + # via -r requirements.in +python-dotenv==1.1.1 + # via + # -r requirements.in + # prompty +pyyaml==6.0.2 + # via prompty +quart==0.20.0 + # via + # -r requirements.in + # quart-cors +quart-cors==0.7.0 + # via -r requirements.in +regex==2025.7.34 + # via tiktoken +requests==2.32.4 + # via + # azure-core + # msal + # msrest + # requests-oauthlib + # tiktoken +requests-oauthlib==2.0.0 + # via msrest +rich==14.1.0 + # via -r requirements.in +six==1.16.0 + # via + # azure-core + # isodate +sniffio==1.3.1 + # via + # anyio + # openai +soupsieve==2.7 + # via beautifulsoup4 +std-uritemplate==2.0.5 + # via microsoft-kiota-abstractions +taskgroup==0.2.2 + # via hypercorn +tenacity==9.1.2 + # via -r requirements.in +tiktoken==0.12.0 + # via -r requirements.in +tomli==2.2.1 + # via hypercorn +tqdm==4.66.5 + # via openai +types-beautifulsoup4==4.12.0.20240511 + # via -r requirements.in +types-html5lib==1.1.11.20241018 + # via types-beautifulsoup4 +types-pillow==10.2.0.20240822 + # via -r requirements.in +typing-extensions==4.15.0 + # via + # -r requirements.in + # aiosignal + # anyio + # asgiref + # azure-ai-documentintelligence + # azure-core + # azure-cosmos + # azure-identity + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # exceptiongroup + # hypercorn + # multidict + # openai + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # pydantic + # pydantic-core + # pypdf + # taskgroup + # typing-inspection + # uvicorn +typing-inspection==0.4.2 + # via pydantic +urllib3==2.5.0 + # via requests +uvicorn==0.30.6 + # via -r requirements.in +werkzeug==3.1.3 + # via + # azure-functions + # flask + # quart +wrapt==1.16.0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-dbapi + # 
opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-urllib3 +wsproto==1.2.0 + # via hypercorn +yarl==1.17.2 + # via aiohttp +zipp==3.21.0 + # via importlib-metadata diff --git a/app/functions/text_processor/.funcignore b/app/functions/text_processor/.funcignore new file mode 100644 index 0000000000..5470b4c51a --- /dev/null +++ b/app/functions/text_processor/.funcignore @@ -0,0 +1,11 @@ +.git* +.vscode +__pycache__ +*.pyc +.python_packages +.venv +local.settings.json +test +.pytest_cache +.coverage +*.md diff --git a/app/functions/text_processor/function_app.py b/app/functions/text_processor/function_app.py new file mode 100644 index 0000000000..0a3e15cd9f --- /dev/null +++ b/app/functions/text_processor/function_app.py @@ -0,0 +1,297 @@ +"""Azure Function: Text Processor. +Custom skill for Azure AI Search that merges page text with figure metadata, splits into chunks, and computes embeddings. +""" + +import io +import json +import logging +import os +from dataclasses import dataclass +from typing import Any + +import azure.functions as func +from azure.identity.aio import ManagedIdentityCredential + +from prepdocslib.blobmanager import BlobManager +from prepdocslib.embeddings import OpenAIEmbeddings +from prepdocslib.fileprocessor import FileProcessor +from prepdocslib.listfilestrategy import File +from prepdocslib.page import ImageOnPage, Page +from prepdocslib.servicesetup import ( + OpenAIHost, + build_file_processors, + select_processor_for_filename, + setup_embeddings_service, + setup_openai_client, +) +from prepdocslib.textprocessor import process_text + +# Mark the function as anonymous since we are protecting it with built-in auth instead +app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS) + +logger = logging.getLogger(__name__) + + +@dataclass +class GlobalSettings: + use_vectors: bool + use_multimodal: bool + embedding_dimensions: int + file_processors: dict[str, FileProcessor] + embedding_service: OpenAIEmbeddings | None 
+ + +settings: GlobalSettings | None = None + + +def configure_global_settings(): + global settings + + # Environment configuration + use_vectors = os.getenv("USE_VECTORS", "true").lower() == "true" + use_multimodal = os.getenv("USE_MULTIMODAL", "false").lower() == "true" + embedding_dimensions = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS", "3072")) + + # Conditionally required (based on feature flags) + openai_host_str = os.getenv("OPENAI_HOST", "azure") + azure_openai_service = os.getenv("AZURE_OPENAI_SERVICE") + azure_openai_custom_url = os.getenv("AZURE_OPENAI_CUSTOM_URL") + azure_openai_emb_deployment = os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT") + azure_openai_emb_model_name = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME", "text-embedding-3-large") + document_intelligence_service = os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE") + + # Single shared managed identity credential + if AZURE_CLIENT_ID := os.getenv("AZURE_CLIENT_ID"): + logger.info("Using Managed Identity with client ID: %s", AZURE_CLIENT_ID) + azure_credential = ManagedIdentityCredential(client_id=AZURE_CLIENT_ID) + else: + logger.info("Using default Managed Identity without client ID") + azure_credential = ManagedIdentityCredential() + + # Build file processors to get correct splitter for each file type + file_processors = build_file_processors( + azure_credential=azure_credential, + document_intelligence_service=document_intelligence_service, + document_intelligence_key=None, + use_local_pdf_parser=False, + use_local_html_parser=False, + process_figures=use_multimodal, + ) + + # Embedding service (optional) + embedding_service = None + if use_vectors: + if (azure_openai_service or azure_openai_custom_url) and ( + azure_openai_emb_deployment and azure_openai_emb_model_name + ): + openai_host = OpenAIHost(openai_host_str) + openai_client, azure_openai_endpoint = setup_openai_client( + openai_host=openai_host, + azure_credential=azure_credential, + azure_openai_service=azure_openai_service, + 
azure_openai_custom_url=azure_openai_custom_url, + ) + embedding_service = setup_embeddings_service( + openai_host, + openai_client, + emb_model_name=azure_openai_emb_model_name, + emb_model_dimensions=embedding_dimensions, + azure_openai_deployment=azure_openai_emb_deployment, + azure_openai_endpoint=azure_openai_endpoint, + ) + else: + logger.warning("USE_VECTORS is true but embedding configuration incomplete; embeddings disabled") + + settings = GlobalSettings( + use_vectors=use_vectors, + use_multimodal=use_multimodal, + embedding_dimensions=embedding_dimensions, + file_processors=file_processors, + embedding_service=embedding_service, + ) + + +@app.function_name(name="process_text") +@app.route(route="process", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS) +async def process_text_entry(req: func.HttpRequest) -> func.HttpResponse: + """Azure Search custom skill entry point for chunking and embeddings.""" + + if settings is None: + return func.HttpResponse( + json.dumps({"error": "Settings not initialized"}), + mimetype="application/json", + status_code=500, + ) + + try: + payload = req.get_json() + except ValueError as exc: + logger.error("Invalid JSON payload: %s", exc) + return func.HttpResponse( + json.dumps({"error": "Request body must be valid JSON"}), + mimetype="application/json", + status_code=400, + ) + + values = payload.get("values", []) + output_values: list[dict[str, Any]] = [] + + for record in values: + record_id = record.get("recordId", "") + data = record.get("data", {}) + try: + chunks = await process_document(data) + output_values.append( + { + "recordId": record_id, + "data": {"chunks": chunks}, + "errors": [], + "warnings": [], + } + ) + except Exception as exc: # pragma: no cover - defensive logging + logger.error("Failed to process record %s: %s", record_id, exc, exc_info=True) + output_values.append( + { + "recordId": record_id, + "data": {}, + "errors": [{"message": str(exc)}], + "warnings": [], + } + ) + + return 
func.HttpResponse( + json.dumps({"values": output_values}), + mimetype="application/json", + status_code=200, + ) + + +async def process_document(data: dict[str, Any]) -> list[dict[str, Any]]: + """Combine figures with page text, split into chunks, and (optionally) embed. + + Parameters + ---------- + data: dict[str, Any] + Skill payload containing consolidated_document with file metadata, pages, and figures. + + Returns + ------- + list[dict[str, Any]] + Chunk dictionaries ready for downstream indexing. + """ + + # Extract consolidated_document object from Shaper skill + consolidated_doc = data.get("consolidated_document", data) + + file_name = consolidated_doc.get("file_name", "document") + storage_url = consolidated_doc.get("storageUrl") or consolidated_doc.get("metadata_storage_path") or file_name + pages_input = consolidated_doc.get("pages", []) # [{page_num, text, figure_ids}] + figures_input = consolidated_doc.get("figures", []) # serialized skill payload + + figures_by_id = {figure["figure_id"]: figure for figure in figures_input} + + logger.info("Processing %s: %d pages, %d figures", file_name, len(pages_input), len(figures_input)) + + # Build Page objects with placeholders intact (figure markup will be injected by combine_text_with_figures()) + pages: list[Page] = [] + offset = 0 + for page_entry in pages_input: + # Zero-based page numbering: pages emitted by extractor already zero-based + page_num = int(page_entry.get("page_num", len(pages))) + page_text = page_entry.get("text", "") + page_obj = Page(page_num=page_num, offset=offset, text=page_text) + offset += len(page_text) + + # Construct ImageOnPage objects from figureIds list + figure_ids: list[str] = page_entry.get("figure_ids", []) + for fid in figure_ids: + figure_payload = figures_by_id.get(fid) + if not figure_payload: + logger.warning("Figure ID %s not found in figures metadata for page %d", fid, page_num) + continue + try: + image_on_page, _ = ImageOnPage.from_skill_payload(figure_payload) + 
page_obj.images.append(image_on_page) + except Exception as exc: + logger.error("Failed to deserialize figure %s: %s", fid, exc, exc_info=True) + pages.append(page_obj) + + if not pages: + logger.info("No textual content found for %s", file_name) + return [] + + # Create a lightweight File wrapper required by process_text + dummy_stream = io.BytesIO(b"") + dummy_stream.name = file_name + file_wrapper = File(content=dummy_stream) + + # Get the appropriate splitter for this file type + file_processor = select_processor_for_filename(file_name, settings.file_processors) + splitter = file_processor.splitter + + sections = process_text(pages, file_wrapper, splitter, category=None) + if not sections: + return [] + + # Generate embeddings for section texts + chunk_texts = [s.chunk.text for s in sections] + embeddings: list[list[float]] | None = None + if settings.use_vectors and chunk_texts: + if settings.embedding_service: + embeddings = await settings.embedding_service.create_embeddings(chunk_texts) + else: + logger.warning("Embeddings requested but service not initialised; skipping vectors") + + # Use the same id base generation as local ingestion pipeline for parity + normalized_id = file_wrapper.filename_to_id() + outputs: list[dict[str, Any]] = [] + for idx, section in enumerate(sections): + content = section.chunk.text.strip() + if not content: + continue + embedding_vec = embeddings[idx] if embeddings else None + image_refs: list[dict[str, Any]] = [] + for image in section.chunk.images: + ref = { + "url": image.url or "", + "description": image.description or "", + "boundingbox": list(image.bbox), + } + if settings.use_multimodal and image.embedding is not None: + ref["embedding"] = image.embedding + image_refs.append(ref) + chunk_entry: dict[str, Any] = { + "id": f"{normalized_id}-{idx:04d}", + "content": content, + "sourcepage": BlobManager.sourcepage_from_file_page(file_name, section.chunk.page_num), + "sourcefile": file_name, + "parent_id": storage_url, + 
**({"images": image_refs} if image_refs else {}), + } + + if embedding_vec is not None: + if len(embedding_vec) == settings.embedding_dimensions: + chunk_entry["embedding"] = embedding_vec + else: + logger.warning( + "Skipping embedding for %s chunk %d due to dimension mismatch (expected %d, got %d)", + file_name, + idx, + settings.embedding_dimensions, + len(embedding_vec), + ) + elif settings.use_vectors: + logger.warning("Embeddings were requested but missing for %s chunk %d", file_name, idx) + + outputs.append(chunk_entry) + + return outputs + + +# Initialize settings at module load time, unless we're in a test environment +if os.environ.get("PYTEST_CURRENT_TEST") is None: + try: + configure_global_settings() + except KeyError as e: + logger.warning("Could not initialize settings at module load time: %s", e) diff --git a/app/functions/text_processor/host.json b/app/functions/text_processor/host.json new file mode 100644 index 0000000000..d6205f876a --- /dev/null +++ b/app/functions/text_processor/host.json @@ -0,0 +1,27 @@ +{ + "version": "2.0", + "extensions": { + "mcp": { + "system": { + "webhookAuthorizationLevel": "anonymous" + } + } + }, + "extensionBundle": { + "id": "Microsoft.Azure.Functions.ExtensionBundle", + "version": "[4.*, 5.0.0)" + }, + "functionTimeout": "00:05:00", + "logging": { + "logLevel": { + "default": "Information", + "Function": "Information" + }, + "applicationInsights": { + "samplingSettings": { + "isEnabled": true, + "maxTelemetryItemsPerSecond": 20 + } + } + } +} diff --git a/app/functions/text_processor/requirements.txt b/app/functions/text_processor/requirements.txt new file mode 100644 index 0000000000..ccdb3b8a00 --- /dev/null +++ b/app/functions/text_processor/requirements.txt @@ -0,0 +1,456 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o requirements.txt --python-version 3.10 +aiofiles==24.1.0 + # via + # prompty + # quart +aiohappyeyeballs==2.6.1 + # via aiohttp 
+aiohttp==3.12.14 + # via + # -r requirements.in + # microsoft-kiota-authentication-azure +aiosignal==1.4.0 + # via aiohttp +annotated-types==0.7.0 + # via pydantic +anyio==4.4.0 + # via + # httpx + # openai +asgiref==3.10.0 + # via opentelemetry-instrumentation-asgi +async-timeout==5.0.1 + # via aiohttp +attrs==25.3.0 + # via aiohttp +azure-ai-documentintelligence==1.0.0b4 + # via -r requirements.in +azure-cognitiveservices-speech==1.40.0 + # via -r requirements.in +azure-common==1.1.28 + # via azure-search-documents +azure-core==1.35.0 + # via + # azure-ai-documentintelligence + # azure-core-tracing-opentelemetry + # azure-cosmos + # azure-identity + # azure-monitor-opentelemetry + # azure-monitor-opentelemetry-exporter + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # microsoft-kiota-authentication-azure + # msrest +azure-core-tracing-opentelemetry==1.0.0b11 + # via azure-monitor-opentelemetry +azure-cosmos==4.9.0 + # via -r requirements.in +azure-functions==1.24.0 + # via -r requirements.in +azure-identity==1.17.1 + # via + # -r requirements.in + # azure-monitor-opentelemetry-exporter + # msgraph-sdk +azure-monitor-opentelemetry==1.8.1 + # via -r requirements.in +azure-monitor-opentelemetry-exporter==1.0.0b44 + # via azure-monitor-opentelemetry +azure-search-documents==11.7.0b1 + # via -r requirements.in +azure-storage-blob==12.22.0 + # via + # -r requirements.in + # azure-storage-file-datalake +azure-storage-file-datalake==12.16.0 + # via -r requirements.in +beautifulsoup4==4.12.3 + # via -r requirements.in +blinker==1.9.0 + # via + # flask + # quart +certifi==2024.7.4 + # via + # httpcore + # httpx + # msrest + # requests +cffi==1.17.0 + # via cryptography +charset-normalizer==3.3.2 + # via requests +click==8.3.0 + # via + # flask + # prompty + # quart + # uvicorn +cryptography==44.0.1 + # via + # -r requirements.in + # azure-identity + # azure-storage-blob + # msal + # pyjwt +distro==1.9.0 + # via openai 
+exceptiongroup==1.3.0 + # via + # anyio + # hypercorn + # taskgroup +fixedint==0.1.6 + # via azure-monitor-opentelemetry-exporter +flask==3.1.2 + # via quart +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +h11==0.16.0 + # via + # httpcore + # hypercorn + # uvicorn + # wsproto +h2==4.3.0 + # via + # httpx + # hypercorn +hpack==4.1.0 + # via h2 +httpcore==1.0.9 + # via httpx +httpx==0.28.1 + # via + # microsoft-kiota-http + # msgraph-core + # openai +hypercorn==0.17.3 + # via quart +hyperframe==6.1.0 + # via h2 +idna==3.10 + # via + # anyio + # httpx + # requests + # yarl +importlib-metadata==8.0.0 + # via opentelemetry-api +isodate==0.6.1 + # via + # azure-ai-documentintelligence + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # msrest +itsdangerous==2.2.0 + # via + # flask + # quart +jinja2==3.1.6 + # via + # flask + # prompty + # quart +jiter==0.11.0 + # via openai +markdown-it-py==3.0.0 + # via rich +markupsafe==3.0.3 + # via + # flask + # jinja2 + # quart + # werkzeug +mdurl==0.1.2 + # via markdown-it-py +microsoft-kiota-abstractions==1.9.3 + # via + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # microsoft-kiota-serialization-form + # microsoft-kiota-serialization-json + # microsoft-kiota-serialization-multipart + # microsoft-kiota-serialization-text + # msgraph-core +microsoft-kiota-authentication-azure==1.9.3 + # via msgraph-core +microsoft-kiota-http==1.9.3 + # via msgraph-core +microsoft-kiota-serialization-form==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-json==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-multipart==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-text==1.9.3 + # via msgraph-sdk +msal==1.33.0 + # via + # -r requirements.in + # azure-identity + # msal-extensions +msal-extensions==1.3.1 + # via azure-identity +msgraph-core==1.3.3 + # via msgraph-sdk +msgraph-sdk==1.45.0 + # via -r requirements.in +msrest==0.7.1 + # via 
azure-monitor-opentelemetry-exporter +multidict==6.7.0 + # via + # aiohttp + # yarl +oauthlib==3.3.1 + # via requests-oauthlib +openai==2.6.1 + # via -r requirements.in +opentelemetry-api==1.38.0 + # via + # azure-core-tracing-opentelemetry + # azure-monitor-opentelemetry-exporter + # microsoft-kiota-abstractions + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-psycopg2 + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-instrumentation==0.59b0 + # via + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-psycopg2 + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi +opentelemetry-instrumentation-aiohttp-client==0.59b0 + # via -r requirements.in +opentelemetry-instrumentation-asgi==0.59b0 + # via + # -r requirements.in + # opentelemetry-instrumentation-fastapi +opentelemetry-instrumentation-dbapi==0.59b0 + # via opentelemetry-instrumentation-psycopg2 +opentelemetry-instrumentation-django==0.59b0 + # via azure-monitor-opentelemetry 
+opentelemetry-instrumentation-fastapi==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-flask==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-httpx==0.59b0 + # via -r requirements.in +opentelemetry-instrumentation-openai==0.47.5 + # via -r requirements.in +opentelemetry-instrumentation-psycopg2==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-requests==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-urllib==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-urllib3==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-wsgi==0.59b0 + # via + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-flask +opentelemetry-resource-detector-azure==0.1.5 + # via azure-monitor-opentelemetry +opentelemetry-sdk==1.38.0 + # via + # azure-monitor-opentelemetry + # azure-monitor-opentelemetry-exporter + # microsoft-kiota-abstractions + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # opentelemetry-resource-detector-azure +opentelemetry-semantic-conventions==0.59b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi + # opentelemetry-sdk +opentelemetry-semantic-conventions-ai==0.4.13 + # via opentelemetry-instrumentation-openai +opentelemetry-util-http==0.59b0 + # via + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-django + # 
opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi +packaging==24.1 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-flask +pillow==12.0.0 + # via -r requirements.in +priority==2.0.0 + # via hypercorn +prompty==0.1.50 + # via -r requirements.in +propcache==0.2.0 + # via + # aiohttp + # yarl +psutil==7.1.2 + # via azure-monitor-opentelemetry-exporter +pycparser==2.22 + # via cffi +pydantic==2.12.3 + # via openai +pydantic-core==2.41.4 + # via pydantic +pygments==2.19.2 + # via rich +pyjwt==2.10.1 + # via + # -r requirements.in + # msal +pymupdf==1.26.0 + # via -r requirements.in +pypdf==6.1.3 + # via -r requirements.in +python-dotenv==1.1.1 + # via + # -r requirements.in + # prompty +pyyaml==6.0.2 + # via prompty +quart==0.20.0 + # via + # -r requirements.in + # quart-cors +quart-cors==0.7.0 + # via -r requirements.in +regex==2025.7.34 + # via tiktoken +requests==2.32.4 + # via + # azure-core + # msal + # msrest + # requests-oauthlib + # tiktoken +requests-oauthlib==2.0.0 + # via msrest +rich==14.1.0 + # via -r requirements.in +six==1.16.0 + # via + # azure-core + # isodate +sniffio==1.3.1 + # via + # anyio + # openai +soupsieve==2.7 + # via beautifulsoup4 +std-uritemplate==2.0.5 + # via microsoft-kiota-abstractions +taskgroup==0.2.2 + # via hypercorn +tenacity==9.1.2 + # via -r requirements.in +tiktoken==0.12.0 + # via -r requirements.in +tomli==2.2.1 + # via hypercorn +tqdm==4.66.5 + # via openai +types-beautifulsoup4==4.12.0.20240511 + # via -r requirements.in +types-html5lib==1.1.11.20241018 + # via types-beautifulsoup4 +types-pillow==10.2.0.20240822 + # via -r requirements.in +typing-extensions==4.15.0 + # via + # -r requirements.in + # aiosignal + # anyio + # asgiref + # azure-ai-documentintelligence + # 
azure-core + # azure-cosmos + # azure-identity + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # exceptiongroup + # hypercorn + # multidict + # openai + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # pydantic + # pydantic-core + # pypdf + # taskgroup + # typing-inspection + # uvicorn +typing-inspection==0.4.2 + # via pydantic +urllib3==2.5.0 + # via requests +uvicorn==0.30.6 + # via -r requirements.in +werkzeug==3.1.3 + # via + # azure-functions + # flask + # quart +wrapt==1.16.0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-urllib3 +wsproto==1.2.0 + # via hypercorn +yarl==1.17.2 + # via aiohttp +zipp==3.21.0 + # via importlib-metadata diff --git a/azure.yaml b/azure.yaml index f629d9a374..afb975de9d 100644 --- a/azure.yaml +++ b/azure.yaml @@ -40,6 +40,37 @@ services: run: cd ../frontend;npm install;npm run build interactive: false continueOnError: false + # Un-comment this section if using USE_CLOUD_INGESTION option + # document-extractor: + # project: ./app/functions/document_extractor + # language: py + # host: function + # hooks: + # prepackage: + # shell: pwsh + # run: python ../../../scripts/copy_prepdocslib.py + # interactive: false + # continueOnError: false + # figure-processor: + # project: ./app/functions/figure_processor + # language: py + # host: function + # hooks: + # prepackage: + # shell: pwsh + # run: python ../../../scripts/copy_prepdocslib.py + # interactive: false + # continueOnError: false + # text-processor: + # project: ./app/functions/text_processor + # language: py + # host: function + # hooks: + # prepackage: + # shell: pwsh + # run: python ../../../scripts/copy_prepdocslib.py + # interactive: false + # continueOnError: false hooks: preprovision: windows: @@ -63,3 +94,14 @@ hooks: run: 
./scripts/auth_update.sh;./scripts/prepdocs.sh interactive: true continueOnError: false + postdeploy: + windows: + shell: pwsh + run: ./scripts/setup_cloud_ingestion.ps1 + interactive: true + continueOnError: false + posix: + shell: sh + run: ./scripts/setup_cloud_ingestion.sh + interactive: true + continueOnError: false diff --git a/docs/data_ingestion.md b/docs/data_ingestion.md index c9f7b13410..329b82934f 100644 --- a/docs/data_ingestion.md +++ b/docs/data_ingestion.md @@ -2,15 +2,19 @@ The [azure-search-openai-demo](/) project can set up a full RAG chat app on Azure AI Search and OpenAI so that you can chat on custom data, like internal enterprise data or domain-specific knowledge sets. For full instructions on setting up the project, consult the [main README](/README.md), and then return here for detailed instructions on the data ingestion component. -The chat app provides two ways to ingest data: manual indexing and integrated vectorization. This document explains the differences between the two approaches and provides an overview of the manual indexing process. +The chat app provides two ways to ingest data: manual ingestion and cloud-based ingestion. Both approaches use the same code for processing the data, but the manual ingestion runs locally while cloud ingestion runs in Azure Functions as Azure AI Search custom skills. 
- [Supported document formats](#supported-document-formats) -- [Manual indexing process](#manual-indexing-process) - - [Chunking](#chunking) +- [Ingestion stages](#ingestion-stages) + - [Document extraction](#document-extraction) + - [Figure processing](#figure-processing) + - [Text processing](#text-processing) +- [Local ingestion](#local-ingestion) - [Categorizing data for enhanced search](#enhancing-search-functionality-with-data-categorization) - [Indexing additional documents](#indexing-additional-documents) - [Removing documents](#removing-documents) -- [Integrated Vectorization](#integrated-vectorization) +- [Cloud-based ingestion](#cloud-based-ingestion) + - [Custom skills pipeline](#custom-skills-pipeline) - [Indexing of additional documents](#indexing-of-additional-documents) - [Removal of documents](#removal-of-documents) - [Scheduled indexing](#scheduled-indexing) @@ -30,9 +34,72 @@ In order to ingest a document format, we need a tool that can turn it into text. | JSON | Yes (Local) | Yes | | CSV | Yes (Local) | Yes | -The Blob indexer used by the Integrated Vectorization approach also supports a few [additional formats](https://learn.microsoft.com/azure/search/search-howto-indexing-azure-blob-storage#supported-document-formats). +## Ingestion stages -## Manual indexing process +The ingestion pipeline consists of three main stages that transform raw documents into searchable content in Azure AI Search. These stages apply to both [local ingestion](#local-ingestion) and [cloud-based ingestion](#cloud-based-ingestion). + +### Document extraction + +The first stage extracts text and structured content from source documents using parsers tailored to each file format. For PDF, HTML, DOCX, PPTX, XLSX, and image files, the pipeline defaults to using [Azure Document Intelligence](https://learn.microsoft.com/azure/ai-services/document-intelligence/overview) to extract text, tables, and figures with layout information. 
Alternatively, local parsers like PyPDF and BeautifulSoup can be used to reduce costs for simpler documents. For TXT, JSON, and CSV files, lightweight local parsers extract the content directly. + +During extraction, tables are converted to HTML markup to preserve their structure, and figures (when multimodal is enabled) are identified with bounding boxes and placeholders. + +The output from this stage is a list of pages, each containing the extracted text with embedded table HTML and figure placeholders like `
`.
+
+### Figure processing
+
+This stage is optional and only applies when the multimodal feature is enabled *and* the document itself has figures. See [multimodal feature documentation](./multimodal.md) for more details.
+
+When multimodal support is enabled, figures extracted in the previous stage are enriched with descriptions and embeddings. Each figure is:
+
+1. **Cropped and saved**: The figure image is cropped from the PDF using its bounding box coordinates and saved as a PNG file.
+2. **Described**: A text description is generated using either Azure OpenAI's GPT-4 Vision model or Azure AI Content Understanding, depending on configuration.
+3. **Uploaded**: The figure image is uploaded to Azure Blob Storage and assigned a URL.
+4. **Embedded** (optional): If image embeddings are enabled, a vector embedding is computed for the figure using Azure AI Vision.
+
+The output from this stage is enriched figure metadata, including the description text, storage URL, and optional embedding vector.
+
+### Text processing
+
+The final stage combines the extracted text with figure descriptions, splits the content into searchable chunks, and computes embeddings.
+
+#### Figure merging
+
+First, figure placeholders in the page text are replaced with full HTML markup that includes the figure caption and generated description, creating a cohesive text narrative that incorporates visual content.
+
+#### Chunking
+
+Next, the combined text is split into chunks using a sentence-aware splitter that respects semantic boundaries. The default chunk size is approximately 1000 characters (roughly 400-500 tokens for English), with a 10% overlap between consecutive chunks to preserve context across boundaries. The splitter uses a sliding window approach, ensuring that sentences ending one chunk also start the next, which reduces the risk of losing important context at chunk boundaries.
+ +**Why chunk documents?** While Azure AI Search can index full documents, chunking is essential for the RAG pattern because it limits the amount of information sent to OpenAI, which has token limits for context windows. By breaking content into focused chunks, the system can retrieve and inject only the most relevant pieces of text into the LLM prompt, improving both response quality and cost efficiency. + +If needed, you can modify the chunking algorithm in `app/backend/prepdocslib/textsplitter.py`. For a deeper, diagram-rich explanation of how the splitter works (figures, recursion, merge heuristics, guarantees, and examples), see the [text splitter documentation](./textsplitter.md). + +#### Embedding + +Finally, if vector search is enabled, text embeddings are computed for each chunk using Azure OpenAI's embedding models (text-embedding-ada-002, text-embedding-3-small, or text-embedding-3-large). These embeddings are generated in batches for efficiency, with retry logic to handle rate limits. + +### Indexing + +The final step is to index the chunks into Azure AI Search. Each chunk is stored as a separate document in the search index, with metadata linking it back to the source file and page number. If vector search is enabled, the computed embeddings are also stored alongside the text, enabling efficient similarity searches during query time. + +Here's an example of what a final indexed chunk document looks like: + +```json +{ + "id": "file-Northwind_Health_Plus_Benefits_Details_pdf-4E6F72746877696E645F4865616C74685F506C75735F42656E65666974735F44657461696C732E706466-page-0", + "content": "# Contoso Electronics\n\nNorthwind Health Plus Plan\n...", + "category": null, + "sourcepage": "Northwind_Health_Plus_Benefits_Details.pdf#page=1", + "sourcefile": "Northwind_Health_Plus_Benefits_Details.pdf", + "storageUrl": "https://std4gfbajn3e3yu.blob.core.windows.net/content/Northwind_Health_Plus_Benefits_Details.pdf", + "embedding": [0.0123, -0.0456, ...] 
+} +``` + +If multimodal is enabled, that document will also include an `"images"` field and figure descriptions in the `"content"` field. + +## Local ingestion The [`prepdocs.py`](../app/backend/prepdocs.py) script is responsible for both uploading and indexing documents. The typical usage is to call it using `scripts/prepdocs.sh` (Mac/Linux) or `scripts/prepdocs.ps1` (Windows), as these scripts will set up a Python virtual environment and pass in the required parameters based on the current `azd` environment. You can pass additional arguments directly to the script, for example `scripts/prepdocs.ps1 --removeall`. Whenever `azd up` or `azd provision` is run, the script is called automatically. @@ -45,14 +112,6 @@ The script uses the following steps to index documents: 3. Split the PDFs into chunks of text. 4. Upload the chunks to Azure AI Search. If using vectors (the default), also compute the embeddings and upload those alongside the text. -### Chunking - -We're often asked why we need to break up the PDFs into chunks when Azure AI Search supports searching large documents. - -Chunking allows us to limit the amount of information we send to OpenAI due to token limits. By breaking up the content, it allows us to easily find potential chunks of text that we can inject into OpenAI. The method of chunking we use leverages a sliding window of text such that sentences that end one chunk will start the next. This allows us to reduce the chance of losing the context of the text. - -If needed, you can modify the chunking algorithm in `app/backend/prepdocslib/textsplitter.py`. For a deeper, diagram-rich explanation of how the splitter works (figures, recursion, merge heuristics, guarantees, and examples), see the [text splitter documentation](./textsplitter.md). 
- ### Enhancing search functionality with data categorization To enhance search functionality, categorize data during the ingestion process with the `--category` argument, for example `scripts/prepdocs.ps1 --category ExampleCategoryName`. This argument specifies the category to which the data belongs, enabling you to filter search results based on these categories. @@ -63,7 +122,7 @@ After running the script with the desired category, ensure these categories are To upload more PDFs, put them in the data/ folder and run `./scripts/prepdocs.sh` or `./scripts/prepdocs.ps1`. -A [recent change](https://github.com/Azure-Samples/azure-search-openai-demo/pull/835) added checks to see what's been uploaded before. The prepdocs script now writes an .md5 file with an MD5 hash of each file that gets uploaded. Whenever the prepdocs script is re-run, that hash is checked against the current hash and the file is skipped if it hasn't changed. +The prepdocs script writes an .md5 file with an MD5 hash of each file that gets uploaded. Whenever the prepdocs script is re-run, that hash is checked against the current hash and the file is skipped if it hasn't changed. ### Removing documents @@ -73,31 +132,64 @@ To remove all documents, use `./scripts/prepdocs.sh --removeall` or `./scripts/p You can also remove individual documents by using the `--remove` flag. Open either `scripts/prepdocs.sh` or `scripts/prepdocs.ps1` and replace `/data/*` with `/data/YOUR-DOCUMENT-FILENAME-GOES-HERE.pdf`. Then run `scripts/prepdocs.sh --remove` or `scripts/prepdocs.ps1 --remove`. -## Integrated Vectorization +## Cloud-based ingestion + +This project includes an optional feature to perform data ingestion in the cloud using Azure Functions as custom skills for Azure AI Search indexers. This approach offloads the ingestion workload from your local machine to the cloud, allowing for more scalable and efficient processing of large datasets. 
+ +You must first explicitly [enable cloud ingestion](./deploy_features.md#enabling-cloud-ingestion) in the `azd` environment to use this feature. + +This feature cannot be used on an existing index. You need to create a new index or drop and recreate an existing index. In the newly created index schema, a new field 'parent_id' is added. This is used internally by the indexer to manage the life cycle of chunks. + +### Custom skills pipeline + +The cloud ingestion pipeline uses three Azure Functions as custom skills, together with one built-in Shaper skill, within an Azure AI Search indexer. Each function corresponds to a stage in the ingestion process. Here's how it works: + +1. **User uploads documents** to Azure Blob Storage (content container) +2. **Azure AI Search Indexer** monitors the blob container and orchestrates processing +3. **Skills** process documents through four stages (three custom skills plus one built-in skill): + - **Document Extractor** (Skill #1): Extracts text and figure metadata from source documents + - **Figure Processor** (Skill #2): Enriches figures with descriptions and embeddings + - **Shaper Skill** (Skill #3): Built-in Azure AI Search skill that consolidates enriched data + - **Text Processor** (Skill #4): Combines text with enriched figures, chunks content, and generates embeddings +4. **Azure AI Search Index** receives the final processed chunks with embeddings + +The functions are defined in the `app/functions/` directory, and the custom skillset is configured in the `app/backend/setup_cloud_ingestion.py` script. + +#### [Document Extractor Function](app/functions/document_extractor/) + +- Implements the [document extraction](#document-extraction) stage +- Emits markdown text with `
` placeholders and figure metadata + +#### [Figure Processor Function](app/functions/figure_processor/) -Azure AI Search includes an [integrated vectorization feature](https://techcommunity.microsoft.com/blog/azure-ai-services-blog/announcing-the-public-preview-of-integrated-vectorization-in-azure-ai-search/3960809), a cloud-based approach to data ingestion. Integrated vectorization takes care of document format cracking, data extraction, chunking, vectorization, and indexing, all with Azure technologies. +- Implements the [figure processing](#figure-processing) stage +- Emits enriched figure metadata with descriptions, URLs, and embeddings -See [this notebook](https://github.com/Azure/azure-search-vector-samples/blob/main/demo-python/code/integrated-vectorization/azure-search-integrated-vectorization-sample.ipynb) to understand the process of setting up integrated vectorization. -We have integrated that code into our `prepdocs` script, so you can use it without needing to understand the details. +#### [Shaper Skill](https://learn.microsoft.com/azure/search/cognitive-search-skill-shaper) -You must first explicitly [enable integrated vectorization](./deploy_features.md#enabling-integrated-vectorization) in the `azd` environment to use this feature. +- Consolidates enrichments from the figure processor back into the main document context +- Required because Azure AI Search's enrichment tree isolates data by context +- The Shaper explicitly combines: + - Original `pages` array from `document_extractor` + - Enriched `figures` array with descriptions, URLs, and embeddings from `figure_processor` + - File metadata (file_name, storageUrl) +- Creates a `consolidated_document` object that the text processor can consume -This feature cannot be used on existing index. You need to create a new index or drop and recreate an existing index. -In the newly created index schema, a new field 'parent_id' is added. This is used internally by the indexer to manage life cycle of chunks. 
+#### [Text Processor Function](app/functions/text_processor/) -This feature is not supported in the free SKU for Azure AI Search. +- Implements the [text processing](#text-processing) stage (figure merging, chunking, embedding) +- Receives the consolidated document with enriched figures from the Shaper skill +- Emits search-ready chunks with figure references and embeddings ### Indexing of additional documents To add additional documents to the index, first upload them to your data source (Blob storage, by default). -Then navigate to the Azure portal, find the index, and run it. -The Azure AI Search indexer will identify the new documents and ingest them into the index. +Then navigate to the Azure portal and run the indexer. The Azure AI Search indexer will identify the new documents and ingest them into the index. ### Removal of documents To remove documents from the index, remove them from your data source (Blob storage, by default). -Then navigate to the Azure portal, find the index, and run it. -The Azure AI Search indexer will take care of removing those documents from the index. +Then navigate to the Azure portal and run the indexer. The Azure AI Search indexer will take care of removing those documents from the index. ### Scheduled indexing diff --git a/docs/deploy_features.md b/docs/deploy_features.md index af8bce1463..0c63ca9b40 100644 --- a/docs/deploy_features.md +++ b/docs/deploy_features.md @@ -12,7 +12,6 @@ You should typically enable these features before running `azd up`. 
Once you've * [Enabling persistent chat history with Azure Cosmos DB](#enabling-persistent-chat-history-with-azure-cosmos-db) * [Enabling language picker](#enabling-language-picker) * [Enabling speech input/output](#enabling-speech-inputoutput) -* [Enabling Integrated Vectorization](#enabling-integrated-vectorization) * [Enabling authentication](#enabling-authentication) * [Enabling login and document level access control](#enabling-login-and-document-level-access-control) * [Enabling user document upload](#enabling-user-document-upload) @@ -236,8 +235,7 @@ Learn more in the [multimodal guide](./multimodal.md). ## Enabling media description with Azure Content Understanding -⚠️ This feature is not currently compatible with [integrated vectorization](#enabling-integrated-vectorization). -It is compatible with the [multimodal feature](./multimodal.md), but this feature enables only a subset of multimodal capabilities, +⚠️ This feature is compatible with the [multimodal feature](./multimodal.md), but this feature enables only a subset of multimodal capabilities, so you may want to enable the multimodal feature instead or as well. By default, if your documents contain image-like figures, the data ingestion process will ignore those figures, @@ -324,30 +322,35 @@ Alternatively you can use the browser's built-in [Speech Synthesis API](https:// azd env set USE_SPEECH_OUTPUT_BROWSER true ``` -## Enabling Integrated Vectorization +## Enabling cloud-based data ingestion -Azure AI search recently introduced an [integrated vectorization feature in preview mode](https://techcommunity.microsoft.com/blog/azure-ai-services-blog/announcing-the-public-preview-of-integrated-vectorization-in-azure-ai-search/3960809). This feature is a cloud-based approach to data ingestion, which takes care of document format cracking, data extraction, chunking, vectorization, and indexing, all with Azure technologies. +By default, this project runs a local script in order to ingest data. 
Once you move beyond the sample documents, you may want cloud-based ingestion, which uses Azure AI Search indexers and custom Azure AI Search skills based off the same code used by the local ingestion. That approach scales better to larger amounts of data. -To enable integrated vectorization with this sample: +To enable cloud ingestion: -1. If you've previously deployed, delete the existing search index. 🗑️ -2. To enable the use of integrated vectorization, run: +1. If you've previously deployed, delete the existing search index or create a new index using: ```shell - azd env set USE_FEATURE_INT_VECTORIZATION true + azd env set AZURE_SEARCH_INDEX cloudindex ``` -3. If you've already deployed your app, then you can run just the `provision` step: +2. Run this command: ```shell - azd provision + azd env set USE_CLOUD_INGESTION true ``` - That will set up necessary RBAC roles and configure the integrated vectorization feature on your search service. +3. Open `azure.yaml` and un-comment the document-extractor, figure-processor, and text-processor sections. Those are the Azure Functions apps that will be deployed and serve as Azure AI Search skills. - If you haven't deployed your app yet, then you should run the full `azd up` after configuring all optional features. +4. Provision the new Azure Functions resources, deploy the function apps, and update the search indexer with: -4. You can view the resources such as the indexer and skillset in Azure Portal and monitor the status of the vectorization process. + ```shell + azd up + ``` + +5. That will upload the documents in the `data/` folder to the Blob storage container, create the indexer and skillset, and run the indexer to ingest the data. You can monitor the indexer status from the portal. + +6. When you have new documents to ingest, you can upload documents to the Blob storage container and run the indexer from the Azure Portal to ingest new documents. 
## Enabling authentication diff --git a/docs/multimodal.md b/docs/multimodal.md index b547cc1c37..24b7fb656f 100644 --- a/docs/multimodal.md +++ b/docs/multimodal.md @@ -112,5 +112,4 @@ and you may still see good results with just text inputs, since the inputs conta ## Compatibility -* This feature is **not** compatible with [integrated vectorization](./deploy_features.md#enabling-integrated-vectorization), as the currently configured built-in skills do not process images or store image embeddings. Azure AI Search does now offer built-in skills for multimodal support, as demonstrated in [azure-ai-search-multimodal-sample](https://github.com/Azure-Samples/azure-ai-search-multimodal-sample), but we have not integrated them in this project. Instead, we are working on making a custom skill based off the data ingestion code in this repository, and hosting that skill on Azure Functions. Stay tuned to the releases to find out when that's available. * This feature *is* compatible with the [reasoning models](./reasoning.md) feature, as long as you use a model that [supports image inputs](https://learn.microsoft.com/azure/ai-services/openai/how-to/reasoning?tabs=python-secure%2Cpy#api--feature-support). diff --git a/infra/app/functions-app.bicep b/infra/app/functions-app.bicep new file mode 100644 index 0000000000..f1171cf269 --- /dev/null +++ b/infra/app/functions-app.bicep @@ -0,0 +1,196 @@ +// Single function app module +param name string +param location string = resourceGroup().location +param tags object = {} +@description('Name of an existing Application Insights component. 
Leave empty to disable.') +param applicationInsightsName string +param appServicePlanId string +param appSettings object = {} +param runtimeName string +param runtimeVersion string +param storageAccountName string +param deploymentStorageContainerName string +param instanceMemoryMB int = 2048 +param maximumInstanceCount int = 10 +param identityId string +param identityClientId string + +// Authorization parameters +@description('The Entra ID application (client) ID for App Service Authentication') +param authClientId string + +@description('The Entra ID identifier URI for App Service Authentication') +param authIdentifierUri string + +@description('The Azure AD tenant ID for App Service Authentication') +param authTenantId string + +@description('The application client ID of the Search service user-assigned managed identity') +param searchUserAssignedIdentityClientId string + +// AVM expects authentication.type values: SystemAssignedIdentity | UserAssignedIdentity | StorageAccountConnectionString +// Use UserAssignedIdentity for per-function user-assigned managed identity deployment storage access. 
+var identityType = 'UserAssignedIdentity' +var kind = 'functionapp,linux' +var applicationInsightsIdentity = 'ClientId=${identityClientId};Authorization=AAD' + +// Reference existing resources +resource stg 'Microsoft.Storage/storageAccounts@2023-05-01' existing = { + name: storageAccountName +} + +resource applicationInsights 'Microsoft.Insights/components@2020-02-02' existing = if (!empty(applicationInsightsName)) { + name: applicationInsightsName +} + +// Create base application settings (independent of Application Insights) +var baseAppSettings = { + // Storage credentials for AzureWebJobsStorage + AzureWebJobsStorage__credential: 'managedidentity' + AzureWebJobsStorage__clientId: identityClientId + AzureWebJobsStorage__blobServiceUri: stg.properties.primaryEndpoints.blob + AzureWebJobsStorage__queueServiceUri: stg.properties.primaryEndpoints.queue + AzureWebJobsStorage__tableServiceUri: stg.properties.primaryEndpoints.table + FUNCTIONS_EXTENSION_VERSION: '~4' + AZURE_CLIENT_ID: identityClientId +} + +// Optional Application Insights settings +var appInsightsSettings = !empty(applicationInsightsName) ? { + APPLICATIONINSIGHTS_AUTHENTICATION_STRING: applicationInsightsIdentity + APPLICATIONINSIGHTS_CONNECTION_STRING: applicationInsights.?properties.ConnectionString ?? 
'' +} : {} + +var easyAuthSettings = { + OVERRIDE_USE_MI_FIC_ASSERTION_CLIENTID: identityClientId + WEBSITE_AUTH_PRM_DEFAULT_WITH_SCOPES: '${authIdentifierUri}/user_impersonation' + WEBSITE_AUTH_AAD_ALLOWED_TENANTS: authTenantId +} + +// Merge all app settings +var allAppSettings = union(appSettings, baseAppSettings, appInsightsSettings, easyAuthSettings) + +// Create Flex Consumption Function App using AVM +module functionApp 'br/public:avm/res/web/site:0.15.1' = { + name: '${name}-func-app' + params: { + kind: kind + name: name + location: location + tags: tags + serverFarmResourceId: appServicePlanId + managedIdentities: { + userAssignedResourceIds: [ + '${identityId}' + ] + } + functionAppConfig: { + deployment: { + storage: { + type: 'blobContainer' + value: '${stg.properties.primaryEndpoints.blob}${deploymentStorageContainerName}' + authentication: { + type: identityType + userAssignedIdentityResourceId: identityId + } + } + } + scaleAndConcurrency: { + instanceMemoryMB: instanceMemoryMB + maximumInstanceCount: maximumInstanceCount + } + runtime: { + name: runtimeName + version: runtimeVersion + } + } + siteConfig: { + alwaysOn: false + functionAppScaleLimit: maximumInstanceCount + httpsOnly: true + ftpsState: 'Disabled' + cors: { + allowedOrigins: ['https://portal.azure.com'] + } + } + appSettingsKeyValuePairs: allAppSettings + } +} + +// Enable Easy Auth (App Service authentication) for Azure Search custom skill access when a skillAppId is provided. +// Based on Microsoft guidance: require authentication, return 401 on unauthenticated, allowed audience api://{applicationId}. 
+resource auth 'Microsoft.Web/sites/config@2022-03-01' = { + name: '${name}/authsettingsV2' + dependsOn: [ + functionApp // Ensure the Function App module completes before configuring authentication + ] + properties: { + globalValidation: { + requireAuthentication: true + unauthenticatedClientAction: 'Return401' + redirectToProvider: 'azureactivedirectory' + } + httpSettings: { + requireHttps: true + routes: { + apiPrefix: '/.auth' + } + forwardProxy: { + convention: 'NoProxy' + } + } + identityProviders: { + azureActiveDirectory: { + enabled: true + registration: { + openIdIssuer: '${environment().authentication.loginEndpoint}${authTenantId}/v2.0' + clientId: authClientId + clientSecretSettingName: 'OVERRIDE_USE_MI_FIC_ASSERTION_CLIENTID' + } + validation: { + jwtClaimChecks: {} + allowedAudiences: [ + authIdentifierUri + ] + defaultAuthorizationPolicy: { + allowedPrincipals: {} + allowedApplications: [searchUserAssignedIdentityClientId] + } + } + isAutoProvisioned: false + } + } + login: { + routes: { + logoutEndpoint: '/.auth/logout' + } + tokenStore: { + enabled: true + tokenRefreshExtensionHours: 72 + fileSystem: {} + azureBlobStorage: {} + } + preserveUrlFragmentsForLogins: false + allowedExternalRedirectUrls: [] + cookieExpiration: { + convention: 'FixedTime' + timeToExpiration: '08:00:00' + } + nonce: { + validateNonce: true + nonceExpirationInterval: '00:05:00' + } + } + platform: { + enabled: true + runtimeVersion: '~1' + } + } +} + +// Outputs +output name string = functionApp.outputs.name +output defaultHostname string = functionApp.outputs.defaultHostname +// Expose resourceId for downstream skill auth configuration +output resourceId string = functionApp.outputs.resourceId +output authEnabled bool = !empty(authClientId) && !empty(authIdentifierUri) diff --git a/infra/app/functions-rbac.bicep b/infra/app/functions-rbac.bicep new file mode 100644 index 0000000000..f0fe437482 --- /dev/null +++ b/infra/app/functions-rbac.bicep @@ -0,0 +1,121 @@ +// RBAC 
assignments for function apps +param principalId string +param storageResourceGroupName string +param searchServiceResourceGroupName string +param openAiResourceGroupName string +param documentIntelligenceResourceGroupName string +param visionServiceName string = '' +param visionResourceGroupName string = '' +param contentUnderstandingServiceName string = '' +param contentUnderstandingResourceGroupName string = '' +param useMultimodal bool + + +// Storage: Blob Data Reader (read content container) +module storageBlobReaderRole '../core/security/role.bicep' = { + scope: resourceGroup(storageResourceGroupName) + name: 'function-storage-blob-reader-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: '2a2b9908-6ea1-4ae2-8e65-a410df84e7d1' // Storage Blob Data Reader + principalType: 'ServicePrincipal' + } +} + +// Storage: Blob Data Contributor (write images container, deployment container) +module storageBlobContributorRole '../core/security/role.bicep' = { + scope: resourceGroup(storageResourceGroupName) + name: 'function-storage-blob-contributor-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: 'ba92f5b4-2d11-453d-a403-e96b0029c9fe' // Storage Blob Data Contributor + principalType: 'ServicePrincipal' + } +} + +// Storage: Queue Data Contributor (for AzureWebJobsStorage) +module storageQueueContributorRole '../core/security/role.bicep' = { + scope: resourceGroup(storageResourceGroupName) + name: 'function-storage-queue-contributor-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: '974c5e8b-45b9-4653-ba55-5f855dd0fb88' // Storage Queue Data Contributor + principalType: 'ServicePrincipal' + } +} + +// Storage: Table Data Contributor (for AzureWebJobsStorage) +module storageTableContributorRole '../core/security/role.bicep' = { + scope: resourceGroup(storageResourceGroupName) + name: 'function-storage-table-contributor-${uniqueString(principalId)}' + 
params: { + principalId: principalId + roleDefinitionId: '0a9a7e1f-b9d0-4cc4-a60d-0319b160aaa3' // Storage Table Data Contributor + principalType: 'ServicePrincipal' + } +} + +// Search: Index Data Contributor (write chunks to index) +module searchIndexContributorRole '../core/security/role.bicep' = { + scope: resourceGroup(searchServiceResourceGroupName) + name: 'function-search-index-contributor-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: '8ebe5a00-799e-43f5-93ac-243d3dce84a7' // Search Index Data Contributor + principalType: 'ServicePrincipal' + } +} + +// OpenAI: Cognitive Services OpenAI User +module openAiUserRole '../core/security/role.bicep' = { + scope: resourceGroup(openAiResourceGroupName) + name: 'function-openai-user-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: '5e0bd9bd-7b93-4f28-af87-19fc36ad61bd' // Cognitive Services OpenAI User + principalType: 'ServicePrincipal' + } +} + +// Document Intelligence: Cognitive Services User +module documentIntelligenceUserRole '../core/security/role.bicep' = { + scope: resourceGroup(documentIntelligenceResourceGroupName) + name: 'function-doc-intelligence-user-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: 'a97b65f3-24c7-4388-baec-2e87135dc908' // Cognitive Services User + principalType: 'ServicePrincipal' + } +} + +// Vision: Cognitive Services User (if multimodal) +module visionUserRole '../core/security/role.bicep' = if (useMultimodal && !empty(visionServiceName)) { + scope: resourceGroup(visionResourceGroupName) + name: 'function-vision-user-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: 'a97b65f3-24c7-4388-baec-2e87135dc908' // Cognitive Services User + principalType: 'ServicePrincipal' + } +} + +// Content Understanding: Cognitive Services User (if multimodal) +module contentUnderstandingUserRole '../core/security/role.bicep' = if 
(useMultimodal && !empty(contentUnderstandingServiceName)) { + scope: resourceGroup(contentUnderstandingResourceGroupName) + name: 'function-content-understanding-user-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: 'a97b65f3-24c7-4388-baec-2e87135dc908' // Cognitive Services User + principalType: 'ServicePrincipal' + } +} + +// Application Insights: Monitoring Metrics Publisher +module appInsightsMetricsPublisherRole '../core/security/role.bicep' = { + name: 'function-appinsights-metrics-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: '3913510d-42f4-4e42-8a64-420c390055eb' // Monitoring Metrics Publisher + principalType: 'ServicePrincipal' + } +} diff --git a/infra/app/functions.bicep b/infra/app/functions.bicep new file mode 100644 index 0000000000..b3c60e58c1 --- /dev/null +++ b/infra/app/functions.bicep @@ -0,0 +1,366 @@ +// Parameters for both function apps +param location string = resourceGroup().location +param tags object = {} +param applicationInsightsName string +param storageResourceGroupName string +param searchServiceResourceGroupName string +param openAiResourceGroupName string +param documentIntelligenceResourceGroupName string +param visionServiceName string = '' +param visionResourceGroupName string = '' +param contentUnderstandingServiceName string = '' +param contentUnderstandingResourceGroupName string = '' + +// App environment variables from main.bicep +param appEnvVariables object + +// Function App Names +param documentExtractorName string +param figureProcessorName string +param textProcessorName string +// OpenID issuer provided by main template (e.g. 
https://login.microsoftonline.com//v2.0) +param openIdIssuer string + +@description('The principal ID of the Search service user-assigned managed identity') +param searchUserAssignedIdentityClientId string + +var abbrs = loadJsonContent('../abbreviations.json') +var resourceToken = toLower(uniqueString(subscription().id, resourceGroup().id, location)) + +var documentExtractorRuntimeStorageName = '${abbrs.storageStorageAccounts}doc${take(resourceToken, 18)}' +var figureProcessorRuntimeStorageName = '${abbrs.storageStorageAccounts}fig${take(resourceToken, 18)}' +var textProcessorRuntimeStorageName = '${abbrs.storageStorageAccounts}txt${take(resourceToken, 18)}' + +var documentExtractorHostId = 'doc-skill-${take(resourceToken, 12)}' +var figureProcessorHostId = 'fig-skill-${take(resourceToken, 12)}' +var textProcessorHostId = 'txt-skill-${take(resourceToken, 12)}' + +var runtimeStorageRoles = [ + { + suffix: 'blob' + roleDefinitionId: 'ba92f5b4-2d11-453d-a403-e96b0029c9fe' + } + { + suffix: 'queue' + roleDefinitionId: '974c5e8b-45b9-4653-ba55-5f855dd0fb88' + } + { + suffix: 'table' + roleDefinitionId: '0a9a7e1f-b9d0-4cc4-a60d-0319b160aaa3' + } +] + +// Deployment storage container name (same name used in each function's storage account) +var deploymentContainerName = 'app-package-deployment' + +// Runtime storage accounts per function (Flex Consumption requirement) +module documentExtractorRuntimeStorageAccount '../core/storage/storage-account.bicep' = { + name: 'doc-extractor-runtime-storage' + params: { + name: documentExtractorRuntimeStorageName + location: location + tags: tags + allowBlobPublicAccess: false + containers: [ + { + name: deploymentContainerName + } + ] + } +} + +module figureProcessorRuntimeStorageAccount '../core/storage/storage-account.bicep' = { + name: 'figure-processor-runtime-storage' + params: { + name: figureProcessorRuntimeStorageName + location: location + tags: tags + allowBlobPublicAccess: false + containers: [ + { + name: 
deploymentContainerName + } + ] + } +} + +module textProcessorRuntimeStorageAccount '../core/storage/storage-account.bicep' = { + name: 'text-processor-runtime-storage' + params: { + name: textProcessorRuntimeStorageName + location: location + tags: tags + allowBlobPublicAccess: false + containers: [ + { + name: deploymentContainerName + } + ] + } +} + +resource documentExtractorRuntimeStorage 'Microsoft.Storage/storageAccounts@2024-01-01' existing = { + name: documentExtractorRuntimeStorageName +} + +resource figureProcessorRuntimeStorage 'Microsoft.Storage/storageAccounts@2024-01-01' existing = { + name: figureProcessorRuntimeStorageName +} + +resource textProcessorRuntimeStorage 'Microsoft.Storage/storageAccounts@2024-01-01' existing = { + name: textProcessorRuntimeStorageName +} + +resource documentExtractorRuntimeStorageRoles 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for role in runtimeStorageRoles: { + name: guid(documentExtractorRuntimeStorage.id, role.roleDefinitionId, 'doc-storage-roles') + scope: documentExtractorRuntimeStorage + properties: { + principalId: functionsUserIdentity.outputs.principalId + principalType: 'ServicePrincipal' + roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', role.roleDefinitionId) + } + dependsOn: [ + documentExtractorRuntimeStorageAccount + ] +}] + +resource figureProcessorRuntimeStorageRoles 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for role in runtimeStorageRoles: { + name: guid(figureProcessorRuntimeStorage.id, role.roleDefinitionId, 'figure-storage-roles') + scope: figureProcessorRuntimeStorage + properties: { + principalId: functionsUserIdentity.outputs.principalId + principalType: 'ServicePrincipal' + roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', role.roleDefinitionId) + } + dependsOn: [ + figureProcessorRuntimeStorageAccount + ] +}] + +resource textProcessorRuntimeStorageRoles 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for role in 
runtimeStorageRoles: { + name: guid(textProcessorRuntimeStorage.id, role.roleDefinitionId, 'text-storage-roles') + scope: textProcessorRuntimeStorage + properties: { + principalId: functionsUserIdentity.outputs.principalId + principalType: 'ServicePrincipal' + roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', role.roleDefinitionId) + } + dependsOn: [ + textProcessorRuntimeStorageAccount + ] +}] + +// Flex Consumption supports only one Function App per plan; create a dedicated plan per ingestion function +module documentExtractorPlan 'br/public:avm/res/web/serverfarm:0.1.1' = { + name: 'doc-extractor-plan' + params: { + name: '${abbrs.webServerFarms}doc-extractor-${resourceToken}' + sku: { + name: 'FC1' + tier: 'FlexConsumption' + } + reserved: true + location: location + tags: tags + } +} + +module figureProcessorPlan 'br/public:avm/res/web/serverfarm:0.1.1' = { + name: 'figure-processor-plan' + params: { + name: '${abbrs.webServerFarms}figure-processor-${resourceToken}' + sku: { + name: 'FC1' + tier: 'FlexConsumption' + } + reserved: true + location: location + tags: tags + } +} + +module textProcessorPlan 'br/public:avm/res/web/serverfarm:0.1.1' = { + name: 'text-processor-plan' + params: { + name: '${abbrs.webServerFarms}text-processor-${resourceToken}' + sku: { + name: 'FC1' + tier: 'FlexConsumption' + } + reserved: true + location: location + tags: tags + } +} + + +module functionsUserIdentity 'br/public:avm/res/managed-identity/user-assigned-identity:0.4.1' = { + name: 'functions-user-identity' + params: { + location: location + tags: tags + name: 'functions-user-identity-${resourceToken}' + } +} + + + +// Document Extractor Function App +// App registration for document extractor (uses function identity principalId as FIC subject) +module documentExtractorAppReg '../core/auth/appregistration.bicep' = { + name: 'doc-extractor-appreg' + params: { + appUniqueName: '${documentExtractorName}-appreg' + cloudEnvironment: environment().name + 
webAppIdentityId: functionsUserIdentity.outputs.principalId + clientAppName: '${documentExtractorName}-app' + clientAppDisplayName: '${documentExtractorName} Entra App' + issuer: openIdIssuer + webAppEndpoint: 'https://${documentExtractorName}.azurewebsites.net' + } +} + +module documentExtractor 'functions-app.bicep' = { + name: 'document-extractor-func' + params: { + name: documentExtractorName + location: location + tags: union(tags, { 'azd-service-name': 'document-extractor' }) + applicationInsightsName: applicationInsightsName + appServicePlanId: documentExtractorPlan.outputs.resourceId + runtimeName: 'python' + runtimeVersion: '3.11' + identityId: functionsUserIdentity.outputs.resourceId + identityClientId: functionsUserIdentity.outputs.clientId + authClientId: documentExtractorAppReg.outputs.clientAppId + authIdentifierUri: documentExtractorAppReg.outputs.identifierUri + authTenantId: tenant().tenantId + searchUserAssignedIdentityClientId: searchUserAssignedIdentityClientId + storageAccountName: documentExtractorRuntimeStorageName + deploymentStorageContainerName: deploymentContainerName + appSettings: union(appEnvVariables, { + AzureFunctionsWebHost__hostid: documentExtractorHostId + }) + instanceMemoryMB: 4096 // High memory for document processing + maximumInstanceCount: 100 + } + dependsOn: [ + documentExtractorRuntimeStorageAccount + ] +} + +// Figure Processor Function App +module figureProcessorAppReg '../core/auth/appregistration.bicep' = { + name: 'figure-processor-appreg' + params: { + appUniqueName: '${figureProcessorName}-app' + cloudEnvironment: environment().name + webAppIdentityId: functionsUserIdentity.outputs.principalId + clientAppName: 'skill-${figureProcessorName}' + clientAppDisplayName: 'skill-${figureProcessorName}' + issuer: openIdIssuer + webAppEndpoint: 'https://${figureProcessorName}.azurewebsites.net' + } +} + +module figureProcessor 'functions-app.bicep' = { + name: 'figure-processor-func' + params: { + name: figureProcessorName 
+ location: location + tags: union(tags, { 'azd-service-name': 'figure-processor' }) + applicationInsightsName: applicationInsightsName + appServicePlanId: figureProcessorPlan.outputs.resourceId + runtimeName: 'python' + runtimeVersion: '3.11' + storageAccountName: figureProcessorRuntimeStorageName + deploymentStorageContainerName: deploymentContainerName + identityId: functionsUserIdentity.outputs.resourceId + identityClientId: functionsUserIdentity.outputs.clientId + authClientId: figureProcessorAppReg.outputs.clientAppId + authIdentifierUri: figureProcessorAppReg.outputs.identifierUri + authTenantId: tenant().tenantId + searchUserAssignedIdentityClientId: searchUserAssignedIdentityClientId + appSettings: union(appEnvVariables, { + AzureFunctionsWebHost__hostid: figureProcessorHostId + }) + instanceMemoryMB: 2048 + maximumInstanceCount: 100 + } + dependsOn: [ + figureProcessorRuntimeStorageAccount + ] +} + +// Text Processor Function App +module textProcessorAppReg '../core/auth/appregistration.bicep' = { + name: 'text-processor-appreg' + params: { + appUniqueName: '${textProcessorName}-app' + cloudEnvironment: environment().name + webAppIdentityId: functionsUserIdentity.outputs.principalId + clientAppName: 'skill-${textProcessorName}' + clientAppDisplayName: 'skill-${textProcessorName}' + issuer: openIdIssuer + webAppEndpoint: 'https://${textProcessorName}.azurewebsites.net' + } +} + +module textProcessor 'functions-app.bicep' = { + name: 'text-processor-func' + params: { + name: textProcessorName + location: location + tags: union(tags, { 'azd-service-name': 'text-processor' }) + applicationInsightsName: applicationInsightsName + appServicePlanId: textProcessorPlan.outputs.resourceId + runtimeName: 'python' + runtimeVersion: '3.11' + storageAccountName: textProcessorRuntimeStorageName + deploymentStorageContainerName: deploymentContainerName + identityId: functionsUserIdentity.outputs.resourceId + identityClientId: functionsUserIdentity.outputs.clientId + 
authClientId: textProcessorAppReg.outputs.clientAppId + authIdentifierUri: textProcessorAppReg.outputs.identifierUri + authTenantId: tenant().tenantId + searchUserAssignedIdentityClientId: searchUserAssignedIdentityClientId + appSettings: union(appEnvVariables, { + AzureFunctionsWebHost__hostid: textProcessorHostId + }) + instanceMemoryMB: 2048 // Standard memory for embedding + maximumInstanceCount: 100 + } + dependsOn: [ + textProcessorRuntimeStorageAccount + ] +} + +// RBAC: role assignments for the user-assigned identity shared by all three function apps +module functionsIdentityRBAC 'functions-rbac.bicep' = { + name: 'doc-extractor-rbac' + params: { + principalId: functionsUserIdentity.outputs.principalId + storageResourceGroupName: storageResourceGroupName + searchServiceResourceGroupName: searchServiceResourceGroupName + openAiResourceGroupName: openAiResourceGroupName + documentIntelligenceResourceGroupName: documentIntelligenceResourceGroupName + visionServiceName: visionServiceName + visionResourceGroupName: visionResourceGroupName + contentUnderstandingServiceName: contentUnderstandingServiceName + contentUnderstandingResourceGroupName: contentUnderstandingResourceGroupName + useMultimodal: bool(appEnvVariables.USE_MULTIMODAL) + } +} + + +// Outputs +output documentExtractorName string = documentExtractor.outputs.name +output documentExtractorUrl string = documentExtractor.outputs.defaultHostname +output figureProcessorName string = figureProcessor.outputs.name +output figureProcessorUrl string = figureProcessor.outputs.defaultHostname +output textProcessorName string = textProcessor.outputs.name +output textProcessorUrl string = textProcessor.outputs.defaultHostname +// Identifier URIs for each function app (used as auth_resource_id for managed-identity-secured skills) +output documentExtractorAuthIdentifierUri string = documentExtractorAppReg.outputs.identifierUri +output figureProcessorAuthIdentifierUri string = figureProcessorAppReg.outputs.identifierUri +output textProcessorAuthIdentifierUri string = 
textProcessorAppReg.outputs.identifierUri diff --git a/infra/app/storage-containers.bicep b/infra/app/storage-containers.bicep new file mode 100644 index 0000000000..bc3a45c13c --- /dev/null +++ b/infra/app/storage-containers.bicep @@ -0,0 +1,24 @@ +targetScope = 'resourceGroup' + +@description('Name of existing storage account to add deployment containers to') +param storageAccountName string +@description('List of container names to ensure exist') +param containerNames array + +// Existing storage account +resource stg 'Microsoft.Storage/storageAccounts@2023-05-01' existing = { + name: storageAccountName +} + +// Existing blob service +resource blob 'Microsoft.Storage/storageAccounts/blobServices@2023-05-01' existing = { + name: 'default' + parent: stg +} + +// Create each container (no public access, default properties) +resource containers 'Microsoft.Storage/storageAccounts/blobServices/containers@2023-05-01' = [for c in containerNames: { + name: c + parent: blob + properties: {} +}] diff --git a/infra/bicepconfig.json b/infra/bicepconfig.json new file mode 100644 index 0000000000..cd15f3f32a --- /dev/null +++ b/infra/bicepconfig.json @@ -0,0 +1,5 @@ +{ + "extensions": { + "microsoftGraphV1": "br:mcr.microsoft.com/bicep/extensions/microsoftgraph/v1.0:1.0.0" + } +} diff --git a/infra/core/auth/appregistration.bicep b/infra/core/auth/appregistration.bicep new file mode 100644 index 0000000000..baf8076de3 --- /dev/null +++ b/infra/core/auth/appregistration.bicep @@ -0,0 +1,116 @@ +extension microsoftGraphV1 + +@description('Specifies the name of cloud environment to run this deployment in.') +param cloudEnvironment string = environment().name + +@description('The unique name for the application registration (used for idempotency)') +param appUniqueName string + +// NOTE: Microsoft Graph Bicep file deployment is only supported in Public Cloud +@description('Audience uris for public and national clouds') +param audiences object = { + AzureCloud: { + uri: 
'api://AzureADTokenExchange' + } + AzureUSGovernment: { + uri: 'api://AzureADTokenExchangeUSGov' + } + USNat: { + uri: 'api://AzureADTokenExchangeUSNat' + } + USSec: { + uri: 'api://AzureADTokenExchangeUSSec' + } + AzureChinaCloud: { + uri: 'api://AzureADTokenExchangeChina' + } +} + +@description('Specifies the ID of the user-assigned managed identity.') +param webAppIdentityId string + +@description('Specifies the unique name for the client application.') +param clientAppName string + +@description('Specifies the display name for the client application') +param clientAppDisplayName string + +param serviceManagementReference string = '' + +param issuer string + +param webAppEndpoint string + +// Define the default user_impersonation OAuth2 scope exposed by the API +var defaultScopeValue = 'user_impersonation' +var defaultScopeId = guid(appUniqueName, 'default-scope', defaultScopeValue) + +var userImpersonationScope = { + adminConsentDescription: 'Allow the application to access the API on behalf of the signed-in user' + adminConsentDisplayName: 'Access application as user' + id: defaultScopeId + isEnabled: true + type: 'User' + userConsentDescription: 'Allow the application to access the API on behalf of the signed-in user' + userConsentDisplayName: 'Access application as user' + value: defaultScopeValue +} + +var allScopes = [ + userImpersonationScope +] + +// TODO(review): confirm the search service accepts this identifier URI as an auth resource ID; otherwise it must be configured out of band +var identifierUri = 'api://${appUniqueName}-${uniqueString(subscription().id, resourceGroup().id, appUniqueName)}' + +resource appRegistration 'Microsoft.Graph/applications@v1.0' = { + uniqueName: clientAppName + displayName: clientAppDisplayName + signInAudience: 'AzureADMyOrg' + serviceManagementReference: empty(serviceManagementReference) ? 
null : serviceManagementReference + identifierUris: [identifierUri] + api: { + oauth2PermissionScopes: allScopes + requestedAccessTokenVersion: 2 + // Not doing preauthorized apps + } + web: { + redirectUris: [ + '${webAppEndpoint}/.auth/login/aad/callback' + ] + implicitGrantSettings: { enableIdTokenIssuance: true } + } + requiredResourceAccess: [ + { + // Microsoft Graph permissions + resourceAppId: '00000003-0000-0000-c000-000000000000' + resourceAccess: [ + { + // User.Read delegated permission + id: 'e1fe6dd8-ba31-4d61-89e7-88639da4683d' + type: 'Scope' + } + ] + } + ] + +} + +resource appServicePrincipal 'Microsoft.Graph/servicePrincipals@v1.0' = { + appId: appRegistration.appId +} + +resource federatedIdentityCredential 'Microsoft.Graph/applications/federatedIdentityCredentials@v1.0' = { + name: '${appRegistration.uniqueName}/miAsFic' + audiences: [ + audiences[cloudEnvironment].uri + ] + issuer: issuer + subject: webAppIdentityId +} + +output clientAppId string = appRegistration.appId +output clientSpId string = appServicePrincipal.id + +@description('The identifier URI of the application - returns the actual URI that was set') +output identifierUri string = identifierUri diff --git a/infra/core/search/search-services.bicep b/infra/core/search/search-services.bicep index 4ee8d6a8fb..9d5f887aa5 100644 --- a/infra/core/search/search-services.bicep +++ b/infra/core/search/search-services.bicep @@ -33,11 +33,20 @@ param semanticSearch string = 'disabled' param sharedPrivateLinkStorageAccounts array = [] +resource searchIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-01-31' = if (sku.name != 'free') { + name: '${name}-identity' + location: location + tags: tags +} + var searchIdentityProvider = (sku.name == 'free') ? 
null : { - type: 'SystemAssigned' + type: 'SystemAssigned, UserAssigned' + userAssignedIdentities: { + '${searchIdentity.id}': {} + } } -resource search 'Microsoft.Search/searchServices@2023-11-01' = { +resource search 'Microsoft.Search/searchServices@2025-05-01' = { name: name location: location tags: tags @@ -55,7 +64,7 @@ resource search 'Microsoft.Search/searchServices@2023-11-01' = { } sku: sku - resource sharedPrivateLinkResource 'sharedPrivateLinkResources@2023-11-01' = [for (resourceId, i) in sharedPrivateLinkStorageAccounts: { + resource sharedPrivateLinkResource 'sharedPrivateLinkResources@2025-05-01' = [for (resourceId, i) in sharedPrivateLinkStorageAccounts: { name: 'search-shared-private-link-${i}' properties: { groupId: 'blob' @@ -70,4 +79,8 @@ resource search 'Microsoft.Search/searchServices@2023-11-01' = { output id string = search.id output endpoint string = 'https://${name}.search.windows.net/' output name string = search.name -output principalId string = !empty(searchIdentityProvider) ? search.identity.principalId : '' +output systemAssignedPrincipalId string = (sku.name != 'free') ? search.identity.principalId : '' +output userAssignedPrincipalId string = (sku.name != 'free') ? (searchIdentity.?properties.?principalId ?? '') : '' +output userAssignedIdentityId string = (sku.name != 'free') ? (searchIdentity.?id ?? '') : '' +output userAssignedIdentityClientId string = (sku.name != 'free') ? (searchIdentity.?properties.?clientId ?? '') : '' +output userAssignedIdentityResourceId string = (sku.name != 'free') ? (searchIdentity.?id ?? 
'') : '' diff --git a/infra/main.bicep b/infra/main.bicep index 65b4893662..811dba0390 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -129,6 +129,7 @@ param speechServiceSkuName string // Set in main.parameters.json param speechServiceVoice string = '' param useMultimodal bool = false param useEval bool = false +param useCloudIngestion bool = false @allowed(['free', 'provisioned', 'serverless']) param cosmosDbSkuName string // Set in main.parameters.json @@ -655,6 +656,31 @@ module acaAuth 'core/host/container-apps-auth.bicep' = if (deploymentTarget == ' } } +// Optional Azure Functions for document ingestion and processing +module functions 'app/functions.bicep' = if (useCloudIngestion) { + name: 'functions' + scope: resourceGroup + params: { + location: location + tags: tags + applicationInsightsName: useApplicationInsights ? monitoring!.outputs.applicationInsightsName : '' + storageResourceGroupName: storageResourceGroup.name + searchServiceResourceGroupName: searchServiceResourceGroup.name + openAiResourceGroupName: openAiResourceGroup.name + documentIntelligenceResourceGroupName: documentIntelligenceResourceGroup.name + visionServiceName: useMultimodal ? vision!.outputs.name : '' + visionResourceGroupName: useMultimodal ? visionResourceGroup.name : resourceGroup.name + contentUnderstandingServiceName: useMediaDescriberAzureCU ? contentUnderstanding!.outputs.name : '' + contentUnderstandingResourceGroupName: useMediaDescriberAzureCU ? 
contentUnderstandingResourceGroup.name : resourceGroup.name + documentExtractorName: '${abbrs.webSitesFunctions}doc-extractor-${resourceToken}' + figureProcessorName: '${abbrs.webSitesFunctions}figure-processor-${resourceToken}' + textProcessorName: '${abbrs.webSitesFunctions}text-processor-${resourceToken}' + openIdIssuer: authenticationIssuerUri + appEnvVariables: appEnvVariables + searchUserAssignedIdentityClientId: searchService.outputs.userAssignedIdentityClientId + } +} + var defaultOpenAiDeployments = [ { name: chatGpt.deploymentName @@ -1125,7 +1151,7 @@ module openAiRoleSearchService 'core/security/role.bicep' = if (isAzureOpenAiHos scope: openAiResourceGroup name: 'openai-role-searchservice' params: { - principalId: searchService.outputs.principalId + principalId: searchService.outputs.systemAssignedPrincipalId roleDefinitionId: '5e0bd9bd-7b93-4f28-af87-19fc36ad61bd' principalType: 'ServicePrincipal' } @@ -1135,7 +1161,7 @@ module visionRoleSearchService 'core/security/role.bicep' = if (useMultimodal) { scope: visionResourceGroup name: 'vision-role-searchservice' params: { - principalId: searchService.outputs.principalId + principalId: searchService.outputs.systemAssignedPrincipalId roleDefinitionId: 'a97b65f3-24c7-4388-baec-2e87135dc908' principalType: 'ServicePrincipal' } @@ -1165,11 +1191,12 @@ module storageOwnerRoleBackend 'core/security/role.bicep' = if (useUserUpload) { } } -module storageRoleSearchService 'core/security/role.bicep' = if (useIntegratedVectorization) { +// Search service needs blob read access for both integrated vectorization and cloud ingestion indexer data source +module storageRoleSearchService 'core/security/role.bicep' = if (useIntegratedVectorization || useCloudIngestion) { scope: storageResourceGroup name: 'storage-role-searchservice' params: { - principalId: searchService.outputs.principalId + principalId: searchService.outputs.systemAssignedPrincipalId roleDefinitionId: '2a2b9908-6ea1-4ae2-8e65-a410df84e7d1' // Storage 
Blob Data Reader principalType: 'ServicePrincipal' } @@ -1179,7 +1206,7 @@ module storageRoleContributorSearchService 'core/security/role.bicep' = if (useI scope: storageResourceGroup name: 'storage-role-contributor-searchservice' params: { - principalId: searchService.outputs.principalId + principalId: searchService.outputs.systemAssignedPrincipalId roleDefinitionId: 'ba92f5b4-2d11-453d-a403-e96b0029c9fe' // Storage Blob Data Contributor principalType: 'ServicePrincipal' } @@ -1432,8 +1459,8 @@ output AZURE_SEARCH_AGENT string = searchAgentName output AZURE_SEARCH_SERVICE string = searchService.outputs.name output AZURE_SEARCH_SERVICE_RESOURCE_GROUP string = searchServiceResourceGroup.name output AZURE_SEARCH_SEMANTIC_RANKER string = actualSearchServiceSemanticRankerLevel -output AZURE_SEARCH_SERVICE_ASSIGNED_USERID string = searchService.outputs.principalId output AZURE_SEARCH_FIELD_NAME_EMBEDDING string = searchFieldNameEmbedding +output AZURE_SEARCH_USER_ASSIGNED_IDENTITY_RESOURCE_ID string = searchService.outputs.userAssignedIdentityResourceId output AZURE_COSMOSDB_ACCOUNT string = (useAuthentication && useChatHistoryCosmos) ? cosmosDb.outputs.name : '' output AZURE_CHAT_HISTORY_DATABASE string = chatHistoryDatabaseName @@ -1450,6 +1477,15 @@ output AZURE_USERSTORAGE_RESOURCE_GROUP string = storageResourceGroup.name output AZURE_IMAGESTORAGE_CONTAINER string = useMultimodal ? imageStorageContainerName : '' +// Cloud ingestion function skill endpoints & resource IDs +output DOCUMENT_EXTRACTOR_SKILL_ENDPOINT string = useCloudIngestion ? 'https://${functions!.outputs.documentExtractorUrl}/api/extract' : '' +output FIGURE_PROCESSOR_SKILL_ENDPOINT string = useCloudIngestion ? 'https://${functions!.outputs.figureProcessorUrl}/api/process' : '' +output TEXT_PROCESSOR_SKILL_ENDPOINT string = useCloudIngestion ? 
'https://${functions!.outputs.textProcessorUrl}/api/process' : '' +// Identifier URI used as authResourceId for all custom skill endpoints +output DOCUMENT_EXTRACTOR_SKILL_AUTH_RESOURCE_ID string = useCloudIngestion ? functions!.outputs.documentExtractorAuthIdentifierUri : '' +output FIGURE_PROCESSOR_SKILL_AUTH_RESOURCE_ID string = useCloudIngestion ? functions!.outputs.figureProcessorAuthIdentifierUri : '' +output TEXT_PROCESSOR_SKILL_AUTH_RESOURCE_ID string = useCloudIngestion ? functions!.outputs.textProcessorAuthIdentifierUri : '' + output AZURE_AI_PROJECT string = useAiProject ? ai.outputs.projectName : '' output AZURE_USE_AUTHENTICATION bool = useAuthentication diff --git a/infra/main.parameters.json b/infra/main.parameters.json index 606bbb4915..19170c1ff4 100644 --- a/infra/main.parameters.json +++ b/infra/main.parameters.json @@ -206,6 +206,9 @@ "useEval": { "value": "${USE_EVAL=false}" }, + "useCloudIngestion": { + "value": "${USE_CLOUD_INGESTION=false}" + }, "enableLanguagePicker": { "value": "${ENABLE_LANGUAGE_PICKER=false}" }, diff --git a/pyproject.toml b/pyproject.toml index 44072581ee..45d41bfd03 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ target-version = "py310" lint.select = ["E", "F", "I", "UP"] lint.ignore = ["E501", "E701", "UP045"] # line too long, multiple statements on one line, keep Optional[X] -src = ["app/backend", "scripts"] +src = ["app/backend", "scripts", "app/functions"] [tool.ruff.lint.isort] known-local-folder = ["scripts"] @@ -12,7 +12,7 @@ line-length = 120 [tool.pytest.ini_options] addopts = "-ra" -pythonpath = ["app/backend", "scripts"] +pythonpath = ["app/backend", "scripts", "app/functions"] asyncio_default_fixture_loop_scope = "function" [tool.coverage.paths] diff --git a/scripts/copy_prepdocslib.py b/scripts/copy_prepdocslib.py new file mode 100644 index 0000000000..676ba5420e --- /dev/null +++ b/scripts/copy_prepdocslib.py @@ -0,0 +1,50 @@ +"""Synchronize prepdocslib ingestion library to function 
apps. +This script ensures that the latest version of the prepdocslib library used +by backend ingestion is copied into each of the Azure Function apps that +also rely on this library. + +Steps: +1. Copy `prepdocslib` into each function directory. +2. Overwrite each function's `requirements.txt` with backend `requirements.txt`. +""" + +import shutil +from pathlib import Path + + +def copy_tree(src: Path, dest: Path) -> None: + if dest.exists(): + shutil.rmtree(dest) + shutil.copytree(src, dest) + + +def main() -> None: + repo_root = Path(__file__).resolve().parent.parent + prep_source = repo_root / "app" / "backend" / "prepdocslib" + if not prep_source.exists(): + raise RuntimeError(f"Source prepdocslib directory not found: {prep_source}") + + backend_requirements = repo_root / "app" / "backend" / "requirements.txt" + if not backend_requirements.exists(): + raise RuntimeError(f"Backend requirements file not found: {backend_requirements}") + + targets = [ + repo_root / "app" / "functions" / "document_extractor" / "prepdocslib", + repo_root / "app" / "functions" / "figure_processor" / "prepdocslib", + repo_root / "app" / "functions" / "text_processor" / "prepdocslib", + ] + + for target in targets: + func_dir = target.parent + func_dir.mkdir(parents=True, exist_ok=True) + + # 1. Library sync + copy_tree(prep_source, target) + + # 2. Overwrite requirements.txt directly + overwrite_req = func_dir / "requirements.txt" + shutil.copy2(backend_requirements, overwrite_req) + + +if __name__ == "__main__": + main() diff --git a/scripts/prepdocs.ps1 b/scripts/prepdocs.ps1 index 6c9eddec19..d21329e5bd 100755 --- a/scripts/prepdocs.ps1 +++ b/scripts/prepdocs.ps1 @@ -1,3 +1,9 @@ +$USE_CLOUD_INGESTION = (azd env get-value USE_CLOUD_INGESTION) +if ($USE_CLOUD_INGESTION -eq "true") { + Write-Host "Cloud ingestion is enabled, so we are not running the manual ingestion process." 
+ Exit 0 +} + ./scripts/load_python_env.ps1 $venvPythonPath = "./.venv/scripts/python.exe" diff --git a/scripts/prepdocs.sh b/scripts/prepdocs.sh index c0254755e0..352a34cf92 100755 --- a/scripts/prepdocs.sh +++ b/scripts/prepdocs.sh @@ -1,5 +1,11 @@ #!/bin/sh +USE_CLOUD_INGESTION=$(azd env get-value USE_CLOUD_INGESTION) +if [ "$USE_CLOUD_INGESTION" = "true" ]; then + echo "Cloud ingestion is enabled, so we are not running the manual ingestion process." + exit 0 +fi + . ./scripts/load_python_env.sh echo 'Running "prepdocs.py"' diff --git a/scripts/setup_cloud_ingestion.ps1 b/scripts/setup_cloud_ingestion.ps1 new file mode 100644 index 0000000000..4c5859d595 --- /dev/null +++ b/scripts/setup_cloud_ingestion.ps1 @@ -0,0 +1,14 @@ +$USE_CLOUD_INGESTION = (azd env get-value USE_CLOUD_INGESTION) +if ($USE_CLOUD_INGESTION -ne "true") { + Exit 0 +} + +. ./scripts/load_python_env.ps1 + +$venvPythonPath = "./.venv/scripts/python.exe" +if (Test-Path -Path "/usr") { + # fallback to Linux venv path + $venvPythonPath = "./.venv/bin/python" +} + +Start-Process -FilePath $venvPythonPath -ArgumentList "./app/backend/setup_cloud_ingestion.py" -Wait -NoNewWindow diff --git a/scripts/setup_cloud_ingestion.sh b/scripts/setup_cloud_ingestion.sh new file mode 100755 index 0000000000..37e5e068dc --- /dev/null +++ b/scripts/setup_cloud_ingestion.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +USE_CLOUD_INGESTION=$(azd env get-value USE_CLOUD_INGESTION) +if [ "$USE_CLOUD_INGESTION" != "true" ]; then + exit 0 +fi + +. 
./scripts/load_python_env.sh + +./.venv/bin/python ./app/backend/setup_cloud_ingestion.py diff --git a/tests/test_function_apps.py b/tests/test_function_apps.py new file mode 100644 index 0000000000..f2183acb8d --- /dev/null +++ b/tests/test_function_apps.py @@ -0,0 +1,1067 @@ +import base64 +import json +import logging +import os +from collections.abc import Iterable +from dataclasses import dataclass, field +from typing import Any + +import azure.functions as func +import pytest + +from document_extractor import function_app as document_extractor +from figure_processor import function_app as figure_processor +from prepdocslib.fileprocessor import FileProcessor +from prepdocslib.textparser import TextParser +from prepdocslib.textsplitter import SentenceTextSplitter +from tests.mocks import TEST_PNG_BYTES +from text_processor import function_app as text_processor + + +@dataclass +class ChunkStub: + page_num: int + text: str + images: list[Any] = field(default_factory=list) + + +@dataclass +class SectionStub: + chunk: ChunkStub + + +def build_request(payload: dict[str, Any]) -> func.HttpRequest: + """Construct an HttpRequest carrying the provided payload.""" + body = json.dumps(payload).encode("utf-8") + return func.HttpRequest( + method="POST", + url="http://localhost/api", + headers={}, + params={}, + body=body, + ) + + +def build_raw_request(body: bytes) -> func.HttpRequest: + """Construct an HttpRequest with a raw (non-JSON) payload.""" + return func.HttpRequest( + method="POST", + url="http://localhost/api", + headers={}, + params={}, + body=body, + ) + + +@pytest.mark.asyncio +async def test_document_extractor_emits_pages_and_figures(monkeypatch: pytest.MonkeyPatch) -> None: + """Document extractor returns pages with associated figures.""" + + class StubParser: + def __init__(self, pages: Iterable[Any]) -> None: + self._pages = list(pages) + + async def parse(self, content: Any): + for page in self._pages: + yield page + + placeholder = '
' + figure = figure_processor.ImageOnPage( + bytes=TEST_PNG_BYTES, + bbox=(10.0, 20.0, 30.0, 40.0), + filename="figure1.png", + figure_id="fig-1", + page_num=0, + placeholder=placeholder, + title="Drone Logo", + ) + page_text = f"# Heading\n\n{placeholder}\n\nConclusion." + page = document_extractor.Page(page_num=0, offset=0, text=page_text, images=[figure]) + + # Set up mock file processors and settings + mock_file_processors = { + ".pdf": FileProcessor(StubParser([page]), None), + } + + mock_settings = document_extractor.GlobalSettings( + file_processors=mock_file_processors, + azure_credential=object(), + ) + monkeypatch.setattr(document_extractor, "settings", mock_settings) + + request_payload = { + "values": [ + { + "recordId": "record-1", + "data": { + "file_data": {"$type": "file", "data": base64.b64encode(b"pdf-bytes").decode("utf-8")}, + "file_name": "sample.pdf", + "contentType": "application/pdf", + }, + } + ] + } + response = await document_extractor.extract_document(build_request(request_payload)) + + assert response.status_code == 200 + body = json.loads(response.get_body().decode("utf-8")) + values = body["values"] + assert len(values) == 1 + result = values[0] + assert result["recordId"] == "record-1" + + data = result["data"] + assert data["file_name"] == "sample.pdf" + assert data["pages"] == [ + {"page_num": 0, "text": page_text, "figure_ids": ["fig-1"]}, + ] + assert len(data["figures"]) == 1 + figure_entry = data["figures"][0] + assert figure_entry["figure_id"] == "fig-1" + assert figure_entry["document_file_name"] == "sample.pdf" + assert figure_entry["bbox"] == [10.0, 20.0, 30.0, 40.0] + assert figure_entry["bytes_base64"] == base64.b64encode(TEST_PNG_BYTES).decode("utf-8") + + +@pytest.mark.asyncio +async def test_document_extractor_requires_single_record(monkeypatch: pytest.MonkeyPatch) -> None: + mock_settings = document_extractor.GlobalSettings( + file_processors={".pdf": FileProcessor(None, None)}, + azure_credential=object(), + ) + 
monkeypatch.setattr(document_extractor, "settings", mock_settings) + response = await document_extractor.extract_document(build_request({"values": []})) + assert response.status_code == 500 + body = json.loads(response.get_body().decode("utf-8")) + assert body["error"] + + +@pytest.mark.asyncio +async def test_document_extractor_handles_processing_exception(monkeypatch: pytest.MonkeyPatch) -> None: + async def failing_process(data: dict[str, Any]) -> dict[str, Any]: + raise RuntimeError("boom") + + mock_settings = document_extractor.GlobalSettings( + file_processors={".pdf": FileProcessor(None, None)}, + azure_credential=object(), + ) + monkeypatch.setattr(document_extractor, "settings", mock_settings) + monkeypatch.setattr(document_extractor, "process_document", failing_process) + + payload = { + "values": [ + { + "recordId": "rec-error", + "data": { + "file_data": {"$type": "file", "data": base64.b64encode(b"pdf-bytes").decode("utf-8")}, + "file_name": "sample.pdf", + "contentType": "application/pdf", + }, + } + ] + } + + response = await document_extractor.extract_document(build_request(payload)) + assert response.status_code == 200 + values = json.loads(response.get_body().decode("utf-8"))["values"] + assert values[0]["errors"][0]["message"] == "boom" + + +@pytest.mark.asyncio +async def test_document_extractor_invalid_json_returns_error() -> None: + response = await document_extractor.extract_document(build_raw_request(b"not json")) + assert response.status_code == 500 + body = json.loads(response.get_body().decode("utf-8")) + assert "error" in body + + +@pytest.mark.asyncio +async def test_document_extractor_process_document_http_error(monkeypatch: pytest.MonkeyPatch) -> None: + class FailingParser: + async def parse(self, content): + raise document_extractor.HttpResponseError(message="fail") + yield # Make this an async generator + + mock_file_processors = { + ".pdf": FileProcessor(FailingParser(), None), + } + + mock_settings = 
document_extractor.GlobalSettings( + file_processors=mock_file_processors, + azure_credential=object(), + ) + monkeypatch.setattr(document_extractor, "settings", mock_settings) + + data = { + "file_data": {"data": base64.b64encode(b"content").decode("utf-8")}, + "file_name": "doc.pdf", + "contentType": "application/pdf", + } + + with pytest.raises(ValueError) as exc_info: + await document_extractor.process_document(data) + + assert "Parser failed" in str(exc_info.value) + + +def test_document_extractor_missing_file_data() -> None: + with pytest.raises(ValueError): + document_extractor.get_document_stream_filedata({"file_data": {}}) + + +def test_document_extractor_managed_identity_reload(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("AZURE_CLIENT_ID", "client-123") + document_extractor.configure_global_settings() + assert isinstance(document_extractor.settings.azure_credential, document_extractor.ManagedIdentityCredential) + monkeypatch.delenv("AZURE_CLIENT_ID", raising=False) + document_extractor.configure_global_settings() + + +@pytest.mark.asyncio +async def test_figure_processor_returns_enriched_metadata(monkeypatch: pytest.MonkeyPatch) -> None: + """Figure processor enriches images with URL and description.""" + + async def fake_process_page_image(*, image, document_filename: str, **kwargs: Any): + image.url = f"https://images.example.com/{document_filename}/{image.figure_id}.png" + image.description = f"Description for {image.figure_id}" + image.embedding = [0.11, 0.22, 0.33] + return image + + monkeypatch.setattr(figure_processor, "process_page_image", fake_process_page_image) + + # Create mock settings object + mock_settings = figure_processor.GlobalSettings( + blob_manager=object(), figure_processor=object(), image_embeddings=object() + ) + monkeypatch.setattr(figure_processor, "settings", mock_settings) + + figure = figure_processor.ImageOnPage( + bytes=TEST_PNG_BYTES, + bbox=(1.0, 2.0, 3.0, 4.0), + filename="figure1.png", + 
figure_id="fig-1", + page_num=0, + placeholder='
', + ) + figure_payload = figure.to_skill_payload("sample.pdf") + + request_payload = { + "values": [ + { + "recordId": "rec-1", + "data": figure_payload, + } + ] + } + + response = await figure_processor.process_figure_request(build_request(request_payload)) + + assert response.status_code == 200 + body = json.loads(response.get_body().decode("utf-8")) + values = body["values"] + assert len(values) == 1 + result = values[0] + assert result["recordId"] == "rec-1" + + data = result["data"] + assert data["figure_id"] == "fig-1" + assert data["url"] == "https://images.example.com/sample.pdf/fig-1.png" + assert data["description"] == "Description for fig-1" + assert data["embedding"] == [0.11, 0.22, 0.33] + assert "bytes_base64" not in data + + +@pytest.mark.asyncio +async def test_figure_processor_invalid_json_returns_error(monkeypatch: pytest.MonkeyPatch) -> None: + # Set up minimal mock settings so the function can proceed to JSON parsing + mock_settings = figure_processor.GlobalSettings(blob_manager=object(), figure_processor=None, image_embeddings=None) + monkeypatch.setattr(figure_processor, "settings", mock_settings) + + response = await figure_processor.process_figure_request(build_raw_request(b"not json")) + assert response.status_code == 400 + payload = json.loads(response.get_body().decode("utf-8")) + assert payload["error"] == "Invalid JSON payload" + + +def test_figure_processor_initialisation_with_env(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("AZURE_CLIENT_ID", "client-456") + monkeypatch.setenv("AZURE_STORAGE_ACCOUNT", "acct") + monkeypatch.setenv("AZURE_IMAGESTORAGE_CONTAINER", "images") + monkeypatch.setenv("USE_MULTIMODAL", "true") + monkeypatch.setenv("AZURE_OPENAI_SERVICE", "svc") + monkeypatch.setenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT", "deploy") + monkeypatch.setenv("AZURE_VISION_ENDPOINT", "https://vision") + + call_state: dict[str, Any] = {} + + class StubCredential: + def __init__(self, client_id: str | None = None): + 
call_state["credential_client_id"] = client_id + + def fake_setup_blob_manager(**kwargs: Any) -> str: + call_state["blob_manager_kwargs"] = kwargs + return "blob" + + def fake_setup_figure_processor(**kwargs: Any) -> str: + call_state["figure_processor_kwargs"] = kwargs + return "figproc" + + def fake_setup_openai_client( + *, + openai_host: Any, + azure_credential: Any, + azure_openai_service: str | None, + azure_openai_custom_url: str | None, + ) -> tuple[str, None]: + call_state["openai_client_args"] = { + "openai_host": openai_host, + "azure_credential": azure_credential, + "azure_openai_service": azure_openai_service, + "azure_openai_custom_url": azure_openai_custom_url, + } + return ("openai-client", None) + + def fake_get_bearer_token_provider(credential: Any, scope: str): + call_state["token_scope"] = scope + call_state["token_credential"] = credential + return lambda: "token" + + class DummyImageEmbeddings: + def __init__(self, endpoint: str, token_provider): + self.endpoint = endpoint + self.token_provider = token_provider + + monkeypatch.setattr(figure_processor, "ManagedIdentityCredential", StubCredential) + monkeypatch.setattr(figure_processor, "setup_blob_manager", fake_setup_blob_manager) + monkeypatch.setattr(figure_processor, "setup_figure_processor", fake_setup_figure_processor) + monkeypatch.setattr(figure_processor, "setup_openai_client", fake_setup_openai_client) + monkeypatch.setattr(figure_processor, "get_bearer_token_provider", fake_get_bearer_token_provider) + monkeypatch.setattr(figure_processor, "ImageEmbeddings", DummyImageEmbeddings) + monkeypatch.setattr(figure_processor, "settings", None) + + figure_processor.configure_global_settings() + + assert figure_processor.settings is not None + assert figure_processor.settings.blob_manager == "blob" + assert figure_processor.settings.figure_processor == "figproc" + embeddings = figure_processor.settings.image_embeddings + assert isinstance(embeddings, DummyImageEmbeddings) + assert 
embeddings.endpoint == "https://vision" + assert embeddings.token_provider() == "token" + + assert call_state["credential_client_id"] == "client-456" + assert call_state["blob_manager_kwargs"]["storage_account"] == "acct" + assert call_state["figure_processor_kwargs"]["use_multimodal"] is True + assert call_state["token_scope"] == "https://cognitiveservices.azure.com/.default" + assert isinstance(call_state["token_credential"], StubCredential) + assert call_state["openai_client_args"]["azure_openai_service"] == "svc" + assert call_state["openai_client_args"]["azure_credential"] is call_state["token_credential"] + + +def test_figure_processor_warns_when_openai_incomplete(monkeypatch: pytest.MonkeyPatch, caplog) -> None: + """Figure processor is created with warning when USE_MULTIMODAL is true but OpenAI config is incomplete.""" + monkeypatch.setenv("USE_MULTIMODAL", "true") + monkeypatch.setenv("AZURE_STORAGE_ACCOUNT", "acct") + monkeypatch.setenv("AZURE_IMAGESTORAGE_CONTAINER", "images") + # OpenAI config missing, so figure_processor will be created but won't work properly + figure_processor.configure_global_settings() + # A FigureProcessor object is created even with incomplete config + assert figure_processor.settings.figure_processor is not None + assert "USE_MULTIMODAL is true but Azure OpenAI configuration incomplete" in caplog.text + + +@pytest.mark.asyncio +async def test_text_processor_builds_chunk_with_caption(monkeypatch: pytest.MonkeyPatch) -> None: + """Text processor merges figure metadata and emits chunk with embeddings.""" + + class StubSplitter: + def split_pages(self, pages: list[Any]): + for page in pages: + yield ChunkStub(page_num=page.page_num, text=page.text) + + class StubEmbeddingService: + async def create_embeddings(self, texts: list[str]) -> list[list[float]]: + return [[0.41, 0.42, 0.43] for _ in texts] + + # Set up mock file processors with stub splitter + mock_file_processors = { + ".pdf": FileProcessor(TextParser(), StubSplitter()), + 
} + + # Set up mock settings + mock_settings = text_processor.GlobalSettings( + use_vectors=True, + use_multimodal=False, + embedding_dimensions=3, + file_processors=mock_file_processors, + embedding_service=StubEmbeddingService(), + ) + monkeypatch.setattr(text_processor, "settings", mock_settings) + + figure = figure_processor.ImageOnPage( + bytes=TEST_PNG_BYTES, + bbox=(5.0, 6.0, 7.0, 8.0), + filename="figure1.png", + figure_id="fig-1", + page_num=0, + placeholder='
', + title="Drone Logo", + url="https://images.example.com/fig-1.png", + description="A drone-themed company logo.", + ) + figure_payload = figure.to_skill_payload("financial.pdf") + + page_text = 'Summary paragraph.\n\n
\n\nClosing remarks.' + request_payload = { + "values": [ + { + "recordId": "doc-1", + "data": { + "consolidated_document": { + "file_name": "financial.pdf", + "storageUrl": "https://storage.example.com/content/financial.pdf", + "pages": [ + {"page_num": 0, "text": page_text, "figure_ids": ["fig-1"]}, + ], + "figures": [figure_payload], + }, + "enriched_descriptions": ["A drone-themed company logo."], + "enriched_urls": ["https://images.example.com/fig-1.png"], + "enriched_embeddings": [[0.51, 0.52, 0.53]], + }, + } + ] + } + + response = await text_processor.process_text_entry(build_request(request_payload)) + + assert response.status_code == 200 + body = json.loads(response.get_body().decode("utf-8")) + values = body["values"] + assert len(values) == 1 + result = values[0] + assert result["recordId"] == "doc-1" + + data = result["data"] + chunks = data["chunks"] + assert len(chunks) == 1 + chunk = chunks[0] + assert chunk["parent_id"] == "https://storage.example.com/content/financial.pdf" + assert chunk["sourcepage"] == "financial.pdf#page=1" + assert chunk["embedding"] == [0.41, 0.42, 0.43] + assert chunk["images"] == [ + { + "url": "https://images.example.com/fig-1.png", + "description": "A drone-themed company logo.", + "boundingbox": [5.0, 6.0, 7.0, 8.0], + } + ] + assert '
' not in chunk["content"] + assert "A drone-themed company logo." in chunk["content"] + assert chunk["id"].endswith("-0000") + + +@pytest.mark.asyncio +async def test_document_extractor_without_settings(monkeypatch: pytest.MonkeyPatch) -> None: + """Test document extractor returns error when settings not initialized.""" + monkeypatch.setattr(document_extractor, "settings", None) + + request_payload = { + "values": [ + { + "recordId": "record-1", + "data": { + "file_data": {"$type": "file", "data": base64.b64encode(b"pdf-bytes").decode("utf-8")}, + "file_name": "sample.pdf", + "contentType": "application/pdf", + }, + } + ] + } + + response = await document_extractor.extract_document(build_request(request_payload)) + + assert response.status_code == 500 + body = json.loads(response.get_body().decode("utf-8")) + assert body["error"] == "Settings not initialized" + + +def test_document_extractor_module_init_key_error( + monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +) -> None: + """Reload module without pytest env to trigger init warning path.""" + import importlib + from unittest import mock + + saved_env = os.environ.get("PYTEST_CURRENT_TEST") + monkeypatch.delenv("PYTEST_CURRENT_TEST", raising=False) + + caplog.set_level("WARNING") + + with mock.patch("azure.identity.aio.ManagedIdentityCredential", lambda *_, **__: object()), mock.patch( + "prepdocslib.servicesetup.build_file_processors", side_effect=KeyError("missing env") + ): + reloaded = importlib.reload(document_extractor) + + assert "Could not initialize settings at module load time" in caplog.text + + monkeypatch.setenv("PYTEST_CURRENT_TEST", "pytest") + + if saved_env is not None: + monkeypatch.setenv("PYTEST_CURRENT_TEST", saved_env) + + importlib.reload(reloaded) + reloaded.settings = None + + +@pytest.mark.asyncio +async def test_figure_processor_without_settings(monkeypatch: pytest.MonkeyPatch) -> None: + """Test figure processor returns error when settings not initialized.""" + 
monkeypatch.setattr(figure_processor, "settings", None) + + request_payload = { + "values": [ + { + "recordId": "img-1", + "data": { + "bytes_base64": base64.b64encode(TEST_PNG_BYTES).decode("utf-8"), + "filename": "figure1.png", + "figure_id": "fig-1", + "document_file_name": "sample.pdf", + "page_num": 1, + }, + } + ] + } + + response = await figure_processor.process_figure_request(build_request(request_payload)) + + assert response.status_code == 500 + body = json.loads(response.get_body().decode("utf-8")) + assert body["error"] == "Settings not initialized" + + +@pytest.mark.asyncio +async def test_text_processor_without_settings(monkeypatch: pytest.MonkeyPatch) -> None: + """Test text processor returns error when settings not initialized.""" + monkeypatch.setattr(text_processor, "settings", None) + + request_payload = { + "values": [ + { + "recordId": "doc-1", + "data": { + "consolidated_document": { + "file_name": "test.pdf", + "storageUrl": "https://storage.example.com/test.pdf", + "pages": [{"page_num": 0, "text": "Some text", "figure_ids": []}], + "figures": [], + }, + }, + } + ] + } + + response = await text_processor.process_text_entry(build_request(request_payload)) + + assert response.status_code == 500 + body = json.loads(response.get_body().decode("utf-8")) + assert body["error"] == "Settings not initialized" + + +@pytest.mark.asyncio +async def test_text_processor_invalid_json(monkeypatch: pytest.MonkeyPatch) -> None: + """Test text processor handles invalid JSON payload.""" + mock_settings = text_processor.GlobalSettings( + use_vectors=False, + use_multimodal=False, + embedding_dimensions=1536, + embedding_service=None, + file_processors={}, + ) + monkeypatch.setattr(text_processor, "settings", mock_settings) + + # Send invalid JSON + response = await text_processor.process_text_entry(build_raw_request(b"not json")) + + assert response.status_code == 400 + body = json.loads(response.get_body().decode("utf-8")) + assert body["error"] == "Request 
body must be valid JSON" + + +@pytest.mark.asyncio +async def test_text_processor_with_client_id(monkeypatch: pytest.MonkeyPatch) -> None: + """Test text processor uses ManagedIdentityCredential with client ID.""" + # Set the AZURE_CLIENT_ID environment variable + monkeypatch.setenv("AZURE_CLIENT_ID", "test-client-id") + text_processor.configure_global_settings() + # Verify it was configured (actual verification would check the credential type) + monkeypatch.delenv("AZURE_CLIENT_ID", raising=False) + text_processor.configure_global_settings() + + +@pytest.mark.asyncio +async def test_text_processor_embeddings_setup(monkeypatch: pytest.MonkeyPatch) -> None: + """configure_global_settings wires up embedding service when configuration is complete.""" + + monkeypatch.setenv("USE_VECTORS", "true") + monkeypatch.setenv("AZURE_OPENAI_SERVICE", "svc") + monkeypatch.setenv("AZURE_OPENAI_EMB_DEPLOYMENT", "deployment") + monkeypatch.setenv("AZURE_OPENAI_EMB_MODEL_NAME", "model") + monkeypatch.setenv("OPENAI_HOST", "azure") + + class StubCredential: + def __init__(self, *args, **kwargs) -> None: + pass + + monkeypatch.setattr(text_processor, "ManagedIdentityCredential", StubCredential) + monkeypatch.setattr(text_processor, "build_file_processors", lambda **kwargs: {".pdf": object()}) + + calls: dict[str, object] = {} + + def fake_setup_openai_client(**kwargs): + calls["openai_host"] = kwargs["openai_host"] + return object(), "https://svc.openai.azure.com" + + def fake_setup_embeddings_service(openai_host, openai_client, **kwargs): + calls["embedding"] = kwargs + return "embedding-service" + + monkeypatch.setattr(text_processor, "setup_openai_client", fake_setup_openai_client) + monkeypatch.setattr(text_processor, "setup_embeddings_service", fake_setup_embeddings_service) + + text_processor.settings = None + text_processor.configure_global_settings() + + assert calls["openai_host"] == text_processor.OpenAIHost.AZURE + assert calls["embedding"]["emb_model_name"] == "model" + 
assert text_processor.settings is not None + assert text_processor.settings.embedding_service == "embedding-service" + + text_processor.settings = None + + +def test_text_processor_configure_logs_when_embedding_config_missing( + monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +) -> None: + monkeypatch.setenv("USE_VECTORS", "true") + monkeypatch.setattr(text_processor, "ManagedIdentityCredential", lambda *args, **kwargs: object()) + monkeypatch.setattr(text_processor, "build_file_processors", lambda **kwargs: {".pdf": object()}) + + text_processor.settings = None + + with caplog.at_level(logging.WARNING): + text_processor.configure_global_settings() + + assert "embedding configuration incomplete" in caplog.text + assert text_processor.settings is not None + assert text_processor.settings.embedding_service is None + + text_processor.settings = None + + +@pytest.mark.asyncio +async def test_text_processor_no_sections(monkeypatch: pytest.MonkeyPatch) -> None: + """Test text processor handles empty sections.""" + mock_file_processors = { + ".pdf": FileProcessor(TextParser(), SentenceTextSplitter()), + } + mock_settings = text_processor.GlobalSettings( + use_vectors=False, + use_multimodal=False, + embedding_dimensions=1536, + embedding_service=None, + file_processors=mock_file_processors, + ) + monkeypatch.setattr(text_processor, "settings", mock_settings) + + # Mock process_text to return empty list + def mock_process_text(pages, file, splitter, category): + return [] + + monkeypatch.setattr(text_processor, "process_text", mock_process_text) + + request_payload = { + "values": [ + { + "recordId": "doc-1", + "data": { + "consolidated_document": { + "file_name": "test.pdf", + "storageUrl": "https://storage.example.com/test.pdf", + "pages": [{"page_num": 0, "text": "", "figure_ids": []}], + "figures": [], + }, + }, + } + ] + } + + response = await text_processor.process_text_entry(build_request(request_payload)) + + assert response.status_code == 200 + 
body = json.loads(response.get_body().decode("utf-8")) + values = body["values"] + assert len(values) == 1 + result = values[0] + assert result["data"]["chunks"] == [] + + +@pytest.mark.asyncio +async def test_text_processor_embeddings_not_initialized(monkeypatch: pytest.MonkeyPatch, caplog) -> None: + """Test text processor logs warning when embeddings requested but not initialized.""" + mock_file_processors = { + ".pdf": FileProcessor(TextParser(), SentenceTextSplitter()), + } + mock_settings = text_processor.GlobalSettings( + use_vectors=True, # Request embeddings + use_multimodal=False, + embedding_dimensions=1536, + embedding_service=None, # But no service + file_processors=mock_file_processors, + ) + monkeypatch.setattr(text_processor, "settings", mock_settings) + + # Mock process_text to return a section + def mock_process_text(pages, file, splitter, category): + chunk = ChunkStub(page_num=0, text="Some content", images=[]) + return [SectionStub(chunk=chunk)] + + monkeypatch.setattr(text_processor, "process_text", mock_process_text) + + request_payload = { + "values": [ + { + "recordId": "doc-1", + "data": { + "consolidated_document": { + "file_name": "test.pdf", + "storageUrl": "https://storage.example.com/test.pdf", + "pages": [{"page_num": 0, "text": "Some text", "figure_ids": []}], + "figures": [], + }, + }, + } + ] + } + + with caplog.at_level(logging.WARNING): + await text_processor.process_text_entry(build_request(request_payload)) + + assert "Embeddings requested but service not initialised" in caplog.text + + +@pytest.mark.asyncio +async def test_text_processor_empty_chunk_skipped(monkeypatch: pytest.MonkeyPatch) -> None: + """Test text processor skips empty chunks.""" + mock_file_processors = { + ".pdf": FileProcessor(TextParser(), SentenceTextSplitter()), + } + mock_settings = text_processor.GlobalSettings( + use_vectors=False, + use_multimodal=False, + embedding_dimensions=1536, + embedding_service=None, + file_processors=mock_file_processors, + 
) + monkeypatch.setattr(text_processor, "settings", mock_settings) + + # Mock process_text to return chunks with empty content + def mock_process_text(pages, file, splitter, category): + chunk1 = ChunkStub(page_num=0, text=" ", images=[]) # Whitespace only + chunk2 = ChunkStub(page_num=0, text="Valid content", images=[]) + return [SectionStub(chunk=chunk1), SectionStub(chunk=chunk2)] + + monkeypatch.setattr(text_processor, "process_text", mock_process_text) + + request_payload = { + "values": [ + { + "recordId": "doc-1", + "data": { + "consolidated_document": { + "file_name": "test.pdf", + "storageUrl": "https://storage.example.com/test.pdf", + "pages": [{"page_num": 0, "text": "Some text", "figure_ids": []}], + "figures": [], + }, + }, + } + ] + } + + response = await text_processor.process_text_entry(build_request(request_payload)) + + assert response.status_code == 200 + body = json.loads(response.get_body().decode("utf-8")) + values = body["values"] + assert len(values) == 1 + result = values[0] + # Only one chunk should be returned (the empty one is skipped) + assert len(result["data"]["chunks"]) == 1 + + +@pytest.mark.asyncio +async def test_text_processor_with_multimodal_embeddings(monkeypatch: pytest.MonkeyPatch) -> None: + """Test text processor includes image embeddings when use_multimodal is true.""" + mock_file_processors = { + ".pdf": FileProcessor(TextParser(), SentenceTextSplitter()), + } + mock_settings = text_processor.GlobalSettings( + use_vectors=False, + use_multimodal=True, + embedding_dimensions=1536, + embedding_service=None, + file_processors=mock_file_processors, + ) + monkeypatch.setattr(text_processor, "settings", mock_settings) + + # Mock process_text to return a section with an image that has embedding + figure = figure_processor.ImageOnPage( + bytes=TEST_PNG_BYTES, + bbox=(5.0, 6.0, 7.0, 8.0), + filename="figure1.png", + figure_id="fig-1", + page_num=0, + placeholder='
', + title="Test Figure", + description="A test image", + embedding=[0.1, 0.2, 0.3], + ) + + def mock_process_text(pages, file, splitter, category): + chunk = ChunkStub(page_num=0, text="Some content", images=[figure]) + return [SectionStub(chunk=chunk)] + + monkeypatch.setattr(text_processor, "process_text", mock_process_text) + + request_payload = { + "values": [ + { + "recordId": "doc-1", + "data": { + "consolidated_document": { + "file_name": "test.pdf", + "storageUrl": "https://storage.example.com/test.pdf", + "pages": [{"page_num": 0, "text": "Some text", "figure_ids": []}], + "figures": [], + }, + }, + } + ] + } + + response = await text_processor.process_text_entry(build_request(request_payload)) + + assert response.status_code == 200 + body = json.loads(response.get_body().decode("utf-8")) + values = body["values"] + assert len(values) == 1 + result = values[0] + chunks = result["data"]["chunks"] + assert len(chunks) == 1 + assert chunks[0]["images"][0]["embedding"] == [0.1, 0.2, 0.3] + + +@pytest.mark.asyncio +async def test_text_processor_embedding_dimension_mismatch(monkeypatch: pytest.MonkeyPatch, caplog) -> None: + """Test text processor logs warning when embedding dimensions don't match.""" + mock_embedding_service = type("MockEmbeddingService", (), {})() + + async def mock_create_embeddings(texts): + return [[0.1, 0.2]] # Only 2 dimensions instead of expected 1536 + + mock_embedding_service.create_embeddings = mock_create_embeddings + + mock_file_processors = { + ".pdf": FileProcessor(TextParser(), SentenceTextSplitter()), + } + mock_settings = text_processor.GlobalSettings( + use_vectors=True, + use_multimodal=False, + embedding_dimensions=1536, # Expecting 1536 dimensions + embedding_service=mock_embedding_service, + file_processors=mock_file_processors, + ) + monkeypatch.setattr(text_processor, "settings", mock_settings) + + # Mock process_text to return a section + def mock_process_text(pages, file, splitter, category): + chunk = 
ChunkStub(page_num=0, text="Some content", images=[]) + return [SectionStub(chunk=chunk)] + + monkeypatch.setattr(text_processor, "process_text", mock_process_text) + + request_payload = { + "values": [ + { + "recordId": "doc-1", + "data": { + "consolidated_document": { + "file_name": "test.pdf", + "storageUrl": "https://storage.example.com/test.pdf", + "pages": [{"page_num": 0, "text": "Some text", "figure_ids": []}], + "figures": [], + }, + }, + } + ] + } + + with caplog.at_level(logging.WARNING): + await text_processor.process_text_entry(build_request(request_payload)) + + assert "dimension mismatch" in caplog.text + + +@pytest.mark.asyncio +async def test_text_processor_embeddings_missing_warning(monkeypatch: pytest.MonkeyPatch, caplog) -> None: + """Test text processor logs warning when embeddings are requested but missing.""" + mock_embedding_service = type("MockEmbeddingService", (), {})() + + async def mock_create_embeddings(texts): + # Return None to simulate embeddings service returning None + return None + + mock_embedding_service.create_embeddings = mock_create_embeddings + + mock_file_processors = { + ".pdf": FileProcessor(TextParser(), SentenceTextSplitter()), + } + mock_settings = text_processor.GlobalSettings( + use_vectors=True, + use_multimodal=False, + embedding_dimensions=1536, + embedding_service=mock_embedding_service, + file_processors=mock_file_processors, + ) + monkeypatch.setattr(text_processor, "settings", mock_settings) + + # Mock process_text to return a section + def mock_process_text(pages, file, splitter, category): + chunk = ChunkStub(page_num=0, text="Content 1", images=[]) + return [SectionStub(chunk=chunk)] + + monkeypatch.setattr(text_processor, "process_text", mock_process_text) + + request_payload = { + "values": [ + { + "recordId": "doc-1", + "data": { + "consolidated_document": { + "file_name": "test.pdf", + "storageUrl": "https://storage.example.com/test.pdf", + "pages": [{"page_num": 0, "text": "Some text", "figure_ids": 
[]}], + "figures": [], + }, + }, + } + ] + } + + with caplog.at_level(logging.WARNING): + response = await text_processor.process_text_entry(build_request(request_payload)) + + assert response.status_code == 200 + assert "were requested but missing" in caplog.text + + +@pytest.mark.asyncio +async def test_text_processor_process_document_handles_missing_figures( + monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +) -> None: + stub_processor = FileProcessor(TextParser(), SentenceTextSplitter()) + + monkeypatch.setattr(text_processor, "select_processor_for_filename", lambda *_args, **_kwargs: stub_processor) + monkeypatch.setattr( + text_processor, + "process_text", + lambda *args, **kwargs: [SectionStub(chunk=ChunkStub(page_num=0, text="Chunk", images=[]))], + ) + + text_processor.settings = text_processor.GlobalSettings( + use_vectors=False, + use_multimodal=False, + embedding_dimensions=1536, + file_processors={".pdf": stub_processor}, + embedding_service=None, + ) + + payload = { + "consolidated_document": { + "file_name": "sample.pdf", + "pages": [ + { + "page_num": 0, + "text": "Hello", + "figure_ids": ["missing", "bad"], + } + ], + "figures": [ + { + "figure_id": "bad", + # Missing filename forces ImageOnPage.from_skill_payload to raise AssertionError + } + ], + } + } + + with caplog.at_level(logging.WARNING): + chunks = await text_processor.process_document(payload) + + assert chunks + assert any("not found in figures metadata" in record.message for record in caplog.records) + assert any("Failed to deserialize figure" in record.message for record in caplog.records) + + text_processor.settings = None + + +@pytest.mark.asyncio +async def test_text_processor_process_document_returns_empty_when_no_pages(monkeypatch: pytest.MonkeyPatch) -> None: + text_processor.settings = text_processor.GlobalSettings( + use_vectors=False, + use_multimodal=False, + embedding_dimensions=1536, + file_processors={}, + embedding_service=None, + ) + + result = await 
text_processor.process_document({"consolidated_document": {"file_name": "empty.pdf", "pages": []}}) + + assert result == [] + + text_processor.settings = None + + +def test_text_processor_module_init_logs_warning( + monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +) -> None: + import importlib + from unittest import mock + + saved_env = os.environ.get("PYTEST_CURRENT_TEST") + monkeypatch.delenv("PYTEST_CURRENT_TEST", raising=False) + + class StubCredential: + def __init__(self, *args, **kwargs) -> None: + pass + + caplog.set_level("WARNING") + + with mock.patch("azure.identity.aio.ManagedIdentityCredential", StubCredential), mock.patch( + "prepdocslib.servicesetup.build_file_processors", side_effect=KeyError("missing env") + ), mock.patch("prepdocslib.servicesetup.setup_openai_client", return_value=(object(), None)), mock.patch( + "prepdocslib.servicesetup.setup_embeddings_service", return_value=None + ): + reloaded = importlib.reload(text_processor) + + assert "Could not initialize settings at module load time" in caplog.text + + monkeypatch.setenv("PYTEST_CURRENT_TEST", "pytest") + + if saved_env is not None: + monkeypatch.setenv("PYTEST_CURRENT_TEST", saved_env) + + importlib.reload(reloaded) + reloaded.settings = None diff --git a/tests/test_mediadescriber.py b/tests/test_mediadescriber.py index 2f767f712e..6822e28468 100644 --- a/tests/test_mediadescriber.py +++ b/tests/test_mediadescriber.py @@ -68,7 +68,7 @@ def mock_get(self, url, **kwargs): "startPageNumber": 1, "endPageNumber": 1, "unit": "pixel", - "pages": [{"pageNumber": 1}], + "pages": [{"pageNumber": 0}], } ], }, diff --git a/tests/test_pdfparser.py b/tests/test_pdfparser.py index e22c4d9e7b..23cd2dcabf 100644 --- a/tests/test_pdfparser.py +++ b/tests/test_pdfparser.py @@ -18,21 +18,37 @@ DocumentTable, DocumentTableCell, ) +from azure.core.credentials import AzureKeyCredential from azure.core.exceptions import HttpResponseError from PIL import Image, ImageChops -from 
prepdocslib.mediadescriber import ( - ContentUnderstandingDescriber, - MultimodalModelDescriber, +from prepdocslib.figureprocessor import ( + FigureProcessor, + MediaDescriptionStrategy, + build_figure_markup, + process_page_image, ) from prepdocslib.page import ImageOnPage -from prepdocslib.pdfparser import DocumentAnalysisParser, MediaDescriptionStrategy +from prepdocslib.pdfparser import DocumentAnalysisParser from .mocks import MockAzureCredential TEST_DATA_DIR = pathlib.Path(__file__).parent / "test-data" +@pytest.fixture +def sample_image(): + """Fixture for a sample ImageOnPage object used across multiple tests.""" + return ImageOnPage( + bytes=b"fake", + bbox=(0, 0, 100, 100), + page_num=1, + figure_id="fig_1", + placeholder='
', + filename="test.png", + ) + + def assert_image_equal(image1, image2): assert image1.size == image2.size assert image1.mode == image2.mode @@ -114,15 +130,16 @@ def test_table_to_html_with_spans(): @pytest.mark.asyncio async def test_process_figure_without_bounding_regions(): - doc = MagicMock() figure = DocumentFigure(id="1", caption=None, bounding_regions=None) - media_describer = MagicMock() - - result = await DocumentAnalysisParser.process_figure(doc, figure, media_describer) - expected_html = "
1
" + result = await DocumentAnalysisParser.figure_to_image(None, figure) assert isinstance(result, ImageOnPage) - assert result.description == expected_html + assert result.description is None + assert result.title == "" + assert result.figure_id == "1" + assert result.page_num == 0 + assert result.bbox == (0, 0, 0, 0) + assert result.filename == "figure1.png" @pytest.mark.asyncio @@ -136,13 +153,6 @@ async def test_process_figure_with_bounding_regions(monkeypatch, caplog): BoundingRegion(page_number=2, polygon=[1.4703, 2.8371, 5.5409, 2.8415, 5.5381, 6.6022, 1.4681, 6.5978]), ], ) - media_describer = AsyncMock() - - async def mock_describe_image(image_bytes): - assert image_bytes == b"image_bytes" - return "Described Image" - - monkeypatch.setattr(media_describer, "describe_image", mock_describe_image) def mock_crop_image_from_pdf_page(doc, page_number, bounding_box): assert page_number == 0 @@ -152,11 +162,11 @@ def mock_crop_image_from_pdf_page(doc, page_number, bounding_box): monkeypatch.setattr(DocumentAnalysisParser, "crop_image_from_pdf_page", mock_crop_image_from_pdf_page) with caplog.at_level(logging.WARNING): - result = await DocumentAnalysisParser.process_figure(doc, figure, media_describer) - expected_html = "
1 Logo
Described Image
" + result = await DocumentAnalysisParser.figure_to_image(doc, figure) assert isinstance(result, ImageOnPage) - assert result.description == expected_html + assert result.description is None + assert result.title == "Logo" assert result.bytes == b"image_bytes" assert result.page_num == 0 assert result.figure_id == "1" @@ -186,7 +196,6 @@ async def mock_poller_result(): parser = DocumentAnalysisParser( endpoint="https://example.com", credential=MockAzureCredential(), - media_description_strategy=MediaDescriptionStrategy.NONE, ) content = io.BytesIO(b"pdf content bytes") content.name = "test.pdf" @@ -259,7 +268,6 @@ async def mock_poller_result(): parser = DocumentAnalysisParser( endpoint="https://example.com", credential=MockAzureCredential(), - media_description_strategy=MediaDescriptionStrategy.NONE, ) with open(TEST_DATA_DIR / "Simple Table.pdf", "rb") as f: content = io.BytesIO(f.read()) @@ -304,16 +312,8 @@ async def mock_poller_result(): monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document) monkeypatch.setattr(mock_poller, "result", mock_poller_result) - async def mock_describe_image(self, image_bytes): - return "Pie chart" - - monkeypatch.setattr(ContentUnderstandingDescriber, "describe_image", mock_describe_image) - parser = DocumentAnalysisParser( - endpoint="https://example.com", - credential=MockAzureCredential(), - media_description_strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING, - content_understanding_endpoint="https://example.com", + endpoint="https://example.com", credential=MockAzureCredential(), process_figures=True ) with open(TEST_DATA_DIR / "Simple Figure.pdf", "rb") as f: @@ -327,8 +327,9 @@ async def mock_describe_image(self, image_bytes): assert pages[0].offset == 0 assert ( pages[0].text - == "# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n
1.1 Figure 1
Pie chart
\n\n\nThis is text after the figure that's not part of it." + == '# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n
\n\n\nThis is text after the figure that\'s not part of it.' ) + assert pages[0].images[0].placeholder == '
' @pytest.mark.asyncio @@ -374,16 +375,13 @@ async def mock_poller_result(): monkeypatch.setattr(mock_poller, "result", mock_poller_result) parser = DocumentAnalysisParser( - endpoint="https://example.com", - credential=MockAzureCredential(), - media_description_strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING, - content_understanding_endpoint="https://example.com", + endpoint="https://example.com", credential=MockAzureCredential(), process_figures=True ) content = io.BytesIO(b"pdf content bytes") content.name = "test.docx" with caplog.at_level(logging.ERROR): pages = [page async for page in parser.parse(content)] - assert "This document type does not support media description." in caplog.text + assert "does not support media description." in caplog.text assert len(pages) == 1 assert pages[0].page_num == 0 @@ -392,75 +390,292 @@ async def mock_poller_result(): @pytest.mark.asyncio -async def test_parse_doc_with_openai(monkeypatch): - mock_poller = MagicMock() +async def test_figure_processor_openai_requires_client(): + figure_processor = FigureProcessor(strategy=MediaDescriptionStrategy.OPENAI) - async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs): - return mock_poller + with pytest.raises(ValueError, match="requires both a client and a model name"): + await figure_processor.describe(b"bytes") - async def mock_poller_result(): - content = open(TEST_DATA_DIR / "Simple Figure_content.txt").read() - return AnalyzeResult( - content=content, - pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=148)])], - figures=[ - DocumentFigure( - id="1.1", - caption=DocumentCaption(content="Figure 1"), - bounding_regions=[ - BoundingRegion( - page_number=1, polygon=[0.4295, 1.3072, 1.7071, 1.3076, 1.7067, 2.6088, 0.4291, 2.6085] - ) - ], - spans=[DocumentSpan(offset=70, length=22)], - ) - ], - ) - monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document) - 
monkeypatch.setattr(mock_poller, "result", mock_poller_result) +@pytest.mark.asyncio +async def test_figure_processor_openai_describe(monkeypatch): + figure_processor = FigureProcessor( + strategy=MediaDescriptionStrategy.OPENAI, + openai_client=Mock(), + openai_model="gpt-4o", + openai_deployment="gpt-4o", + ) - async def mock_describe_image(self, image_bytes): - return "Pie chart" + describer = AsyncMock() + describer.describe_image.return_value = "Pie chart" - monkeypatch.setattr(MultimodalModelDescriber, "describe_image", mock_describe_image) + async def fake_get_media_describer(self): + return describer - parser = DocumentAnalysisParser( - endpoint="https://example.com", + monkeypatch.setattr(FigureProcessor, "get_media_describer", fake_get_media_describer) + + result = await figure_processor.describe(b"bytes") + + assert result == "Pie chart" + describer.describe_image.assert_awaited_once() + + +@pytest.mark.asyncio +async def test_figure_processor_content_understanding_initializes_once(monkeypatch): + figure_processor = FigureProcessor( + strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING, credential=MockAzureCredential(), - media_description_strategy=MediaDescriptionStrategy.OPENAI, - openai_client=Mock(), + content_understanding_endpoint="https://example.com", + ) + + class FakeDescriber: + def __init__(self, endpoint, credential): + self.endpoint = endpoint + self.credential = credential + self.create_analyzer = AsyncMock() + self.describe_image = AsyncMock(return_value="A diagram") + + monkeypatch.setattr("prepdocslib.figureprocessor.ContentUnderstandingDescriber", FakeDescriber) + + result_first = await figure_processor.describe(b"image") + assert result_first == "A diagram" + describer_instance = figure_processor.media_describer # type: ignore[attr-defined] + assert isinstance(describer_instance, FakeDescriber) + describer_instance.create_analyzer.assert_awaited_once() + + result_second = await figure_processor.describe(b"image") + assert 
result_second == "A diagram" + assert describer_instance.create_analyzer.await_count == 1 + + +@pytest.mark.asyncio +async def test_figure_processor_none_strategy_returns_none(): + figure_processor = FigureProcessor(strategy=MediaDescriptionStrategy.NONE) + + describer = await figure_processor.get_media_describer() + assert describer is None + + result = await figure_processor.describe(b"bytes") + assert result is None + + +@pytest.mark.asyncio +async def test_figure_processor_content_understanding_missing_endpoint(): + figure_processor = FigureProcessor( + strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING, + credential=MockAzureCredential(), + ) + + with pytest.raises(ValueError, match="Content Understanding strategy requires an endpoint"): + await figure_processor.get_media_describer() + + +@pytest.mark.asyncio +async def test_figure_processor_content_understanding_missing_credential(): + figure_processor = FigureProcessor( + strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING, + content_understanding_endpoint="https://example.com", + ) + + with pytest.raises(ValueError, match="Content Understanding strategy requires a credential"): + await figure_processor.get_media_describer() + + +@pytest.mark.asyncio +async def test_figure_processor_content_understanding_key_credential(): + figure_processor = FigureProcessor( + strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING, + credential=AzureKeyCredential("fake_key"), + content_understanding_endpoint="https://example.com", + ) + + with pytest.raises(ValueError, match="Content Understanding does not support key credentials"): + await figure_processor.get_media_describer() + + +@pytest.mark.asyncio +async def test_figure_processor_openai_returns_describer(monkeypatch): + mock_client = Mock() + figure_processor = FigureProcessor( + strategy=MediaDescriptionStrategy.OPENAI, + openai_client=mock_client, openai_model="gpt-4o", - openai_deployment="gpt-4o", + openai_deployment="gpt-4o-deployment", ) - with 
open(TEST_DATA_DIR / "Simple Figure.pdf", "rb") as f: - content = io.BytesIO(f.read()) - content.name = "Simple Figure.pdf" + describer = await figure_processor.get_media_describer() + assert describer is not None + assert figure_processor.media_describer is describer - pages = [page async for page in parser.parse(content)] + # Second call should return the same instance + describer2 = await figure_processor.get_media_describer() + assert describer2 is describer - assert len(pages) == 1 - assert pages[0].page_num == 0 - assert pages[0].offset == 0 - assert ( - pages[0].text - == "# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n
1.1 Figure 1
Pie chart
\n\n\nThis is text after the figure that's not part of it." + +@pytest.mark.asyncio +async def test_figure_processor_unknown_strategy(caplog): + # Create a processor with an invalid strategy by patching the enum + figure_processor = FigureProcessor(strategy=MediaDescriptionStrategy.NONE) + # Override the strategy to an unknown value + figure_processor.strategy = "unknown_strategy" # type: ignore[assignment] + + with caplog.at_level(logging.WARNING): + describer = await figure_processor.get_media_describer() + + assert describer is None + assert "Unknown media description strategy" in caplog.text + + +@pytest.mark.asyncio +async def test_figure_processor_mark_content_understanding_ready(): + figure_processor = FigureProcessor(strategy=MediaDescriptionStrategy.NONE) + + assert not figure_processor.content_understanding_ready + figure_processor.mark_content_understanding_ready() + assert figure_processor.content_understanding_ready + + +@pytest.mark.asyncio +async def test_build_figure_markup_without_description(sample_image): + sample_image.title = "Sample Figure" + + result = build_figure_markup(sample_image, description=None) + assert result == "
fig_1 Sample Figure
" + + +@pytest.mark.asyncio +async def test_process_page_image_without_blob_manager(sample_image): + with pytest.raises(ValueError, match="BlobManager must be provided"): + await process_page_image( + image=sample_image, + document_filename="test.pdf", + blob_manager=None, + image_embeddings_client=None, + ) + + +@pytest.mark.asyncio +async def test_process_page_image_without_figure_processor(sample_image): + + blob_manager = AsyncMock() + blob_manager.upload_document_image = AsyncMock(return_value="https://example.com/image.png") + + result = await process_page_image( + image=sample_image, + document_filename="test.pdf", + blob_manager=blob_manager, + image_embeddings_client=None, + figure_processor=None, ) + assert result.description is None + assert result.url == "https://example.com/image.png" + blob_manager.upload_document_image.assert_awaited_once() + @pytest.mark.asyncio -async def test_parse_doc_with_openai_missing_parameters(): - parser = DocumentAnalysisParser( - endpoint="https://example.com", - credential=MockAzureCredential(), - media_description_strategy=MediaDescriptionStrategy.OPENAI, - # Intentionally not providing openai_client and openai_model +async def test_process_page_image_sets_description(sample_image): + + blob_manager = AsyncMock() + blob_manager.upload_document_image = AsyncMock(return_value="https://example.com/image.png") + + figure_processor = AsyncMock() + figure_processor.describe = AsyncMock(return_value="A bar chart") + + result = await process_page_image( + image=sample_image, + document_filename="test.pdf", + blob_manager=blob_manager, + image_embeddings_client=None, + figure_processor=figure_processor, ) - content = io.BytesIO(b"pdf content bytes") - content.name = "test.pdf" + assert result.description == "A bar chart" + figure_processor.describe.assert_awaited_once() + + +@pytest.mark.asyncio +async def test_process_page_image_skips_upload_if_url_exists(sample_image): + + sample_image.url = "https://existing.com/image.png" + 
+ blob_manager = AsyncMock() + blob_manager.upload_document_image = AsyncMock() + + result = await process_page_image( + image=sample_image, + document_filename="test.pdf", + blob_manager=blob_manager, + image_embeddings_client=None, + ) + + assert result.url == "https://existing.com/image.png" + blob_manager.upload_document_image.assert_not_awaited() + + +@pytest.mark.asyncio +async def test_process_page_image_with_embeddings(sample_image): + + blob_manager = AsyncMock() + blob_manager.upload_document_image = AsyncMock(return_value="https://example.com/image.png") + + image_embeddings = AsyncMock() + image_embeddings.create_embedding_for_image = AsyncMock(return_value=[0.1, 0.2, 0.3]) + + result = await process_page_image( + image=sample_image, + document_filename="test.pdf", + blob_manager=blob_manager, + image_embeddings_client=image_embeddings, + ) + + assert result.embedding == [0.1, 0.2, 0.3] + image_embeddings.create_embedding_for_image.assert_awaited_once() + + +def test_image_on_page_from_skill_payload_without_bytes(): + """Test ImageOnPage.from_skill_payload when bytes_base64 is not provided.""" + payload = { + "filename": "test.png", + "figure_id": "fig_1", + "page_num": "1", + "bbox": [0, 0, 100, 100], + "document_file_name": "test.pdf", + } + + image, doc_filename = ImageOnPage.from_skill_payload(payload) + + assert image.bytes == b"" + assert image.filename == "test.png" + assert image.figure_id == "fig_1" + assert image.page_num == 1 + assert image.bbox == (0, 0, 100, 100) + assert doc_filename == "test.pdf" + + +def test_image_on_page_from_skill_payload_invalid_page_num(): + """Test ImageOnPage.from_skill_payload with invalid page_num.""" + payload = { + "filename": "test.png", + "figure_id": "fig_1", + "page_num": "invalid", + "bbox": [0, 0, 100, 100], + } + + image, _ = ImageOnPage.from_skill_payload(payload) + + assert image.page_num == 0 + + +def test_image_on_page_from_skill_payload_invalid_bbox(): + """Test ImageOnPage.from_skill_payload with 
invalid bbox.""" + payload = { + "filename": "test.png", + "figure_id": "fig_1", + "page_num": 1, + "bbox": [0, 0, 100], # Only 3 elements + } + + image, _ = ImageOnPage.from_skill_payload(payload) - with pytest.raises(ValueError, match="OpenAI client must be provided when using OpenAI media description strategy"): - # Call the first iteration of the generator without using async for - await parser.parse(content).__anext__() + assert image.bbox == (0, 0, 0, 0) diff --git a/tests/test_prepdocs.py b/tests/test_prepdocs.py index 795d18c2b8..6420959487 100644 --- a/tests/test_prepdocs.py +++ b/tests/test_prepdocs.py @@ -192,46 +192,6 @@ async def test_image_embeddings_success(mock_azurehttp_calls): mock_token_provider.assert_called_once() -def test_setup_blob_manager_respects_storage_key(monkeypatch: pytest.MonkeyPatch) -> None: - captured: dict[str, object] = {} - - class StubBlobManager: - def __init__( - self, - *, - endpoint: str, - container: str, - account: str, - credential: object, - resource_group: str, - subscription_id: str, - image_container: str | None = None, - ) -> None: - captured["endpoint"] = endpoint - captured["container"] = container - captured["account"] = account - captured["credential"] = credential - captured["resource_group"] = resource_group - captured["subscription_id"] = subscription_id - captured["image_container"] = image_container - - monkeypatch.setattr(prepdocs, "BlobManager", StubBlobManager) - - result = prepdocs.setup_blob_manager( - azure_credential=MockAzureCredential(), - storage_account="storageacct", - storage_container="docs", - storage_resource_group="rg", - subscription_id="sub-id", - storage_key="override-key", - image_storage_container="images", - ) - - assert isinstance(result, StubBlobManager) - assert captured["credential"] == "override-key" - assert captured["image_container"] == "images" - - def test_setup_list_file_strategy_uses_datalake_key(monkeypatch: pytest.MonkeyPatch) -> None: captured: dict[str, object] = {} 
@@ -268,72 +228,6 @@ def __init__( assert captured["enable_global_documents"] is True -def test_setup_embeddings_service_populates_azure_metadata() -> None: - embeddings = prepdocs.setup_embeddings_service( - open_ai_client=MockClient( - MockEmbeddingsClient( - openai.types.CreateEmbeddingResponse( - object="list", - data=[], - model="text-embedding-3-large", - usage=Usage(prompt_tokens=0, total_tokens=0), - ) - ) - ), - openai_host=prepdocs.OpenAIHost.AZURE, - emb_model_name=MOCK_EMBEDDING_MODEL_NAME, - emb_model_dimensions=MOCK_EMBEDDING_DIMENSIONS, - azure_openai_deployment="deployment", - azure_openai_endpoint="https://service.openai.azure.com", - ) - - assert isinstance(embeddings, OpenAIEmbeddings) - assert embeddings.azure_deployment_name == "deployment" - assert embeddings.azure_endpoint == "https://service.openai.azure.com" - - -def test_setup_embeddings_service_requires_endpoint_for_azure() -> None: - with pytest.raises(ValueError): - prepdocs.setup_embeddings_service( - open_ai_client=MockClient( - MockEmbeddingsClient( - openai.types.CreateEmbeddingResponse( - object="list", - data=[], - model="text-embedding-3-large", - usage=Usage(prompt_tokens=0, total_tokens=0), - ) - ) - ), - openai_host=prepdocs.OpenAIHost.AZURE, - emb_model_name=MOCK_EMBEDDING_MODEL_NAME, - emb_model_dimensions=MOCK_EMBEDDING_DIMENSIONS, - azure_openai_deployment="deployment", - azure_openai_endpoint=None, - ) - - -def test_setup_embeddings_service_requires_deployment_for_azure() -> None: - with pytest.raises(ValueError): - prepdocs.setup_embeddings_service( - open_ai_client=MockClient( - MockEmbeddingsClient( - openai.types.CreateEmbeddingResponse( - object="list", - data=[], - model="text-embedding-3-large", - usage=Usage(prompt_tokens=0, total_tokens=0), - ) - ) - ), - openai_host=prepdocs.OpenAIHost.AZURE, - emb_model_name=MOCK_EMBEDDING_MODEL_NAME, - emb_model_dimensions=MOCK_EMBEDDING_DIMENSIONS, - azure_openai_deployment=None, - 
azure_openai_endpoint="https://service.openai.azure.com", - ) - - @pytest.mark.asyncio async def test_openai_embeddings_use_deployment_for_azure_model(): class RecordingEmbeddingsClient: @@ -424,99 +318,3 @@ async def run(self) -> None: assert captured["credentials"].key == "secret" assert captured["service_name"] == "searchsvc" assert captured["index_name"] == "searchindex" - - -def test_setup_openai_client_azure_constructs_endpoint_correctly(monkeypatch: pytest.MonkeyPatch) -> None: - """Test that setup_openai_client correctly constructs the Azure OpenAI endpoint URL from service name.""" - captured_base_url: list[str] = [] - - class StubAsyncOpenAI: - def __init__(self, *, base_url: str, api_key, **kwargs) -> None: - captured_base_url.append(base_url) - - monkeypatch.setattr(prepdocs, "AsyncOpenAI", StubAsyncOpenAI) - monkeypatch.setattr(prepdocs, "get_bearer_token_provider", lambda *args, **kwargs: lambda: "fake_token") - - client, endpoint = prepdocs.setup_openai_client( - openai_host=prepdocs.OpenAIHost.AZURE, - azure_credential=MockAzureCredential(), - azure_openai_service="myopenaiservice", - ) - - # Verify the endpoint is constructed correctly - assert endpoint == "https://myopenaiservice.openai.azure.com" - # Verify the base_url includes the endpoint with the openai/v1 suffix - assert captured_base_url[0] == "https://myopenaiservice.openai.azure.com/openai/v1" - - -def test_setup_openai_client_azure_custom_uses_custom_url(monkeypatch: pytest.MonkeyPatch) -> None: - """Test that setup_openai_client uses the custom URL for azure_custom host.""" - captured_base_url: list[str] = [] - - class StubAsyncOpenAI: - def __init__(self, *, base_url: str, api_key, **kwargs) -> None: - captured_base_url.append(base_url) - - monkeypatch.setattr(prepdocs, "AsyncOpenAI", StubAsyncOpenAI) - - client, endpoint = prepdocs.setup_openai_client( - openai_host=prepdocs.OpenAIHost.AZURE_CUSTOM, - azure_credential=MockAzureCredential(), - 
azure_openai_custom_url="https://custom.endpoint.com/openai", - azure_openai_api_key="test-key", - ) - - # Verify the custom URL is used - assert captured_base_url[0] == "https://custom.endpoint.com/openai" - # Verify endpoint is None for custom URLs - assert endpoint is None - - -def test_setup_openai_client_azure_respects_api_key(monkeypatch: pytest.MonkeyPatch) -> None: - """Test that setup_openai_client uses the API key override when provided.""" - captured_api_key: list[str] = [] - - class StubAsyncOpenAI: - def __init__(self, *, base_url: str, api_key: str, **kwargs) -> None: - captured_api_key.append(api_key) - - monkeypatch.setattr(prepdocs, "AsyncOpenAI", StubAsyncOpenAI) - - client, endpoint = prepdocs.setup_openai_client( - openai_host=prepdocs.OpenAIHost.AZURE, - azure_credential=MockAzureCredential(), - azure_openai_service="myopenaiservice", - azure_openai_api_key="my-api-key-override", - ) - - assert captured_api_key[0] == "my-api-key-override" - - -def test_setup_openai_client_openai_requires_api_key() -> None: - """Test that setup_openai_client raises ValueError when using OpenAI without API key.""" - with pytest.raises(ValueError, match="OpenAI key is required"): - prepdocs.setup_openai_client( - openai_host=prepdocs.OpenAIHost.OPENAI, - azure_credential=MockAzureCredential(), - openai_api_key=None, - ) - - -def test_setup_openai_client_azure_requires_service() -> None: - """Test that setup_openai_client raises ValueError when using Azure without service name.""" - with pytest.raises(ValueError, match="AZURE_OPENAI_SERVICE must be set"): - prepdocs.setup_openai_client( - openai_host=prepdocs.OpenAIHost.AZURE, - azure_credential=MockAzureCredential(), - azure_openai_service=None, - ) - - -def test_setup_openai_client_azure_custom_requires_url() -> None: - """Test that setup_openai_client raises ValueError when using azure_custom without custom URL.""" - with pytest.raises(ValueError, match="AZURE_OPENAI_CUSTOM_URL must be set"): - 
prepdocs.setup_openai_client( - openai_host=prepdocs.OpenAIHost.AZURE_CUSTOM, - azure_credential=MockAzureCredential(), - azure_openai_custom_url=None, - ) diff --git a/tests/test_prepdocslib_filestrategy.py b/tests/test_prepdocslib_filestrategy.py index 882832e739..556d074601 100644 --- a/tests/test_prepdocslib_filestrategy.py +++ b/tests/test_prepdocslib_filestrategy.py @@ -1,14 +1,19 @@ import os +from io import BytesIO import pytest from azure.search.documents.aio import SearchClient from prepdocslib.blobmanager import BlobManager +from prepdocslib.figureprocessor import FigureProcessor, MediaDescriptionStrategy from prepdocslib.fileprocessor import FileProcessor -from prepdocslib.filestrategy import FileStrategy +from prepdocslib.filestrategy import FileStrategy, parse_file from prepdocslib.listfilestrategy import ( ADLSGen2ListFileStrategy, + File, + LocalListFileStrategy, ) +from prepdocslib.page import ImageOnPage, Page from prepdocslib.strategy import SearchInfo from prepdocslib.textparser import TextParser from prepdocslib.textsplitter import SimpleTextSplitter @@ -100,3 +105,140 @@ async def mock_upload_documents(self, documents): "storageUrl": "https://test.blob.core.windows.net/c.txt", }, ] + + +@pytest.mark.asyncio +async def test_parse_file_with_images(monkeypatch): + """Test that parse_file processes images and logs appropriately.""" + + # Create a mock file + mock_file = File(content=BytesIO(b"test content")) + mock_file.filename = lambda: "test.txt" + + # Create a mock processor + mock_parser = type("MockParser", (), {})() + + async def mock_parse(content): + # Create a page with an image + image = ImageOnPage( + bytes=b"fake_image", + bbox=(0, 0, 100, 100), + page_num=1, + figure_id="fig_1", + filename="test_image.png", + placeholder='
', + ) + page = Page(page_num=1, text="Some text", offset=0) + page.images = [image] + yield page + + mock_parser.parse = mock_parse + + mock_splitter = type("MockSplitter", (), {})() + mock_processor = type("MockProcessor", (), {"parser": mock_parser, "splitter": mock_splitter})() + + # Create mock blob manager + mock_blob_manager = type("MockBlobManager", (), {})() + + async def mock_upload(*args, **kwargs): + return "https://example.com/image.png" + + mock_blob_manager.upload_document_image = mock_upload + + # Create mock figure processor + mock_figure_processor = type("MockFigureProcessor", (), {})() + + async def mock_describe(bytes): + return "A test image" + + mock_figure_processor.describe = mock_describe + + # Mock process_text to return sections + def mock_process_text(pages, file, splitter, category): + return [] + + monkeypatch.setattr("prepdocslib.filestrategy.process_text", mock_process_text) + + # Call parse_file + sections = await parse_file( + mock_file, + {".txt": mock_processor}, + category=None, + blob_manager=mock_blob_manager, + image_embeddings_client=None, + figure_processor=mock_figure_processor, + user_oid=None, + ) + + assert sections == [] + + +@pytest.mark.asyncio +async def test_file_strategy_setup_with_content_understanding(monkeypatch, mock_env): + """Test that FileStrategy.setup() properly initializes content understanding.""" + + # Create mock list strategy + list_strategy = LocalListFileStrategy(path_pattern="*.txt") + + # Create blob manager + blob_manager = BlobManager( + endpoint=f"https://{os.environ['AZURE_STORAGE_ACCOUNT']}.blob.core.windows.net", + credential=MockAzureCredential(), + container=os.environ["AZURE_STORAGE_CONTAINER"], + account=os.environ["AZURE_STORAGE_ACCOUNT"], + resource_group=os.environ["AZURE_STORAGE_RESOURCE_GROUP"], + subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"], + ) + + # Create search info + search_info = SearchInfo( + endpoint="https://testsearchclient.blob.core.windows.net", + 
credential=MockAzureCredential(), + index_name="test", + ) + + # Create mock content understanding describer + class MockContentUnderstandingDescriber: + def __init__(self, endpoint, credential): + self.endpoint = endpoint + self.credential = credential + self.create_analyzer_called = False + + async def create_analyzer(self): + self.create_analyzer_called = True + + # Monkeypatch the ContentUnderstandingDescriber in multiple places + monkeypatch.setattr("prepdocslib.figureprocessor.ContentUnderstandingDescriber", MockContentUnderstandingDescriber) + monkeypatch.setattr("prepdocslib.filestrategy.ContentUnderstandingDescriber", MockContentUnderstandingDescriber) + + # Create figure processor with content understanding + figure_processor = FigureProcessor( + strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING, + credential=MockAzureCredential(), + content_understanding_endpoint="https://example.com", + ) + + # Mock create_index + async def mock_create_index(self): + pass + + monkeypatch.setattr("prepdocslib.searchmanager.SearchManager.create_index", mock_create_index) + + # Create file strategy + file_strategy = FileStrategy( + list_file_strategy=list_strategy, + blob_manager=blob_manager, + search_info=search_info, + file_processors={".txt": FileProcessor(TextParser(), SimpleTextSplitter())}, + figure_processor=figure_processor, + ) + + # Call setup + await file_strategy.setup() + + # Verify content understanding was initialized during setup + assert figure_processor.media_describer is not None + assert isinstance(figure_processor.media_describer, MockContentUnderstandingDescriber) + # create_analyzer should be called during setup for content understanding + assert figure_processor.media_describer.create_analyzer_called + assert figure_processor.content_understanding_ready diff --git a/tests/test_searchmanager.py b/tests/test_searchmanager.py index 5aeb31685d..a730f469f7 100644 --- a/tests/test_searchmanager.py +++ b/tests/test_searchmanager.py @@ -54,7 +54,7 @@ 
async def mock_list_index_names(self): monkeypatch.setattr(SearchIndexClient, "create_index", mock_create_index) monkeypatch.setattr(SearchIndexClient, "list_index_names", mock_list_index_names) - manager = SearchManager(search_info, use_int_vectorization=False, field_name_embedding="embedding") + manager = SearchManager(search_info, use_parent_index_projection=False, field_name_embedding="embedding") await manager.create_index() assert len(indexes) == 1, "It should have created one index" assert indexes[0].name == "test" @@ -77,7 +77,7 @@ async def mock_list_index_names(self): manager = SearchManager( search_info, - use_int_vectorization=True, + use_parent_index_projection=True, field_name_embedding="embedding", ) await manager.create_index() @@ -694,6 +694,7 @@ async def mock_upload_documents(self, documents): description="Test image", figure_id="fig1", page_num=0, + placeholder="
", # required positional arg url="http://example.com/img1.png", embedding=[0.01, 0.02], ) diff --git a/tests/test_servicesetup.py b/tests/test_servicesetup.py new file mode 100644 index 0000000000..0334042034 --- /dev/null +++ b/tests/test_servicesetup.py @@ -0,0 +1,353 @@ +import openai +import pytest +from openai.types.create_embedding_response import Usage + +from prepdocslib.embeddings import OpenAIEmbeddings +from prepdocslib.figureprocessor import FigureProcessor, MediaDescriptionStrategy +from prepdocslib.fileprocessor import FileProcessor +from prepdocslib.pdfparser import DocumentAnalysisParser +from prepdocslib.servicesetup import ( + OpenAIHost, + build_file_processors, + clean_key_if_exists, + select_processor_for_filename, + setup_blob_manager, + setup_embeddings_service, + setup_figure_processor, + setup_image_embeddings_service, + setup_openai_client, + setup_search_info, +) +from prepdocslib.textparser import TextParser + +from .mocks import ( + MOCK_EMBEDDING_DIMENSIONS, + MOCK_EMBEDDING_MODEL_NAME, + MockAzureCredential, +) +from .test_prepdocs import MockClient, MockEmbeddingsClient + + +def test_setup_blob_manager_respects_storage_key(monkeypatch: pytest.MonkeyPatch) -> None: + captured: dict[str, object] = {} + + class StubBlobManager: + def __init__( + self, + *, + endpoint: str, + container: str, + account: str, + credential: object, + resource_group: str, + subscription_id: str, + image_container: str | None = None, + ) -> None: + captured["endpoint"] = endpoint + captured["container"] = container + captured["account"] = account + captured["credential"] = credential + captured["resource_group"] = resource_group + captured["subscription_id"] = subscription_id + captured["image_container"] = image_container + + monkeypatch.setattr("prepdocslib.servicesetup.BlobManager", StubBlobManager) + + result = setup_blob_manager( + azure_credential=MockAzureCredential(), + storage_account="storageacct", + storage_container="docs", + 
storage_resource_group="rg", + subscription_id="sub-id", + storage_key="override-key", + image_storage_container="images", + ) + + assert isinstance(result, StubBlobManager) + assert captured["credential"] == "override-key" + assert captured["image_container"] == "images" + + +def test_setup_embeddings_service_populates_azure_metadata() -> None: + embeddings = setup_embeddings_service( + open_ai_client=MockClient( + MockEmbeddingsClient( + openai.types.CreateEmbeddingResponse( + object="list", + data=[], + model="text-embedding-3-large", + usage=Usage(prompt_tokens=0, total_tokens=0), + ) + ) + ), + openai_host=OpenAIHost.AZURE, + emb_model_name=MOCK_EMBEDDING_MODEL_NAME, + emb_model_dimensions=MOCK_EMBEDDING_DIMENSIONS, + azure_openai_deployment="deployment", + azure_openai_endpoint="https://service.openai.azure.com", + ) + + assert isinstance(embeddings, OpenAIEmbeddings) + assert embeddings.azure_deployment_name == "deployment" + assert embeddings.azure_endpoint == "https://service.openai.azure.com" + + +def test_setup_embeddings_service_requires_endpoint_for_azure() -> None: + with pytest.raises(ValueError): + setup_embeddings_service( + open_ai_client=MockClient( + MockEmbeddingsClient( + openai.types.CreateEmbeddingResponse( + object="list", + data=[], + model="text-embedding-3-large", + usage=Usage(prompt_tokens=0, total_tokens=0), + ) + ) + ), + openai_host=OpenAIHost.AZURE, + emb_model_name=MOCK_EMBEDDING_MODEL_NAME, + emb_model_dimensions=MOCK_EMBEDDING_DIMENSIONS, + azure_openai_deployment="deployment", + azure_openai_endpoint=None, + ) + + +def test_setup_embeddings_service_requires_deployment_for_azure() -> None: + with pytest.raises(ValueError): + setup_embeddings_service( + open_ai_client=MockClient( + MockEmbeddingsClient( + openai.types.CreateEmbeddingResponse( + object="list", + data=[], + model="text-embedding-3-large", + usage=Usage(prompt_tokens=0, total_tokens=0), + ) + ) + ), + openai_host=OpenAIHost.AZURE, + 
emb_model_name=MOCK_EMBEDDING_MODEL_NAME, + emb_model_dimensions=MOCK_EMBEDDING_DIMENSIONS, + azure_openai_deployment=None, + azure_openai_endpoint="https://service.openai.azure.com", + ) + + +def test_setup_openai_client_azure_constructs_endpoint_correctly(monkeypatch: pytest.MonkeyPatch) -> None: + """Test that setup_openai_client correctly constructs the Azure OpenAI endpoint URL from service name.""" + captured_base_url: list[str] = [] + + class StubAsyncOpenAI: + def __init__(self, *, base_url: str, api_key, **kwargs) -> None: + captured_base_url.append(base_url) + + monkeypatch.setattr("prepdocslib.servicesetup.AsyncOpenAI", StubAsyncOpenAI) + monkeypatch.setattr( + "prepdocslib.servicesetup.get_bearer_token_provider", lambda *args, **kwargs: lambda: "fake_token" + ) + + _, endpoint = setup_openai_client( + openai_host=OpenAIHost.AZURE, + azure_credential=MockAzureCredential(), + azure_openai_service="myopenaiservice", + ) + + # Verify the endpoint is constructed correctly + assert endpoint == "https://myopenaiservice.openai.azure.com" + # Verify the base_url includes the endpoint with the openai/v1 suffix + assert captured_base_url[0] == "https://myopenaiservice.openai.azure.com/openai/v1" + + +def test_setup_openai_client_azure_custom_uses_custom_url(monkeypatch: pytest.MonkeyPatch) -> None: + """Test that setup_openai_client uses the custom URL for azure_custom host.""" + captured_base_url: list[str] = [] + + class StubAsyncOpenAI: + def __init__(self, *, base_url: str, api_key, **kwargs) -> None: + captured_base_url.append(base_url) + + monkeypatch.setattr("prepdocslib.servicesetup.AsyncOpenAI", StubAsyncOpenAI) + + _, endpoint = setup_openai_client( + openai_host=OpenAIHost.AZURE_CUSTOM, + azure_credential=MockAzureCredential(), + azure_openai_custom_url="https://custom.endpoint.com/openai", + azure_openai_api_key="test-key", + ) + + # Verify the custom URL is used + assert captured_base_url[0] == "https://custom.endpoint.com/openai" + # Verify endpoint 
is None for custom URLs + assert endpoint is None + + +def test_setup_openai_client_azure_respects_api_key(monkeypatch: pytest.MonkeyPatch) -> None: + """Test that setup_openai_client uses the API key override when provided.""" + captured_api_key: list[str] = [] + + class StubAsyncOpenAI: + def __init__(self, *, base_url: str, api_key: str, **kwargs) -> None: + captured_api_key.append(api_key) + + monkeypatch.setattr("prepdocslib.servicesetup.AsyncOpenAI", StubAsyncOpenAI) + + setup_openai_client( + openai_host=OpenAIHost.AZURE, + azure_credential=MockAzureCredential(), + azure_openai_service="myopenaiservice", + azure_openai_api_key="my-api-key-override", + ) + + assert captured_api_key[0] == "my-api-key-override" + + +def test_setup_openai_client_openai_requires_api_key() -> None: + """Test that setup_openai_client raises ValueError when using OpenAI without API key.""" + with pytest.raises(ValueError, match="OpenAI key is required"): + setup_openai_client( + openai_host=OpenAIHost.OPENAI, + azure_credential=MockAzureCredential(), + openai_api_key=None, + ) + + +def test_setup_openai_client_azure_requires_service() -> None: + """Test that setup_openai_client raises ValueError when using Azure without service name.""" + with pytest.raises(ValueError, match="AZURE_OPENAI_SERVICE must be set"): + setup_openai_client( + openai_host=OpenAIHost.AZURE, + azure_credential=MockAzureCredential(), + azure_openai_service=None, + ) + + +def test_setup_openai_client_azure_custom_requires_url() -> None: + """Test that setup_openai_client raises ValueError when using azure_custom without custom URL.""" + with pytest.raises(ValueError, match="AZURE_OPENAI_CUSTOM_URL must be set"): + setup_openai_client( + openai_host=OpenAIHost.AZURE_CUSTOM, + azure_credential=MockAzureCredential(), + azure_openai_custom_url=None, + ) + + +def test_setup_search_info_agentic_retrieval_without_model(): + """Test that setup_search_info raises ValueError when using agentic retrieval without search 
agent model.""" + with pytest.raises(ValueError, match="SearchAgent model must be specified"): + setup_search_info( + azure_credential=MockAzureCredential(), + search_service="mysearch", + index_name="myindex", + use_agentic_retrieval=True, + azure_openai_searchagent_model=None, + ) + + +def test_setup_image_embeddings_multimodal_without_vision(): + """Test that setup_image_embeddings_service raises ValueError when using multimodal without vision endpoint.""" + with pytest.raises(ValueError, match="Azure AI Vision endpoint must be provided"): + setup_image_embeddings_service( + use_multimodal=True, + vision_endpoint=None, + azure_credential=MockAzureCredential(), + ) + + +def test_setup_figure_processor_content_understanding(): + """Test that setup_figure_processor returns correct processor for content understanding.""" + processor = setup_figure_processor( + use_multimodal=False, + use_content_understanding=True, + content_understanding_endpoint="https://example.com", + credential=MockAzureCredential(), + openai_client=None, + openai_model=None, + openai_deployment=None, + ) + + assert isinstance(processor, FigureProcessor) + assert processor.strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING + + +def test_build_file_processors_with_document_intelligence_key(): + """Test that build_file_processors uses key credential when provided.""" + file_processors = build_file_processors( + azure_credential=MockAzureCredential(), + document_intelligence_service="myservice", + document_intelligence_key="my-key", + use_local_pdf_parser=False, + use_local_html_parser=False, + ) + + assert ".pdf" in file_processors + assert isinstance(file_processors[".pdf"].parser, DocumentAnalysisParser) + + +def test_build_file_processors_text_files(): + """Test that build_file_processors includes text file parsers.""" + file_processors = build_file_processors( + azure_credential=MockAzureCredential(), + document_intelligence_service=None, + ) + + assert ".txt" in file_processors + 
assert isinstance(file_processors[".txt"].parser, TextParser) + assert ".md" in file_processors + assert isinstance(file_processors[".md"].parser, TextParser) + + +def test_build_file_processors_with_di_enables_office_formats(): + """Test that build_file_processors includes Office formats when DI is available.""" + file_processors = build_file_processors( + azure_credential=MockAzureCredential(), + document_intelligence_service="myservice", + ) + + assert ".docx" in file_processors + assert ".pptx" in file_processors + assert ".xlsx" in file_processors + assert isinstance(file_processors[".docx"].parser, DocumentAnalysisParser) + + +def test_build_file_processors_without_di_excludes_office_formats(): + """Test that build_file_processors excludes Office formats when DI is not available.""" + file_processors = build_file_processors( + azure_credential=MockAzureCredential(), + document_intelligence_service=None, + ) + + assert ".docx" not in file_processors + assert ".pptx" not in file_processors + assert ".xlsx" not in file_processors + + +def test_clean_key_if_exists_handles_whitespace() -> None: + assert clean_key_if_exists(" secret ") == "secret" + assert clean_key_if_exists(" ") is None + assert clean_key_if_exists(None) is None + + +def test_build_file_processors_logs_when_no_parsers( + monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +) -> None: + caplog.set_level("WARNING") + monkeypatch.setattr("prepdocslib.servicesetup.DocumentAnalysisParser", lambda *args, **kwargs: None) + + processors = build_file_processors( + azure_credential=MockAzureCredential(), + document_intelligence_service="service", + use_local_pdf_parser=False, + use_local_html_parser=False, + ) + + assert ".pdf" not in processors + assert ".html" not in processors + warnings = {record.message for record in caplog.records} + assert any("No PDF parser available" in message for message in warnings) + assert any("No HTML parser available" in message for message in warnings) + + 
+def test_select_processor_for_filename_raises_when_unknown() -> None: + with pytest.raises(ValueError, match="Unsupported file type: file.unsupported"): + select_processor_for_filename("file.unsupported", {".txt": FileProcessor(TextParser(), None)}) diff --git a/tests/test_textprocessor.py b/tests/test_textprocessor.py new file mode 100644 index 0000000000..1ff739280d --- /dev/null +++ b/tests/test_textprocessor.py @@ -0,0 +1,70 @@ +from prepdocslib.page import ImageOnPage, Page +from prepdocslib.textprocessor import combine_text_with_figures + + +def test_combine_text_with_figures_no_description(): + """Test combine_text_with_figures when image has no description.""" + image = ImageOnPage( + bytes=b"fake", + bbox=(0, 0, 100, 100), + filename="test.png", + page_num=1, + figure_id="fig_1", + placeholder="[PLACEHOLDER_fig_1]", + description=None, + ) + + page = Page(page_num=1, text="Some text [PLACEHOLDER_fig_1] more text", offset=0) + page.images = [image] + + # Should keep placeholder when no description + combine_text_with_figures(page) + + assert "[PLACEHOLDER_fig_1]" in page.text + assert "
" not in page.text + + +def test_combine_text_with_figures_placeholder_not_found(caplog): + """Test combine_text_with_figures when placeholder is not in text.""" + import logging + + image = ImageOnPage( + bytes=b"fake", + bbox=(0, 0, 100, 100), + filename="test.png", + page_num=1, + figure_id="fig_1", + placeholder="[PLACEHOLDER_fig_1]", + description="A test image", + ) + + page = Page(page_num=1, text="Some text without placeholder", offset=0) + page.images = [image] + + with caplog.at_level(logging.WARNING): + combine_text_with_figures(page) + + assert "Placeholder not found for figure fig_1" in caplog.text + + +def test_combine_text_with_figures_replaces_successfully(): + """Test combine_text_with_figures successfully replaces placeholder.""" + image = ImageOnPage( + bytes=b"fake", + bbox=(0, 0, 100, 100), + filename="test.png", + page_num=1, + figure_id="fig_1", + title="Test Figure", + placeholder="[PLACEHOLDER_fig_1]", + description="A test image", + ) + + page = Page(page_num=1, text="Some text [PLACEHOLDER_fig_1] more text", offset=0) + page.images = [image] + + combine_text_with_figures(page) + + assert "[PLACEHOLDER_fig_1]" not in page.text + assert "
" in page.text + assert "A test image" in page.text