diff --git a/.gitignore b/.gitignore index 0102334ea2..83b30e735c 100644 --- a/.gitignore +++ b/.gitignore @@ -149,6 +149,7 @@ node_modules static/ app/functions/*/prepdocslib/ +app/functions/*/requirements.txt data/**/*.md5 diff --git a/app/backend/prepdocslib/cloudingestionstrategy.py b/app/backend/prepdocslib/cloudingestionstrategy.py index 2da9c367ae..324ab2910b 100644 --- a/app/backend/prepdocslib/cloudingestionstrategy.py +++ b/app/backend/prepdocslib/cloudingestionstrategy.py @@ -161,10 +161,9 @@ def _build_skillset(self) -> SearchIndexerSkillset: resource_id=self.search_user_assigned_identity_resource_id ), inputs=[ - # Provide the binary payload expected by the document extractor custom skill. - InputFieldMappingEntry(name="file_data", source="/document/file_data"), - InputFieldMappingEntry(name="file_name", source="/document/metadata_storage_name"), - InputFieldMappingEntry(name="content_type", source="/document/metadata_storage_content_type"), + # Always provide the blob URL so the function can download large files (> 16MB) + InputFieldMappingEntry(name="metadata_storage_path", source="/document/metadata_storage_path"), + # We are not using the SAS token since the functions have RBAC access via managed identity ], outputs=[ OutputFieldMappingEntry(name="pages", target_name="pages"), @@ -310,7 +309,7 @@ async def setup(self) -> None: configuration=IndexingParametersConfiguration( query_timeout=None, # type: ignore data_to_extract="storageMetadata", - allow_skillset_to_read_file_data=True, + allow_skillset_to_read_file_data=False, ) ), ) diff --git a/app/functions/document_extractor/function_app.py b/app/functions/document_extractor/function_app.py index 22f269b2a1..816d135fbf 100644 --- a/app/functions/document_extractor/function_app.py +++ b/app/functions/document_extractor/function_app.py @@ -3,23 +3,25 @@ Custom skill for Azure AI Search that extracts and processes document content. 
""" -import base64 import io import json import logging import os from dataclasses import dataclass from typing import Any +from urllib.parse import unquote, urlparse import azure.functions as func from azure.core.exceptions import HttpResponseError from azure.identity.aio import ManagedIdentityCredential +from prepdocslib.blobmanager import BlobManager from prepdocslib.fileprocessor import FileProcessor from prepdocslib.page import Page from prepdocslib.servicesetup import ( build_file_processors, select_processor_for_filename, + setup_blob_manager, ) app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS) @@ -31,6 +33,7 @@ class GlobalSettings: file_processors: dict[str, FileProcessor] azure_credential: ManagedIdentityCredential + blob_manager: BlobManager settings: GlobalSettings | None = None @@ -63,9 +66,18 @@ def configure_global_settings(): process_figures=use_multimodal, ) + blob_manager = setup_blob_manager( + azure_credential=azure_credential, + storage_account=os.environ["AZURE_STORAGE_ACCOUNT"], + storage_container=os.environ["AZURE_STORAGE_CONTAINER"], + storage_resource_group=os.environ["AZURE_STORAGE_RESOURCE_GROUP"], + subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"], + ) + settings = GlobalSettings( file_processors=file_processors, azure_credential=azure_credential, + blob_manager=blob_manager, ) @@ -75,20 +87,13 @@ async def extract_document(req: func.HttpRequest) -> func.HttpResponse: """ Azure Search Custom Skill: Extract document content - Input format (single record; file data only): - # https://learn.microsoft.com/azure/search/cognitive-search-skill-document-intelligence-layout#skill-inputs + Input format (single record): { "values": [ { "recordId": "1", "data": { - // Base64 encoded file (skillset must enable file data) - "file_data": { - "$type": "file", - "data": "base64..." 
- }, - // Optional - "file_name": "doc.pdf" + "metadata_storage_path": "https://<account>.blob.core.windows.net/<container>/<blob_path>" } } ] @@ -176,16 +181,37 @@ async def process_document(data: dict[str, Any]) -> dict[str, Any]: Process a single document: download, parse, extract figures, upload images Args: - data: Input data with blobUrl, fileName, contentType + data: Input data with metadata_storage_path Returns: Dictionary with 'text' (markdown) and 'images' (list of {url, description}) """ - document_stream, file_name, content_type = get_document_stream_filedata(data) - logger.info("Processing document: %s", file_name) + if settings is None: + raise RuntimeError("Global settings not initialized") + + # Get blob path from metadata_storage_path URL + # URL format: https://<account>.blob.core.windows.net/<container>/<blob_path> + storage_path = data["metadata_storage_path"] + parsed_url = urlparse(storage_path) + # Path is /<container>/<blob_path>, so split and take everything after container + path_parts = unquote(parsed_url.path).lstrip("/").split("/", 1) + if len(path_parts) < 2: + raise ValueError(f"Invalid storage path format: {storage_path}") + blob_path_within_container = path_parts[1] # Everything after the container name + + logger.info("Downloading blob: %s", blob_path_within_container) + result = await settings.blob_manager.download_blob(blob_path_within_container) + if result is None: + raise ValueError(f"Blob not found: {blob_path_within_container}") + + document_bytes, properties = result + document_stream = io.BytesIO(document_bytes) + document_stream.name = blob_path_within_container + + logger.info("Processing document: %s", blob_path_within_container) # Get parser from file_processors dict based on file extension - file_processor = select_processor_for_filename(file_name, settings.file_processors) + file_processor = select_processor_for_filename(blob_path_within_container, settings.file_processors) parser = file_processor.parser pages: list[Page] = [] @@ -193,28 +219,14 @@ async def process_document(data: dict[str, Any]) -> 
dict[str, Any]: document_stream.seek(0) pages = [page async for page in parser.parse(content=document_stream)] except HttpResponseError as exc: - raise ValueError(f"Parser failed for {file_name}: {exc.message}") from exc + raise ValueError(f"Parser failed for {blob_path_within_container}: {exc.message}") from exc finally: document_stream.close() - components = build_document_components(file_name, pages) + components = build_document_components(blob_path_within_container, pages) return components -def get_document_stream_filedata(data: dict[str, Any]) -> tuple[io.BytesIO, str, str]: - """Return a BytesIO stream for file_data input only (skillset must send file bytes).""" - file_payload = data.get("file_data", {}) - encoded = file_payload.get("data") - if not encoded: - raise ValueError("file_data payload missing base64 data") - document_bytes = base64.b64decode(encoded) - file_name = data.get("file_name") or data.get("fileName") or file_payload.get("name") or "document" - content_type = data.get("contentType") or file_payload.get("contentType") or "application/octet-stream" - stream = io.BytesIO(document_bytes) - stream.name = file_name - return stream, file_name, content_type - - def build_document_components(file_name: str, pages: list[Page]) -> dict[str, Any]: page_entries: list[dict[str, Any]] = [] figure_entries: list[dict[str, Any]] = [] diff --git a/app/functions/document_extractor/requirements.txt b/app/functions/document_extractor/requirements.txt deleted file mode 100644 index 70565b78c6..0000000000 --- a/app/functions/document_extractor/requirements.txt +++ /dev/null @@ -1,456 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile requirements.in -o requirements.txt --python-version 3.10 -aiofiles==24.1.0 - # via - # prompty - # quart -aiohappyeyeballs==2.6.1 - # via aiohttp -aiohttp==3.12.14 - # via - # -r requirements.in - # microsoft-kiota-authentication-azure -aiosignal==1.4.0 - # via aiohttp -annotated-types==0.7.0 
- # via pydantic -anyio==4.4.0 - # via - # httpx - # openai -asgiref==3.10.0 - # via opentelemetry-instrumentation-asgi -async-timeout==5.0.1 - # via aiohttp -attrs==25.3.0 - # via aiohttp -azure-ai-documentintelligence==1.0.0b4 - # via -r requirements.in -azure-cognitiveservices-speech==1.40.0 - # via -r requirements.in -azure-common==1.1.28 - # via azure-search-documents -azure-core==1.35.0 - # via - # azure-ai-documentintelligence - # azure-core-tracing-opentelemetry - # azure-cosmos - # azure-identity - # azure-monitor-opentelemetry - # azure-monitor-opentelemetry-exporter - # azure-search-documents - # azure-storage-blob - # azure-storage-file-datalake - # microsoft-kiota-authentication-azure - # msrest -azure-core-tracing-opentelemetry==1.0.0b11 - # via azure-monitor-opentelemetry -azure-cosmos==4.9.0 - # via -r requirements.in -azure-functions==1.24.0 - # via -r requirements.in -azure-identity==1.17.1 - # via - # -r requirements.in - # azure-monitor-opentelemetry-exporter - # msgraph-sdk -azure-monitor-opentelemetry==1.8.1 - # via -r requirements.in -azure-monitor-opentelemetry-exporter==1.0.0b44 - # via azure-monitor-opentelemetry -azure-search-documents==11.7.0b2 - # via -r requirements.in -azure-storage-blob==12.22.0 - # via - # -r requirements.in - # azure-storage-file-datalake -azure-storage-file-datalake==12.16.0 - # via -r requirements.in -beautifulsoup4==4.12.3 - # via -r requirements.in -blinker==1.9.0 - # via - # flask - # quart -certifi==2024.7.4 - # via - # httpcore - # httpx - # msrest - # requests -cffi==1.17.0 - # via cryptography -charset-normalizer==3.3.2 - # via requests -click==8.3.0 - # via - # flask - # prompty - # quart - # uvicorn -cryptography==44.0.1 - # via - # -r requirements.in - # azure-identity - # azure-storage-blob - # msal - # pyjwt -distro==1.9.0 - # via openai -exceptiongroup==1.3.0 - # via - # anyio - # hypercorn - # taskgroup -fixedint==0.1.6 - # via azure-monitor-opentelemetry-exporter -flask==3.1.2 - # via quart 
-frozenlist==1.4.1 - # via - # aiohttp - # aiosignal -h11==0.16.0 - # via - # httpcore - # hypercorn - # uvicorn - # wsproto -h2==4.3.0 - # via - # httpx - # hypercorn -hpack==4.1.0 - # via h2 -httpcore==1.0.9 - # via httpx -httpx==0.28.1 - # via - # microsoft-kiota-http - # msgraph-core - # openai -hypercorn==0.17.3 - # via quart -hyperframe==6.1.0 - # via h2 -idna==3.10 - # via - # anyio - # httpx - # requests - # yarl -importlib-metadata==8.0.0 - # via opentelemetry-api -isodate==0.6.1 - # via - # azure-ai-documentintelligence - # azure-search-documents - # azure-storage-blob - # azure-storage-file-datalake - # msrest -itsdangerous==2.2.0 - # via - # flask - # quart -jinja2==3.1.6 - # via - # flask - # prompty - # quart -jiter==0.11.0 - # via openai -markdown-it-py==3.0.0 - # via rich -markupsafe==3.0.3 - # via - # flask - # jinja2 - # quart - # werkzeug -mdurl==0.1.2 - # via markdown-it-py -microsoft-kiota-abstractions==1.9.3 - # via - # microsoft-kiota-authentication-azure - # microsoft-kiota-http - # microsoft-kiota-serialization-form - # microsoft-kiota-serialization-json - # microsoft-kiota-serialization-multipart - # microsoft-kiota-serialization-text - # msgraph-core -microsoft-kiota-authentication-azure==1.9.3 - # via msgraph-core -microsoft-kiota-http==1.9.3 - # via msgraph-core -microsoft-kiota-serialization-form==1.9.3 - # via msgraph-sdk -microsoft-kiota-serialization-json==1.9.3 - # via msgraph-sdk -microsoft-kiota-serialization-multipart==1.9.3 - # via msgraph-sdk -microsoft-kiota-serialization-text==1.9.3 - # via msgraph-sdk -msal==1.33.0 - # via - # -r requirements.in - # azure-identity - # msal-extensions -msal-extensions==1.3.1 - # via azure-identity -msgraph-core==1.3.3 - # via msgraph-sdk -msgraph-sdk==1.45.0 - # via -r requirements.in -msrest==0.7.1 - # via azure-monitor-opentelemetry-exporter -multidict==6.7.0 - # via - # aiohttp - # yarl -oauthlib==3.3.1 - # via requests-oauthlib -openai==2.6.1 - # via -r requirements.in 
-opentelemetry-api==1.38.0 - # via - # azure-core-tracing-opentelemetry - # azure-monitor-opentelemetry-exporter - # microsoft-kiota-abstractions - # microsoft-kiota-authentication-azure - # microsoft-kiota-http - # opentelemetry-instrumentation - # opentelemetry-instrumentation-aiohttp-client - # opentelemetry-instrumentation-asgi - # opentelemetry-instrumentation-dbapi - # opentelemetry-instrumentation-django - # opentelemetry-instrumentation-fastapi - # opentelemetry-instrumentation-flask - # opentelemetry-instrumentation-httpx - # opentelemetry-instrumentation-openai - # opentelemetry-instrumentation-psycopg2 - # opentelemetry-instrumentation-requests - # opentelemetry-instrumentation-urllib - # opentelemetry-instrumentation-urllib3 - # opentelemetry-instrumentation-wsgi - # opentelemetry-sdk - # opentelemetry-semantic-conventions -opentelemetry-instrumentation==0.59b0 - # via - # opentelemetry-instrumentation-aiohttp-client - # opentelemetry-instrumentation-asgi - # opentelemetry-instrumentation-dbapi - # opentelemetry-instrumentation-django - # opentelemetry-instrumentation-fastapi - # opentelemetry-instrumentation-flask - # opentelemetry-instrumentation-httpx - # opentelemetry-instrumentation-openai - # opentelemetry-instrumentation-psycopg2 - # opentelemetry-instrumentation-requests - # opentelemetry-instrumentation-urllib - # opentelemetry-instrumentation-urllib3 - # opentelemetry-instrumentation-wsgi -opentelemetry-instrumentation-aiohttp-client==0.59b0 - # via -r requirements.in -opentelemetry-instrumentation-asgi==0.59b0 - # via - # -r requirements.in - # opentelemetry-instrumentation-fastapi -opentelemetry-instrumentation-dbapi==0.59b0 - # via opentelemetry-instrumentation-psycopg2 -opentelemetry-instrumentation-django==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-fastapi==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-flask==0.59b0 - # via azure-monitor-opentelemetry 
-opentelemetry-instrumentation-httpx==0.59b0 - # via -r requirements.in -opentelemetry-instrumentation-openai==0.47.5 - # via -r requirements.in -opentelemetry-instrumentation-psycopg2==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-requests==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-urllib==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-urllib3==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-wsgi==0.59b0 - # via - # opentelemetry-instrumentation-django - # opentelemetry-instrumentation-flask -opentelemetry-resource-detector-azure==0.1.5 - # via azure-monitor-opentelemetry -opentelemetry-sdk==1.38.0 - # via - # azure-monitor-opentelemetry - # azure-monitor-opentelemetry-exporter - # microsoft-kiota-abstractions - # microsoft-kiota-authentication-azure - # microsoft-kiota-http - # opentelemetry-resource-detector-azure -opentelemetry-semantic-conventions==0.59b0 - # via - # opentelemetry-instrumentation - # opentelemetry-instrumentation-aiohttp-client - # opentelemetry-instrumentation-asgi - # opentelemetry-instrumentation-dbapi - # opentelemetry-instrumentation-django - # opentelemetry-instrumentation-fastapi - # opentelemetry-instrumentation-flask - # opentelemetry-instrumentation-httpx - # opentelemetry-instrumentation-openai - # opentelemetry-instrumentation-requests - # opentelemetry-instrumentation-urllib - # opentelemetry-instrumentation-urllib3 - # opentelemetry-instrumentation-wsgi - # opentelemetry-sdk -opentelemetry-semantic-conventions-ai==0.4.13 - # via opentelemetry-instrumentation-openai -opentelemetry-util-http==0.59b0 - # via - # opentelemetry-instrumentation-aiohttp-client - # opentelemetry-instrumentation-asgi - # opentelemetry-instrumentation-django - # opentelemetry-instrumentation-fastapi - # opentelemetry-instrumentation-flask - # opentelemetry-instrumentation-httpx - # opentelemetry-instrumentation-requests - # 
opentelemetry-instrumentation-urllib - # opentelemetry-instrumentation-urllib3 - # opentelemetry-instrumentation-wsgi -packaging==24.1 - # via - # opentelemetry-instrumentation - # opentelemetry-instrumentation-flask -pillow==12.0.0 - # via -r requirements.in -priority==2.0.0 - # via hypercorn -prompty==0.1.50 - # via -r requirements.in -propcache==0.2.0 - # via - # aiohttp - # yarl -psutil==7.1.2 - # via azure-monitor-opentelemetry-exporter -pycparser==2.22 - # via cffi -pydantic==2.12.3 - # via openai -pydantic-core==2.41.4 - # via pydantic -pygments==2.19.2 - # via rich -pyjwt==2.10.1 - # via - # -r requirements.in - # msal -pymupdf==1.26.0 - # via -r requirements.in -pypdf==6.1.3 - # via -r requirements.in -python-dotenv==1.1.1 - # via - # -r requirements.in - # prompty -pyyaml==6.0.2 - # via prompty -quart==0.20.0 - # via - # -r requirements.in - # quart-cors -quart-cors==0.7.0 - # via -r requirements.in -regex==2025.7.34 - # via tiktoken -requests==2.32.4 - # via - # azure-core - # msal - # msrest - # requests-oauthlib - # tiktoken -requests-oauthlib==2.0.0 - # via msrest -rich==14.1.0 - # via -r requirements.in -six==1.16.0 - # via - # azure-core - # isodate -sniffio==1.3.1 - # via - # anyio - # openai -soupsieve==2.7 - # via beautifulsoup4 -std-uritemplate==2.0.5 - # via microsoft-kiota-abstractions -taskgroup==0.2.2 - # via hypercorn -tenacity==9.1.2 - # via -r requirements.in -tiktoken==0.12.0 - # via -r requirements.in -tomli==2.2.1 - # via hypercorn -tqdm==4.66.5 - # via openai -types-beautifulsoup4==4.12.0.20240511 - # via -r requirements.in -types-html5lib==1.1.11.20241018 - # via types-beautifulsoup4 -types-pillow==10.2.0.20240822 - # via -r requirements.in -typing-extensions==4.15.0 - # via - # -r requirements.in - # aiosignal - # anyio - # asgiref - # azure-ai-documentintelligence - # azure-core - # azure-cosmos - # azure-identity - # azure-search-documents - # azure-storage-blob - # azure-storage-file-datalake - # exceptiongroup - # hypercorn - # 
multidict - # openai - # opentelemetry-api - # opentelemetry-sdk - # opentelemetry-semantic-conventions - # pydantic - # pydantic-core - # pypdf - # taskgroup - # typing-inspection - # uvicorn -typing-inspection==0.4.2 - # via pydantic -urllib3==2.5.0 - # via requests -uvicorn==0.30.6 - # via -r requirements.in -werkzeug==3.1.3 - # via - # azure-functions - # flask - # quart -wrapt==1.16.0 - # via - # opentelemetry-instrumentation - # opentelemetry-instrumentation-aiohttp-client - # opentelemetry-instrumentation-dbapi - # opentelemetry-instrumentation-httpx - # opentelemetry-instrumentation-urllib3 -wsproto==1.2.0 - # via hypercorn -yarl==1.17.2 - # via aiohttp -zipp==3.21.0 - # via importlib-metadata diff --git a/app/functions/figure_processor/requirements.txt b/app/functions/figure_processor/requirements.txt deleted file mode 100644 index 70565b78c6..0000000000 --- a/app/functions/figure_processor/requirements.txt +++ /dev/null @@ -1,456 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile requirements.in -o requirements.txt --python-version 3.10 -aiofiles==24.1.0 - # via - # prompty - # quart -aiohappyeyeballs==2.6.1 - # via aiohttp -aiohttp==3.12.14 - # via - # -r requirements.in - # microsoft-kiota-authentication-azure -aiosignal==1.4.0 - # via aiohttp -annotated-types==0.7.0 - # via pydantic -anyio==4.4.0 - # via - # httpx - # openai -asgiref==3.10.0 - # via opentelemetry-instrumentation-asgi -async-timeout==5.0.1 - # via aiohttp -attrs==25.3.0 - # via aiohttp -azure-ai-documentintelligence==1.0.0b4 - # via -r requirements.in -azure-cognitiveservices-speech==1.40.0 - # via -r requirements.in -azure-common==1.1.28 - # via azure-search-documents -azure-core==1.35.0 - # via - # azure-ai-documentintelligence - # azure-core-tracing-opentelemetry - # azure-cosmos - # azure-identity - # azure-monitor-opentelemetry - # azure-monitor-opentelemetry-exporter - # azure-search-documents - # azure-storage-blob - # 
azure-storage-file-datalake - # microsoft-kiota-authentication-azure - # msrest -azure-core-tracing-opentelemetry==1.0.0b11 - # via azure-monitor-opentelemetry -azure-cosmos==4.9.0 - # via -r requirements.in -azure-functions==1.24.0 - # via -r requirements.in -azure-identity==1.17.1 - # via - # -r requirements.in - # azure-monitor-opentelemetry-exporter - # msgraph-sdk -azure-monitor-opentelemetry==1.8.1 - # via -r requirements.in -azure-monitor-opentelemetry-exporter==1.0.0b44 - # via azure-monitor-opentelemetry -azure-search-documents==11.7.0b2 - # via -r requirements.in -azure-storage-blob==12.22.0 - # via - # -r requirements.in - # azure-storage-file-datalake -azure-storage-file-datalake==12.16.0 - # via -r requirements.in -beautifulsoup4==4.12.3 - # via -r requirements.in -blinker==1.9.0 - # via - # flask - # quart -certifi==2024.7.4 - # via - # httpcore - # httpx - # msrest - # requests -cffi==1.17.0 - # via cryptography -charset-normalizer==3.3.2 - # via requests -click==8.3.0 - # via - # flask - # prompty - # quart - # uvicorn -cryptography==44.0.1 - # via - # -r requirements.in - # azure-identity - # azure-storage-blob - # msal - # pyjwt -distro==1.9.0 - # via openai -exceptiongroup==1.3.0 - # via - # anyio - # hypercorn - # taskgroup -fixedint==0.1.6 - # via azure-monitor-opentelemetry-exporter -flask==3.1.2 - # via quart -frozenlist==1.4.1 - # via - # aiohttp - # aiosignal -h11==0.16.0 - # via - # httpcore - # hypercorn - # uvicorn - # wsproto -h2==4.3.0 - # via - # httpx - # hypercorn -hpack==4.1.0 - # via h2 -httpcore==1.0.9 - # via httpx -httpx==0.28.1 - # via - # microsoft-kiota-http - # msgraph-core - # openai -hypercorn==0.17.3 - # via quart -hyperframe==6.1.0 - # via h2 -idna==3.10 - # via - # anyio - # httpx - # requests - # yarl -importlib-metadata==8.0.0 - # via opentelemetry-api -isodate==0.6.1 - # via - # azure-ai-documentintelligence - # azure-search-documents - # azure-storage-blob - # azure-storage-file-datalake - # msrest 
-itsdangerous==2.2.0 - # via - # flask - # quart -jinja2==3.1.6 - # via - # flask - # prompty - # quart -jiter==0.11.0 - # via openai -markdown-it-py==3.0.0 - # via rich -markupsafe==3.0.3 - # via - # flask - # jinja2 - # quart - # werkzeug -mdurl==0.1.2 - # via markdown-it-py -microsoft-kiota-abstractions==1.9.3 - # via - # microsoft-kiota-authentication-azure - # microsoft-kiota-http - # microsoft-kiota-serialization-form - # microsoft-kiota-serialization-json - # microsoft-kiota-serialization-multipart - # microsoft-kiota-serialization-text - # msgraph-core -microsoft-kiota-authentication-azure==1.9.3 - # via msgraph-core -microsoft-kiota-http==1.9.3 - # via msgraph-core -microsoft-kiota-serialization-form==1.9.3 - # via msgraph-sdk -microsoft-kiota-serialization-json==1.9.3 - # via msgraph-sdk -microsoft-kiota-serialization-multipart==1.9.3 - # via msgraph-sdk -microsoft-kiota-serialization-text==1.9.3 - # via msgraph-sdk -msal==1.33.0 - # via - # -r requirements.in - # azure-identity - # msal-extensions -msal-extensions==1.3.1 - # via azure-identity -msgraph-core==1.3.3 - # via msgraph-sdk -msgraph-sdk==1.45.0 - # via -r requirements.in -msrest==0.7.1 - # via azure-monitor-opentelemetry-exporter -multidict==6.7.0 - # via - # aiohttp - # yarl -oauthlib==3.3.1 - # via requests-oauthlib -openai==2.6.1 - # via -r requirements.in -opentelemetry-api==1.38.0 - # via - # azure-core-tracing-opentelemetry - # azure-monitor-opentelemetry-exporter - # microsoft-kiota-abstractions - # microsoft-kiota-authentication-azure - # microsoft-kiota-http - # opentelemetry-instrumentation - # opentelemetry-instrumentation-aiohttp-client - # opentelemetry-instrumentation-asgi - # opentelemetry-instrumentation-dbapi - # opentelemetry-instrumentation-django - # opentelemetry-instrumentation-fastapi - # opentelemetry-instrumentation-flask - # opentelemetry-instrumentation-httpx - # opentelemetry-instrumentation-openai - # opentelemetry-instrumentation-psycopg2 - # 
opentelemetry-instrumentation-requests - # opentelemetry-instrumentation-urllib - # opentelemetry-instrumentation-urllib3 - # opentelemetry-instrumentation-wsgi - # opentelemetry-sdk - # opentelemetry-semantic-conventions -opentelemetry-instrumentation==0.59b0 - # via - # opentelemetry-instrumentation-aiohttp-client - # opentelemetry-instrumentation-asgi - # opentelemetry-instrumentation-dbapi - # opentelemetry-instrumentation-django - # opentelemetry-instrumentation-fastapi - # opentelemetry-instrumentation-flask - # opentelemetry-instrumentation-httpx - # opentelemetry-instrumentation-openai - # opentelemetry-instrumentation-psycopg2 - # opentelemetry-instrumentation-requests - # opentelemetry-instrumentation-urllib - # opentelemetry-instrumentation-urllib3 - # opentelemetry-instrumentation-wsgi -opentelemetry-instrumentation-aiohttp-client==0.59b0 - # via -r requirements.in -opentelemetry-instrumentation-asgi==0.59b0 - # via - # -r requirements.in - # opentelemetry-instrumentation-fastapi -opentelemetry-instrumentation-dbapi==0.59b0 - # via opentelemetry-instrumentation-psycopg2 -opentelemetry-instrumentation-django==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-fastapi==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-flask==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-httpx==0.59b0 - # via -r requirements.in -opentelemetry-instrumentation-openai==0.47.5 - # via -r requirements.in -opentelemetry-instrumentation-psycopg2==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-requests==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-urllib==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-urllib3==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-wsgi==0.59b0 - # via - # opentelemetry-instrumentation-django - # opentelemetry-instrumentation-flask 
-opentelemetry-resource-detector-azure==0.1.5 - # via azure-monitor-opentelemetry -opentelemetry-sdk==1.38.0 - # via - # azure-monitor-opentelemetry - # azure-monitor-opentelemetry-exporter - # microsoft-kiota-abstractions - # microsoft-kiota-authentication-azure - # microsoft-kiota-http - # opentelemetry-resource-detector-azure -opentelemetry-semantic-conventions==0.59b0 - # via - # opentelemetry-instrumentation - # opentelemetry-instrumentation-aiohttp-client - # opentelemetry-instrumentation-asgi - # opentelemetry-instrumentation-dbapi - # opentelemetry-instrumentation-django - # opentelemetry-instrumentation-fastapi - # opentelemetry-instrumentation-flask - # opentelemetry-instrumentation-httpx - # opentelemetry-instrumentation-openai - # opentelemetry-instrumentation-requests - # opentelemetry-instrumentation-urllib - # opentelemetry-instrumentation-urllib3 - # opentelemetry-instrumentation-wsgi - # opentelemetry-sdk -opentelemetry-semantic-conventions-ai==0.4.13 - # via opentelemetry-instrumentation-openai -opentelemetry-util-http==0.59b0 - # via - # opentelemetry-instrumentation-aiohttp-client - # opentelemetry-instrumentation-asgi - # opentelemetry-instrumentation-django - # opentelemetry-instrumentation-fastapi - # opentelemetry-instrumentation-flask - # opentelemetry-instrumentation-httpx - # opentelemetry-instrumentation-requests - # opentelemetry-instrumentation-urllib - # opentelemetry-instrumentation-urllib3 - # opentelemetry-instrumentation-wsgi -packaging==24.1 - # via - # opentelemetry-instrumentation - # opentelemetry-instrumentation-flask -pillow==12.0.0 - # via -r requirements.in -priority==2.0.0 - # via hypercorn -prompty==0.1.50 - # via -r requirements.in -propcache==0.2.0 - # via - # aiohttp - # yarl -psutil==7.1.2 - # via azure-monitor-opentelemetry-exporter -pycparser==2.22 - # via cffi -pydantic==2.12.3 - # via openai -pydantic-core==2.41.4 - # via pydantic -pygments==2.19.2 - # via rich -pyjwt==2.10.1 - # via - # -r requirements.in - # 
msal -pymupdf==1.26.0 - # via -r requirements.in -pypdf==6.1.3 - # via -r requirements.in -python-dotenv==1.1.1 - # via - # -r requirements.in - # prompty -pyyaml==6.0.2 - # via prompty -quart==0.20.0 - # via - # -r requirements.in - # quart-cors -quart-cors==0.7.0 - # via -r requirements.in -regex==2025.7.34 - # via tiktoken -requests==2.32.4 - # via - # azure-core - # msal - # msrest - # requests-oauthlib - # tiktoken -requests-oauthlib==2.0.0 - # via msrest -rich==14.1.0 - # via -r requirements.in -six==1.16.0 - # via - # azure-core - # isodate -sniffio==1.3.1 - # via - # anyio - # openai -soupsieve==2.7 - # via beautifulsoup4 -std-uritemplate==2.0.5 - # via microsoft-kiota-abstractions -taskgroup==0.2.2 - # via hypercorn -tenacity==9.1.2 - # via -r requirements.in -tiktoken==0.12.0 - # via -r requirements.in -tomli==2.2.1 - # via hypercorn -tqdm==4.66.5 - # via openai -types-beautifulsoup4==4.12.0.20240511 - # via -r requirements.in -types-html5lib==1.1.11.20241018 - # via types-beautifulsoup4 -types-pillow==10.2.0.20240822 - # via -r requirements.in -typing-extensions==4.15.0 - # via - # -r requirements.in - # aiosignal - # anyio - # asgiref - # azure-ai-documentintelligence - # azure-core - # azure-cosmos - # azure-identity - # azure-search-documents - # azure-storage-blob - # azure-storage-file-datalake - # exceptiongroup - # hypercorn - # multidict - # openai - # opentelemetry-api - # opentelemetry-sdk - # opentelemetry-semantic-conventions - # pydantic - # pydantic-core - # pypdf - # taskgroup - # typing-inspection - # uvicorn -typing-inspection==0.4.2 - # via pydantic -urllib3==2.5.0 - # via requests -uvicorn==0.30.6 - # via -r requirements.in -werkzeug==3.1.3 - # via - # azure-functions - # flask - # quart -wrapt==1.16.0 - # via - # opentelemetry-instrumentation - # opentelemetry-instrumentation-aiohttp-client - # opentelemetry-instrumentation-dbapi - # opentelemetry-instrumentation-httpx - # opentelemetry-instrumentation-urllib3 -wsproto==1.2.0 - # via 
hypercorn -yarl==1.17.2 - # via aiohttp -zipp==3.21.0 - # via importlib-metadata diff --git a/app/functions/text_processor/requirements.txt b/app/functions/text_processor/requirements.txt deleted file mode 100644 index 70565b78c6..0000000000 --- a/app/functions/text_processor/requirements.txt +++ /dev/null @@ -1,456 +0,0 @@ -# This file was autogenerated by uv via the following command: -# uv pip compile requirements.in -o requirements.txt --python-version 3.10 -aiofiles==24.1.0 - # via - # prompty - # quart -aiohappyeyeballs==2.6.1 - # via aiohttp -aiohttp==3.12.14 - # via - # -r requirements.in - # microsoft-kiota-authentication-azure -aiosignal==1.4.0 - # via aiohttp -annotated-types==0.7.0 - # via pydantic -anyio==4.4.0 - # via - # httpx - # openai -asgiref==3.10.0 - # via opentelemetry-instrumentation-asgi -async-timeout==5.0.1 - # via aiohttp -attrs==25.3.0 - # via aiohttp -azure-ai-documentintelligence==1.0.0b4 - # via -r requirements.in -azure-cognitiveservices-speech==1.40.0 - # via -r requirements.in -azure-common==1.1.28 - # via azure-search-documents -azure-core==1.35.0 - # via - # azure-ai-documentintelligence - # azure-core-tracing-opentelemetry - # azure-cosmos - # azure-identity - # azure-monitor-opentelemetry - # azure-monitor-opentelemetry-exporter - # azure-search-documents - # azure-storage-blob - # azure-storage-file-datalake - # microsoft-kiota-authentication-azure - # msrest -azure-core-tracing-opentelemetry==1.0.0b11 - # via azure-monitor-opentelemetry -azure-cosmos==4.9.0 - # via -r requirements.in -azure-functions==1.24.0 - # via -r requirements.in -azure-identity==1.17.1 - # via - # -r requirements.in - # azure-monitor-opentelemetry-exporter - # msgraph-sdk -azure-monitor-opentelemetry==1.8.1 - # via -r requirements.in -azure-monitor-opentelemetry-exporter==1.0.0b44 - # via azure-monitor-opentelemetry -azure-search-documents==11.7.0b2 - # via -r requirements.in -azure-storage-blob==12.22.0 - # via - # -r requirements.in - # 
azure-storage-file-datalake -azure-storage-file-datalake==12.16.0 - # via -r requirements.in -beautifulsoup4==4.12.3 - # via -r requirements.in -blinker==1.9.0 - # via - # flask - # quart -certifi==2024.7.4 - # via - # httpcore - # httpx - # msrest - # requests -cffi==1.17.0 - # via cryptography -charset-normalizer==3.3.2 - # via requests -click==8.3.0 - # via - # flask - # prompty - # quart - # uvicorn -cryptography==44.0.1 - # via - # -r requirements.in - # azure-identity - # azure-storage-blob - # msal - # pyjwt -distro==1.9.0 - # via openai -exceptiongroup==1.3.0 - # via - # anyio - # hypercorn - # taskgroup -fixedint==0.1.6 - # via azure-monitor-opentelemetry-exporter -flask==3.1.2 - # via quart -frozenlist==1.4.1 - # via - # aiohttp - # aiosignal -h11==0.16.0 - # via - # httpcore - # hypercorn - # uvicorn - # wsproto -h2==4.3.0 - # via - # httpx - # hypercorn -hpack==4.1.0 - # via h2 -httpcore==1.0.9 - # via httpx -httpx==0.28.1 - # via - # microsoft-kiota-http - # msgraph-core - # openai -hypercorn==0.17.3 - # via quart -hyperframe==6.1.0 - # via h2 -idna==3.10 - # via - # anyio - # httpx - # requests - # yarl -importlib-metadata==8.0.0 - # via opentelemetry-api -isodate==0.6.1 - # via - # azure-ai-documentintelligence - # azure-search-documents - # azure-storage-blob - # azure-storage-file-datalake - # msrest -itsdangerous==2.2.0 - # via - # flask - # quart -jinja2==3.1.6 - # via - # flask - # prompty - # quart -jiter==0.11.0 - # via openai -markdown-it-py==3.0.0 - # via rich -markupsafe==3.0.3 - # via - # flask - # jinja2 - # quart - # werkzeug -mdurl==0.1.2 - # via markdown-it-py -microsoft-kiota-abstractions==1.9.3 - # via - # microsoft-kiota-authentication-azure - # microsoft-kiota-http - # microsoft-kiota-serialization-form - # microsoft-kiota-serialization-json - # microsoft-kiota-serialization-multipart - # microsoft-kiota-serialization-text - # msgraph-core -microsoft-kiota-authentication-azure==1.9.3 - # via msgraph-core 
-microsoft-kiota-http==1.9.3 - # via msgraph-core -microsoft-kiota-serialization-form==1.9.3 - # via msgraph-sdk -microsoft-kiota-serialization-json==1.9.3 - # via msgraph-sdk -microsoft-kiota-serialization-multipart==1.9.3 - # via msgraph-sdk -microsoft-kiota-serialization-text==1.9.3 - # via msgraph-sdk -msal==1.33.0 - # via - # -r requirements.in - # azure-identity - # msal-extensions -msal-extensions==1.3.1 - # via azure-identity -msgraph-core==1.3.3 - # via msgraph-sdk -msgraph-sdk==1.45.0 - # via -r requirements.in -msrest==0.7.1 - # via azure-monitor-opentelemetry-exporter -multidict==6.7.0 - # via - # aiohttp - # yarl -oauthlib==3.3.1 - # via requests-oauthlib -openai==2.6.1 - # via -r requirements.in -opentelemetry-api==1.38.0 - # via - # azure-core-tracing-opentelemetry - # azure-monitor-opentelemetry-exporter - # microsoft-kiota-abstractions - # microsoft-kiota-authentication-azure - # microsoft-kiota-http - # opentelemetry-instrumentation - # opentelemetry-instrumentation-aiohttp-client - # opentelemetry-instrumentation-asgi - # opentelemetry-instrumentation-dbapi - # opentelemetry-instrumentation-django - # opentelemetry-instrumentation-fastapi - # opentelemetry-instrumentation-flask - # opentelemetry-instrumentation-httpx - # opentelemetry-instrumentation-openai - # opentelemetry-instrumentation-psycopg2 - # opentelemetry-instrumentation-requests - # opentelemetry-instrumentation-urllib - # opentelemetry-instrumentation-urllib3 - # opentelemetry-instrumentation-wsgi - # opentelemetry-sdk - # opentelemetry-semantic-conventions -opentelemetry-instrumentation==0.59b0 - # via - # opentelemetry-instrumentation-aiohttp-client - # opentelemetry-instrumentation-asgi - # opentelemetry-instrumentation-dbapi - # opentelemetry-instrumentation-django - # opentelemetry-instrumentation-fastapi - # opentelemetry-instrumentation-flask - # opentelemetry-instrumentation-httpx - # opentelemetry-instrumentation-openai - # opentelemetry-instrumentation-psycopg2 - # 
opentelemetry-instrumentation-requests - # opentelemetry-instrumentation-urllib - # opentelemetry-instrumentation-urllib3 - # opentelemetry-instrumentation-wsgi -opentelemetry-instrumentation-aiohttp-client==0.59b0 - # via -r requirements.in -opentelemetry-instrumentation-asgi==0.59b0 - # via - # -r requirements.in - # opentelemetry-instrumentation-fastapi -opentelemetry-instrumentation-dbapi==0.59b0 - # via opentelemetry-instrumentation-psycopg2 -opentelemetry-instrumentation-django==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-fastapi==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-flask==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-httpx==0.59b0 - # via -r requirements.in -opentelemetry-instrumentation-openai==0.47.5 - # via -r requirements.in -opentelemetry-instrumentation-psycopg2==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-requests==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-urllib==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-urllib3==0.59b0 - # via azure-monitor-opentelemetry -opentelemetry-instrumentation-wsgi==0.59b0 - # via - # opentelemetry-instrumentation-django - # opentelemetry-instrumentation-flask -opentelemetry-resource-detector-azure==0.1.5 - # via azure-monitor-opentelemetry -opentelemetry-sdk==1.38.0 - # via - # azure-monitor-opentelemetry - # azure-monitor-opentelemetry-exporter - # microsoft-kiota-abstractions - # microsoft-kiota-authentication-azure - # microsoft-kiota-http - # opentelemetry-resource-detector-azure -opentelemetry-semantic-conventions==0.59b0 - # via - # opentelemetry-instrumentation - # opentelemetry-instrumentation-aiohttp-client - # opentelemetry-instrumentation-asgi - # opentelemetry-instrumentation-dbapi - # opentelemetry-instrumentation-django - # opentelemetry-instrumentation-fastapi - # opentelemetry-instrumentation-flask - # 
opentelemetry-instrumentation-httpx - # opentelemetry-instrumentation-openai - # opentelemetry-instrumentation-requests - # opentelemetry-instrumentation-urllib - # opentelemetry-instrumentation-urllib3 - # opentelemetry-instrumentation-wsgi - # opentelemetry-sdk -opentelemetry-semantic-conventions-ai==0.4.13 - # via opentelemetry-instrumentation-openai -opentelemetry-util-http==0.59b0 - # via - # opentelemetry-instrumentation-aiohttp-client - # opentelemetry-instrumentation-asgi - # opentelemetry-instrumentation-django - # opentelemetry-instrumentation-fastapi - # opentelemetry-instrumentation-flask - # opentelemetry-instrumentation-httpx - # opentelemetry-instrumentation-requests - # opentelemetry-instrumentation-urllib - # opentelemetry-instrumentation-urllib3 - # opentelemetry-instrumentation-wsgi -packaging==24.1 - # via - # opentelemetry-instrumentation - # opentelemetry-instrumentation-flask -pillow==12.0.0 - # via -r requirements.in -priority==2.0.0 - # via hypercorn -prompty==0.1.50 - # via -r requirements.in -propcache==0.2.0 - # via - # aiohttp - # yarl -psutil==7.1.2 - # via azure-monitor-opentelemetry-exporter -pycparser==2.22 - # via cffi -pydantic==2.12.3 - # via openai -pydantic-core==2.41.4 - # via pydantic -pygments==2.19.2 - # via rich -pyjwt==2.10.1 - # via - # -r requirements.in - # msal -pymupdf==1.26.0 - # via -r requirements.in -pypdf==6.1.3 - # via -r requirements.in -python-dotenv==1.1.1 - # via - # -r requirements.in - # prompty -pyyaml==6.0.2 - # via prompty -quart==0.20.0 - # via - # -r requirements.in - # quart-cors -quart-cors==0.7.0 - # via -r requirements.in -regex==2025.7.34 - # via tiktoken -requests==2.32.4 - # via - # azure-core - # msal - # msrest - # requests-oauthlib - # tiktoken -requests-oauthlib==2.0.0 - # via msrest -rich==14.1.0 - # via -r requirements.in -six==1.16.0 - # via - # azure-core - # isodate -sniffio==1.3.1 - # via - # anyio - # openai -soupsieve==2.7 - # via beautifulsoup4 -std-uritemplate==2.0.5 - # via 
microsoft-kiota-abstractions -taskgroup==0.2.2 - # via hypercorn -tenacity==9.1.2 - # via -r requirements.in -tiktoken==0.12.0 - # via -r requirements.in -tomli==2.2.1 - # via hypercorn -tqdm==4.66.5 - # via openai -types-beautifulsoup4==4.12.0.20240511 - # via -r requirements.in -types-html5lib==1.1.11.20241018 - # via types-beautifulsoup4 -types-pillow==10.2.0.20240822 - # via -r requirements.in -typing-extensions==4.15.0 - # via - # -r requirements.in - # aiosignal - # anyio - # asgiref - # azure-ai-documentintelligence - # azure-core - # azure-cosmos - # azure-identity - # azure-search-documents - # azure-storage-blob - # azure-storage-file-datalake - # exceptiongroup - # hypercorn - # multidict - # openai - # opentelemetry-api - # opentelemetry-sdk - # opentelemetry-semantic-conventions - # pydantic - # pydantic-core - # pypdf - # taskgroup - # typing-inspection - # uvicorn -typing-inspection==0.4.2 - # via pydantic -urllib3==2.5.0 - # via requests -uvicorn==0.30.6 - # via -r requirements.in -werkzeug==3.1.3 - # via - # azure-functions - # flask - # quart -wrapt==1.16.0 - # via - # opentelemetry-instrumentation - # opentelemetry-instrumentation-aiohttp-client - # opentelemetry-instrumentation-dbapi - # opentelemetry-instrumentation-httpx - # opentelemetry-instrumentation-urllib3 -wsproto==1.2.0 - # via hypercorn -yarl==1.17.2 - # via aiohttp -zipp==3.21.0 - # via importlib-metadata diff --git a/infra/main.bicep b/infra/main.bicep index ac8e97f311..7b713be370 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -469,6 +469,8 @@ module appServicePlan 'core/host/appserviceplan.bicep' = if (deploymentTarget == var appEnvVariables = { AZURE_STORAGE_ACCOUNT: storage.outputs.name AZURE_STORAGE_CONTAINER: storageContainerName + AZURE_STORAGE_RESOURCE_GROUP: storageResourceGroup.name + AZURE_SUBSCRIPTION_ID: subscription().subscriptionId AZURE_SEARCH_INDEX: searchIndexName AZURE_SEARCH_KNOWLEDGEBASE_NAME: knowledgeBaseName AZURE_SEARCH_SERVICE: 
searchService.outputs.name diff --git a/tests/test_function_apps.py b/tests/test_function_apps.py index f2183acb8d..bf555f52c6 100644 --- a/tests/test_function_apps.py +++ b/tests/test_function_apps.py @@ -83,9 +83,14 @@ async def parse(self, content: Any): ".pdf": FileProcessor(StubParser([page]), None), } + class MockBlobManager: + async def download_blob(self, blob_path: str): + return (b"pdf-bytes", {}) + mock_settings = document_extractor.GlobalSettings( file_processors=mock_file_processors, azure_credential=object(), + blob_manager=MockBlobManager(), ) monkeypatch.setattr(document_extractor, "settings", mock_settings) @@ -94,9 +99,7 @@ async def parse(self, content: Any): { "recordId": "record-1", "data": { - "file_data": {"$type": "file", "data": base64.b64encode(b"pdf-bytes").decode("utf-8")}, - "file_name": "sample.pdf", - "contentType": "application/pdf", + "metadata_storage_path": "https://account.blob.core.windows.net/container/sample.pdf", }, } ] @@ -128,6 +131,7 @@ async def test_document_extractor_requires_single_record(monkeypatch: pytest.Mon mock_settings = document_extractor.GlobalSettings( file_processors={".pdf": FileProcessor(None, None)}, azure_credential=object(), + blob_manager=object(), ) monkeypatch.setattr(document_extractor, "settings", mock_settings) response = await document_extractor.extract_document(build_request({"values": []})) @@ -144,6 +148,7 @@ async def failing_process(data: dict[str, Any]) -> dict[str, Any]: mock_settings = document_extractor.GlobalSettings( file_processors={".pdf": FileProcessor(None, None)}, azure_credential=object(), + blob_manager=object(), ) monkeypatch.setattr(document_extractor, "settings", mock_settings) monkeypatch.setattr(document_extractor, "process_document", failing_process) @@ -153,9 +158,7 @@ async def failing_process(data: dict[str, Any]) -> dict[str, Any]: { "recordId": "rec-error", "data": { - "file_data": {"$type": "file", "data": base64.b64encode(b"pdf-bytes").decode("utf-8")}, - 
"file_name": "sample.pdf", - "contentType": "application/pdf", + "metadata_storage_path": "https://account.blob.core.windows.net/container/sample.pdf", }, } ] @@ -186,16 +189,19 @@ async def parse(self, content): ".pdf": FileProcessor(FailingParser(), None), } + class MockBlobManager: + async def download_blob(self, blob_path: str): + return (b"content", {}) + mock_settings = document_extractor.GlobalSettings( file_processors=mock_file_processors, azure_credential=object(), + blob_manager=MockBlobManager(), ) monkeypatch.setattr(document_extractor, "settings", mock_settings) data = { - "file_data": {"data": base64.b64encode(b"content").decode("utf-8")}, - "file_name": "doc.pdf", - "contentType": "application/pdf", + "metadata_storage_path": "https://account.blob.core.windows.net/container/doc.pdf", } with pytest.raises(ValueError) as exc_info: @@ -204,12 +210,16 @@ async def parse(self, content): assert "Parser failed" in str(exc_info.value) -def test_document_extractor_missing_file_data() -> None: - with pytest.raises(ValueError): - document_extractor.get_document_stream_filedata({"file_data": {}}) +def test_document_extractor_managed_identity_reload(monkeypatch: pytest.MonkeyPatch) -> None: + # Set required environment variables + monkeypatch.setenv("AZURE_STORAGE_ACCOUNT", "teststorage") + monkeypatch.setenv("AZURE_STORAGE_CONTAINER", "testcontainer") + monkeypatch.setenv("AZURE_STORAGE_RESOURCE_GROUP", "testrg") + monkeypatch.setenv("AZURE_SUBSCRIPTION_ID", "test-sub-id") + # Mock setup_blob_manager to avoid actual Azure calls + monkeypatch.setattr(document_extractor, "setup_blob_manager", lambda **kwargs: object()) -def test_document_extractor_managed_identity_reload(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setenv("AZURE_CLIENT_ID", "client-123") document_extractor.configure_global_settings() assert isinstance(document_extractor.settings.azure_credential, document_extractor.ManagedIdentityCredential) @@ -471,9 +481,7 @@ async def 
test_document_extractor_without_settings(monkeypatch: pytest.MonkeyPat { "recordId": "record-1", "data": { - "file_data": {"$type": "file", "data": base64.b64encode(b"pdf-bytes").decode("utf-8")}, - "file_name": "sample.pdf", - "contentType": "application/pdf", + "metadata_storage_path": "https://account.blob.core.windows.net/container/sample.pdf", }, } ]